### Load the libraries we will need
library(readr)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
Read the data and check
# Import the gapminder table from the raw_data folder; readr prints the
# column specification it guessed so the types can be checked
gapminder <- read_csv(file = "raw_data/gapminder.csv")
Rows: 1704 Columns: 6── Column specification ──────────────────────────────────────────────────
Delimiter: ","
chr (2): country, continent
dbl (4): year, lifeExp, pop, gdpPercap
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows (six is head()'s default)
head(gapminder, n = 6)
Create a subset of the data where the population is less than a million in the year 2002
# Rows must satisfy every condition: comma-separated conditions in
# filter() are combined with AND
filter(
  gapminder,
  year == 2002,
  pop < 1e6
)
Create a subset of the data where the life expectancy is greater than 75 in the years prior to 1987
# Observations before 1987 with a life expectancy above 75
filter(
  gapminder,
  year < 1987,
  lifeExp > 75
)
Create a subset of the European data where the life expectancy is between 75 and 80 in the years 2002 or 2007.
# European rows from 2002 or 2007 with life expectancy strictly between
# 75 and 80 (the values 75 and 80 themselves are excluded)
filter(
  gapminder,
  continent == "Europe",
  lifeExp > 75,
  lifeExp < 80,
  year == 2002 | year == 2007
)
You can also use the `between` function from `dplyr` and the `%in%` operator:
# Same idea using helpers: between() tests lower <= x <= upper (both ends
# included), and %in% matches year against a set of allowed values
filter(
  gapminder,
  continent == "Europe",
  between(lifeExp, 75, 80),
  year %in% c(2002, 2007)
)
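Write a workflow to do the following: keep only the rows for the year 2002, sort them by continent and then by decreasing GDP per capita, remove the year column, and write the result to a CSV file in an out_data folder.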
# Step-by-step version (before pipes are introduced): every stage is
# stored in its own intermediate variable

# 1. keep only the 2002 observations
gapminder2 <- filter(gapminder, year == 2002)

# 2. sort by continent, then by GDP per capita from highest to lowest
gapminder3 <- arrange(gapminder2, continent, desc(gdpPercap))

# 3. every remaining row is from 2002, so the year column can be dropped
gapminder4 <- select(gapminder3, -year)

# 4. make sure the out_data folder exists (no warning if it already does)
#    and write the result into it
dir.create("out_data", showWarnings = FALSE)
write_csv(gapminder4, "out_data/gapminder_2002.csv")
Re-written using pipes
# Same workflow as a single pipeline: each step receives the result of
# the previous one, so no intermediate variables are needed
gapminder %>%
  filter(year == 2002) %>%
  arrange(continent, desc(gdpPercap)) %>%
  select(-year) %>%
  write_csv("out_data/gapminder_piped_2002.csv")
The violin plot is a popular alternative to the boxplot. Create a violin plot with geom_violin to visualise the differences in GDP between different continents.
# One violin per continent showing the distribution of GDP per capita
ggplot(data = gapminder, mapping = aes(x = continent, y = gdpPercap)) +
  geom_violin()
Create a subset of the gapminder data frame containing just the rows for your country of birth
# R is case-sensitive, so the country name must be typed exactly as it
# appears in the data
uk_data <- filter(
  gapminder,
  country == "United Kingdom"
)
Has there been an increase in life expectancy over time? - visualise the trend using a scatter plot (geom_point), line graph (geom_line) or smoothed line (geom_smooth).
# Three views of the same data and mapping; only the geom changes

# scatter plot: one point per row of uk_data
ggplot(data = uk_data, mapping = aes(x = year, y = lifeExp)) +
  geom_point()

# line graph joining the observations
ggplot(data = uk_data, mapping = aes(x = year, y = lifeExp)) +
  geom_line()

# smoothed trend line (ggplot2 prints a message naming the method it chose)
ggplot(data = uk_data, mapping = aes(x = year, y = lifeExp)) +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
## layers can be stacked: points first, smoothed trend drawn on top
ggplot(data = uk_data, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Note: this exercise could also make use of the piping technique
# The filtered rows are piped straight into ggplot() as its data argument,
# so no intermediate uk_data variable is needed; layers are still added
# with +
gapminder %>%
  filter(country == "United Kingdom") %>%
  ggplot(aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
What happens when you modify the geom_boxplot example to compare the gdp distributions for different years? - Look at the message ggplot2 prints above the plot and try to modify the code to give a separate boxplot for each year
# First attempt: what we might expect the code to look like. year is
# numeric, so ggplot2 warns about a continuous x aesthetic instead of
# drawing one box per year
ggplot(data = gapminder, mapping = aes(x = year, y = gdpPercap)) +
  geom_boxplot()
Warning: Continuous x aesthetic -- did you forget aes(group=...)?
The previous output hints that you might want to group by year - otherwise it thinks that year is a numerical variable
# Adding group = year gives a separate boxplot for each year while
# keeping year numeric on the x axis
ggplot(
  data = gapminder,
  mapping = aes(x = year, y = gdpPercap, group = year)
) +
  geom_boxplot()
You will often see this alternative of using the as.factor function to make year into a categorical variable.
# Alternative: convert year to a categorical variable inside aes(), so
# each distinct year becomes its own category on the x axis
ggplot(
  data = gapminder,
  mapping = aes(x = as.factor(year), y = gdpPercap)
) +
  geom_boxplot()
Add an extra column; the first letter of each country name. Assigning a new variable on each line
# Step 1: add a FirstLetter column holding character 1 of each country name
gapminder2 <- mutate(
  gapminder,
  FirstLetter = substr(country, start = 1, stop = 1)
)

# Step 2: keep only the countries whose name starts with "Z"
gapminder3 <- filter(gapminder2, FirstLetter == "Z")

# Print the result
gapminder3
A more efficient solution
# Same result in one pipeline, without storing intermediate data frames
gapminder %>%
  mutate(FirstLetter = substr(country, start = 1, stop = 1)) %>%
  filter(FirstLetter == "Z")
## Get the European countries, then make a heatmap with one tile per
## country and year; set the fill aesthetic to be life expectancy
gapminder %>%
  filter(continent == "Europe") %>%
  ggplot(aes(x = year, y = country, fill = lifeExp)) +
  geom_tile()