Lecture 2.2 KEY – EDS 240

Note

This template follows lecture 2.2 slides. Please be sure to cross-reference the slides, which contain important information and additional context!

Setup

Find data & metadata on the EDI Data Portal.
Get data download link by right-clicking on the Download button > Copy Link Address > then paste into read_csv()

##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                                    setup                                 ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#..........................load packages.........................
library(tidyverse)
library(chron)
library(naniar)
library(ggridges)
library(gghighlight)
library(ggbeeswarm)
library(see)
library(palmerpenguins) # for some minimal examples

#..........................import data...........................
mko <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-sbc.2007.17&entityid=02629ecc08a536972dec021f662428aa")

##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                                wrangle data                              ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

mko_clean <- mko |>

  # keep only necessary columns ----
  select(year, month, day, decimal_time, Temp_bot, Temp_top, Temp_mid) |>

  # create datetime column (not totally necessary for our plots, but it can be helpful to know how to do this!) ----
  unite(date, year, month, day, sep = "-", remove = FALSE) |>
  mutate(time = chron::times(decimal_time)) |>
  unite(date_time, date, time, sep = " ") |>

  # coerce data types ----
  mutate(date_time = as_datetime(date_time, "%Y-%m-%d %H:%M:%S", tz = "GMT"), 
         year = as.factor(year),
         month = as.factor(month),
         day = as.numeric(day)) |>

  # add month name by indexing the built-in `month.name` vector ----
  mutate(month_name = month.name[month]) |> 

  # replace 9999s with NAs ----
  naniar::replace_with_na(replace = list(Temp_bot = 9999, 
                                         Temp_top = 9999, 
                                         Temp_mid = 9999)) |>

  # select/reorder desired columns ----
  select(date_time, year, month, day, month_name, Temp_bot, Temp_mid, Temp_top)

##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                            explore missing data                          ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#..........counts & percentages of missing data by year..........
see_NAs <- mko_clean |> 
  group_by(year) |> 
  naniar::miss_var_summary() |>
  filter(variable == "Temp_bot")

#...................visualize missing Temp_bot...................
bottom <- mko_clean |> select(Temp_bot)
missing_temps <- naniar::vis_miss(bottom)

Histograms

represent distribution of a numeric variable(s), which is cut into several bins – height of bar represents # of observations in that bin

Too many groups

Note the message, to remind us to consider adjusting our binwidth

# histogram with all 12 months ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) +
  geom_histogram(position = "identity", alpha = 0.5)

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Warning: Removed 74425 rows containing non-finite outside the scale range
(`stat_bin()`).

Alt 1: small multiples

# histogram faceted by month ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  ggplot(aes(x = Temp_bot)) +
  geom_histogram() +
  facet_wrap(~month_name)

Alt 2: fewer groups + update colors + modify bin widths

# histogram with fewer months ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  filter(month_name %in% c("April", "June", "October")) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) + 
  geom_histogram(position = "identity", alpha = 0.5, color = "black", binwidth = 1) +
  scale_fill_manual(values = c("#2C5374", "#ADD8E6", "#8B3A3A"))

Density plots

represent data distribution of a numeric variable(s); uses KDE to show probability density function of the variable, the y-axis represents the estimated density, i.e. the relative likelihood of a value occurring, and the area under each curve is equal to 1

Too many groups

# density plot with all 12 months ----
mko_clean |> 
  mutate(month_name = factor(x = month_name, levels = month.name)) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) +
  geom_density(alpha = 0.5)

Alt 1: small multiples

# density plot faceted by month ---- 
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  ggplot(aes(x = Temp_bot)) +
  geom_density(fill = "gray30") +
  facet_wrap(~month_name)

Alt 2: fewer groups + update colors + modify band widths

# density plot with fewer months ----
mko_clean |> 
  filter(month_name %in% c("April", "June", "October")) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) +
  geom_density(alpha = 0.5, adjust = 0.5) + 
  scale_fill_manual(values = c("#2C5374", "#ADD8E6", "#8B3A3A"))

A few more histograms & density plots

Distinction: histograms vs. density plots

# create some dummy data ----
dummy_data <- data.frame(value = c(rnorm(n = 100, mean = 5),
                                   rnorm(n = 200, mean = 10)),
                         group = rep(c("A", "B"),
                                     times = c(100, 200)))

# histogram ----
ggplot(dummy_data, aes(x = value, fill = group)) +
  geom_histogram(position = "identity", alpha = 0.7) +
  geom_rug(aes(color = group), alpha = 0.75)
# density plot ----
ggplot(dummy_data, aes(x = value, fill = group)) +
  geom_density(alpha = 0.7) +
  geom_rug(aes(color = group), alpha = 0.75)

Combining geoms

# histogram + density plot ----
ggplot(mko_clean, aes(x = Temp_bot, y = after_stat(density))) + # scale down hist to match density curve
  geom_histogram(fill = "gray", color = "black", alpha = 0.75) +
  geom_density(size = 1)

Compare groups to a whole

# use `after_stat(count)` to plot density of observations ----
ggplot(penguins, aes(x = body_mass_g, y = after_stat(count))) +
 
  # plot full distribution curve with label "all penguins"; remove 'species' col so that this doesn't get faceted later on ----
  geom_density(data = select(penguins, -species), 
               aes(fill = "all penguins"), color = "transparent") +
  
  # plot second curve with label "species" ----
  geom_density(aes(fill = "species"), color = "transparent") +
  
  # facet wrap by species ----
  facet_wrap(~species, nrow = 1) +
  
  # update colors, x-axis label, legend position ----
  scale_fill_manual(values = c("grey","green4"), name = NULL) +
  labs(x = "Body Mass (g)") +
  theme(legend.position = "top")

Ridgeline plots

show distribution of a numeric variable for multiple groups

# basic ridgeline plot ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name)) +
  ggridges::geom_density_ridges()
# fill with color gradient ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient() +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))

Alt 1: reorder groups + adjust overlap & tails

# ridgeline plot with reordered months ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_y_discrete(limits = rev(month.name)) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))

Remember, you can also reorder factor levels during the data wrangling stage:

# e.g. by month: ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = rev(month.name))) |> 
  ggplot(aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))
# e.g. by median temp ---
mko_clean |> 
  mutate(month_name = fct_reorder(month_name, Temp_bot, .fun = median)) |> 
  ggplot(aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))

Alt 2: add quantiles

# ridgeline plot with quantiles ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name)) +
  ggridges::stat_density_ridges(rel_min_height = 0.01, scale = 3,
                                quantile_lines = TRUE, quantiles = 2) +
  scale_y_discrete(limits = rev(month.name))

Alt 3: jitter raw data

# jittered points ----
ggplot(penguins, aes(x = body_mass_g, y = species)) +
  ggridges::geom_density_ridges(jittered_points = TRUE, 
                                alpha = 0.5, point_size = 0.5)

# raincloud ----
ggplot(penguins, aes(x = body_mass_g, y = species)) +
  ggridges::geom_density_ridges(jittered_points = TRUE, 
                                alpha = 0.5, point_size = 0.5, scale = 0.6,
                                position = "raincloud")

Boxplots

summarize the distribution of a numeric variable for one or several groups

# boxplot with all 12 months ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_boxplot() +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()

Alt 1: modify outliers

# boxplot with modified outliers ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_boxplot(outlier.color = "purple", outlier.shape = 1, outlier.size = 5) +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()

Alt 2: hightlight a group

# highlight a particular group ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot, fill = month_name)) +
  geom_boxplot() +
  scale_x_discrete(limits = rev(month.name)) +
  gghighlight::gghighlight(month_name == "October") +
  coord_flip() +
  theme(legend.position = "none")

Alt 3: jitter raw data (using `{palmerpenguins}` data)

# add jittered data points ----
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(alpha = 0.5, width = 0.2)

Alt 4: dodged groups

# dodge groups by year ----
ggplot(penguins, aes(x = species, y = body_mass_g, color = as.factor(year))) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(alpha = 0.5, 
             position = position_jitterdodge(jitter.width = 0.2))

Alt 5: overlay beeswarm

# boxplot + beeswarm ----
ggplot(penguins, aes(x = species, y = body_mass_g)) +
  geom_boxplot(outlier.shape = NA) +
  ggbeeswarm::geom_beeswarm(size = 1)

Violin plots:

visualize distribution of a numeric variable for one or several groups; great for multiple groups with lots of data

# violin plot ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_violin() +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()

Alt 1: overlay boxplot

# violin + boxplot ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_violin() +
  geom_boxplot(width = 0.1, color = "gray", alpha = 0.5, 
               outlier.color = "red") +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()

Alt 2: half-violin half-dot plot

# half violin + half dot plot ----
ggplot(penguins, aes(x = species, y = bill_length_mm, fill = species)) +
  see::geom_violindot(size_dots = 5, alpha = 0.5) +
  theme(legend.position = "none")

Setup

Histograms

Too many groups

Alt 1: small multiples

Alt 2: fewer groups + update colors + modify bin widths

Density plots

Too many groups

Alt 1: small multiples

Alt 2: fewer groups + update colors + modify band widths

A few more histograms & density plots

Distinction: histograms vs. density plots

Combining geoms

Compare groups to a whole

Ridgeline plots

Alt 1: reorder groups + adjust overlap & tails

Alt 2: add quantiles

Alt 3: jitter raw data

Boxplots

Alt 1: modify outliers

Alt 2: hightlight a group

Alt 3: jitter raw data (using {palmerpenguins} data)

Alt 4: dodged groups

Alt 5: overlay beeswarm

Violin plots:

Alt 1: overlay boxplot

Alt 2: half-violin half-dot plot

Alt 3: jitter raw data (using `{palmerpenguins}` data)