##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                                    setup                                 ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#..........................load packages.........................
library(tidyverse)
library(chron)
library(naniar)
library(ggridges)
library(gghighlight)
library(ggbeeswarm)
library(see)
library(palmerpenguins) # for some minimal examples
#..........................import data...........................
mko <- read_csv("https://portal.edirepository.org/nis/dataviewer?packageid=knb-lter-sbc.2007.17&entityid=02629ecc08a536972dec021f662428aa")
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                                wrangle data                              ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mko_clean <- mko |>
  # keep only necessary columns ----
  select(year, month, day, decimal_time, Temp_bot, Temp_top, Temp_mid) |>
  # create datetime column (not totally necessary for our plots, but it can be helpful to know how to do this!) ----
  unite(date, year, month, day, sep = "-", remove = FALSE) |>
  mutate(time = chron::times(decimal_time)) |>
  unite(date_time, date, time, sep = " ") |>
  # coerce data types ----
  mutate(date_time = as_datetime(date_time, "%Y-%m-%d %H:%M:%S", tz = "GMT"), 
         year = as.factor(year),
         month = as.factor(month),
         day = as.numeric(day)) |>
  # add month name by indexing the built-in `month.name` vector ----
  mutate(month_name = month.name[month]) |> 
  # replace 9999s with NAs ----
  naniar::replace_with_na(replace = list(Temp_bot = 9999, 
                                         Temp_top = 9999, 
                                         Temp_mid = 9999)) |>
  # select/reorder desired columns ----
  select(date_time, year, month, day, month_name, Temp_bot, Temp_mid, Temp_top)
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##                            explore missing data                          ----
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#..........counts & percentages of missing data by year..........
see_NAs <- mko_clean |> 
  group_by(year) |> 
  naniar::miss_var_summary() |>
  filter(variable == "Temp_bot")
#...................visualize missing Temp_bot...................
bottom <- mko_clean |> select(Temp_bot)
missing_temps <- naniar::vis_miss(bottom)
Note
This template follows lecture 2.2 slides. Please be sure to cross-reference the slides, which contain important information and additional context!
Setup
- Find data & metadata on the EDI Data Portal.
- Get data download link by right-clicking on the Download button > Copy Link Address > then paste into read_csv()
Histograms
- represent distribution of a numeric variable(s), which is cut into several bins – height of bar represents # of observations in that bin
Too many groups
Note the message, to remind us to consider adjusting our binwidth
# histogram with all 12 months ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) +
  geom_histogram(position = "identity", alpha = 0.5)`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.Warning: Removed 74425 rows containing non-finite outside the scale range
(`stat_bin()`).
Alt 1: small multiples
Alt 2: fewer groups + update colors + modify bin widths
# histogram with fewer months ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = month.name)) |> 
  filter(month_name %in% c("April", "June", "October")) |> 
  ggplot(aes(x = Temp_bot, fill = month_name)) + 
  geom_histogram(position = "identity", alpha = 0.5, color = "black", binwidth = 1) +
  scale_fill_manual(values = c("#2C5374", "#ADD8E6", "#8B3A3A"))
Density plots
- represent data distribution of a numeric variable(s); uses KDE to show probability density function of the variable, the y-axis represents the estimated density, i.e. the relative likelihood of a value occurring, and the area under each curve is equal to 1
Too many groups
Alt 1: small multiples
Alt 2: fewer groups + update colors + modify band widths
A few more histograms & density plots
Distinction: histograms vs. density plots
# create some dummy data ----
dummy_data <- data.frame(value = c(rnorm(n = 100, mean = 5),
                                   rnorm(n = 200, mean = 10)),
                         group = rep(c("A", "B"),
                                     times = c(100, 200)))
# histogram ----
ggplot(dummy_data, aes(x = value, fill = group)) +
  geom_histogram(position = "identity", alpha = 0.7) +
  geom_rug(aes(color = group), alpha = 0.75)
# density plot ----
ggplot(dummy_data, aes(x = value, fill = group)) +
  geom_density(alpha = 0.7) +
  geom_rug(aes(color = group), alpha = 0.75)

Combining geoms
Compare groups to a whole
# use `after_stat(count)` to plot density of observations ----
ggplot(penguins, aes(x = body_mass_g, y = after_stat(count))) +
 
  # plot full distribution curve with label "all penguins"; remove 'species' col so that this doesn't get faceted later on ----
  geom_density(data = select(penguins, -species), 
               aes(fill = "all penguins"), color = "transparent") +
  
  # plot second curve with label "species" ----
  geom_density(aes(fill = "species"), color = "transparent") +
  
  # facet wrap by species ----
  facet_wrap(~species, nrow = 1) +
  
  # update colors, x-axis label, legend position ----
  scale_fill_manual(values = c("grey","green4"), name = NULL) +
  labs(x = "Body Mass (g)") +
  theme(legend.position = "top")
Ridgeline plots
- show distribution of a numeric variable for multiple groups
# basic ridgeline plot ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name)) +
  ggridges::geom_density_ridges()
# fill with color gradient ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient() +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))

Alt 1: reorder groups + adjust overlap & tails
# ridgeline plot with reordered months ----
ggplot(mko_clean, aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_y_discrete(limits = rev(month.name)) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))
Remember, you can also reorder factor levels during the data wrangling stage:
# e.g. by month: ----
mko_clean |> 
  mutate(month_name = factor(month_name, levels = rev(month.name))) |> 
  ggplot(aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))
# e.g. by median temp ---
mko_clean |> 
  mutate(month_name = fct_reorder(month_name, Temp_bot, .fun = median)) |> 
  ggplot(aes(x = Temp_bot, y = month_name, fill = after_stat(x))) +
  ggridges::geom_density_ridges_gradient(rel_min_height = 0.01, scale = 3) +
  scale_fill_gradientn(colors = c("#2C5374","#849BB4", "#D9E7EC", "#EF8080", "#8B3A3A"))

Alt 2: add quantiles
Alt 3: jitter raw data
# jittered points ----
ggplot(penguins, aes(x = body_mass_g, y = species)) +
  ggridges::geom_density_ridges(jittered_points = TRUE, 
                                alpha = 0.5, point_size = 0.5)
# raincloud ----
ggplot(penguins, aes(x = body_mass_g, y = species)) +
  ggridges::geom_density_ridges(jittered_points = TRUE, 
                                alpha = 0.5, point_size = 0.5, scale = 0.6,
                                position = "raincloud")
Boxplots
- summarize the distribution of a numeric variable for one or several groups
# boxplot with all 12 months ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_boxplot() +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()
Alt 1: modify outliers
Alt 2: hightlight a group
Alt 3: jitter raw data (using {palmerpenguins} data)
Alt 4: dodged groups
Alt 5: overlay beeswarm
Violin plots:
- visualize distribution of a numeric variable for one or several groups; great for multiple groups with lots of data
# violin plot ----
ggplot(mko_clean, aes(x = month_name, y = Temp_bot)) +
  geom_violin() +
  scale_x_discrete(limits = rev(month.name)) +
  coord_flip()












