TidyTuesday Starbucks Data

Data

The data I use are available here. Let’s go ✌

I have no initial idea what I want to present and so made several exploratory plots to see what I deal with.

library(tidyverse)
library(magrittr)
library(hermitage)
sb <- read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-12-21/starbucks.csv')
## Rows: 1147 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): product_name, size, trans_fat_g, fiber_g
## dbl (11): milk, whip, serv_size_m_l, calories, total_fat_g, saturated_fat_g,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sb %>% names(.)
##  [1] "product_name"    "size"            "milk"            "whip"           
##  [5] "serv_size_m_l"   "calories"        "total_fat_g"     "saturated_fat_g"
##  [9] "trans_fat_g"     "cholesterol_mg"  "sodium_mg"       "total_carbs_g"  
## [13] "fiber_g"         "sugar_g"         "caffeine_mg"
sb %>% skimr::skim(.)

Table: Table 1: Data summary

Name Piped data
Number of rows 1147
Number of columns 15
_______________________
Column type frequency:
character 4
numeric 11
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
product_name 0 1 8 47 0 93 0
size 0 1 4 7 0 11 0
trans_fat_g 0 1 1 3 0 7 0
fiber_g 0 1 1 2 0 12 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
milk 0 1 2.51 1.68 0 1.0 2.0 4 5 ▇▃▃▃▃
whip 0 1 0.25 0.43 0 0.0 0.0 0 1 ▇▁▁▁▂
serv_size_m_l 0 1 461.34 172.18 0 354.0 473.0 591 887 ▁▇▆▆▁
calories 0 1 228.39 137.67 0 130.0 220.0 320 640 ▆▇▆▃▁
total_fat_g 0 1 6.19 5.97 0 1.0 4.5 10 28 ▇▃▂▁▁
saturated_fat_g 0 1 3.88 4.01 0 0.2 2.5 7 20 ▇▃▂▁▁
cholesterol_mg 0 1 15.24 17.97 0 0.0 5.0 30 75 ▇▂▂▁▁
sodium_mg 0 1 139.65 93.09 0 70.0 135.0 200 370 ▇▇▇▃▂
total_carbs_g 0 1 37.72 23.26 0 20.0 37.0 53 96 ▇▇▇▅▂
sugar_g 0 1 34.99 22.46 0 18.0 34.0 49 89 ▇▇▇▅▂
caffeine_mg 0 1 91.86 78.11 0 30.0 75.0 150 475 ▇▃▁▁▁

Exploratory plots

# calories by drink size
sb %>% 
  mutate(
    size = factor(size),
    size = fct_reorder(size, calories),
    alpha = scales::rescale(calories, to = 0, 1)
  ) %>% 
ggplot(aes(x = size, y = calories, color = size)) +
  geom_point(size = 2, alpha = 0.7, shape = 5) +
  theme_void(base_size = 15, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FCD3B7"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.text.y = element_text(color = "#50270A"),
    legend.position = "none",
    legend.box.margin = margin(2, 2, 2, 2),
    axis.text.x = element_text(color = "#50270A"),
    panel.spacing = unit(5, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  scale_color_manual(values = hermitage_palette("cottages_vincent")) +
  labs(title = "Energy (kcal) by drink size", y = "energy, kcal",
       caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}")
# calories by drink size and sugar
sb %>% 
  mutate(
    size = factor(size),
    size = fct_reorder(size, calories)
  ) %>% 
  filter(calories > 200)  %>% 
  ggplot(aes(x = sugar_g, y = total_fat_g, color = size)) +
  geom_point(size = 2, alpha = 0.9, shape = 5) +
  facet_wrap(~size) +
  theme_void(base_size = 13, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FCD3B7"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.title.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    axis.text.x = element_text(color = "#50270A"),
    legend.position = "none",
    legend.box.margin = margin(2, 2, 2, 2),
    panel.spacing = unit(5, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  scale_color_manual(values = hermitage_palette("cottages_vincent")) +
  labs(title = "Energy (kcal) by drink total fat (g) and sugar (g) added", y = "total fat, g", x = "sugar, g",
       caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}")

Another exploratory graph.

# calories by product and sugar for beverages with > 200 kcal
sb %>% 
  mutate(
    product_name = factor(product_name),
    product_name = fct_reorder(product_name, calories)
  ) %>% 
  filter(calories > 200)  %>% 
  ggplot(aes(x = sugar_g, y = calories, color = product_name)) +
  geom_point(size = 1, shape = 5) +
  facet_wrap(~product_name, labeller = label_wrap_gen(15)) +
  theme_void(base_size = 13, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FFEDE1"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.title.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    legend.position = "none",
    axis.text.x = element_text(color = "#50270A"),
    panel.spacing = unit(3, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  scale_color_manual(values = hermitage_palette("parsons_2")) +
  labs(title = "Energy (kcal) by product and sugar (g) added", y = "energy, kcal", x = "sugar, g",
       caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}")

It should come as no surprise that amount and type of milk, amount of sugar and the beverage size determine the overall kcal content in the cup. Why not leverage the power of ggplot to make a graph that tells me what kinds of hot lattes I can get given that I want maximum amount of caffeine and minimum amount of sugar because this is how I like my lattes. In winter season, iced coffee is a deal-breaker and > 15 g sugar per beverage is also a deal-breaker. Don’t worry, I will compensate for that with having more cookies 🍪

Final graph

# most caffeinated and not so sweet hot lattes

sb_caf_rank <- sb %>%
  mutate(
    product_name = fct_reorder(product_name, caffeine_mg)
  ) %>%
  group_by(product_name, milk, calories) %>%
  arrange(caffeine_mg) %>%
  mutate(
    label = case_when(
      str_detect(product_name, "Iced") ~ NA_character_,
      sugar_g > 15 ~ NA_character_,
      caffeine_mg == max(caffeine_mg) & calories == min(calories) & sugar_g <= 15 & str_detect(product_name, "Iced", negate = T) & str_detect(product_name, "Latte") ~ paste0(product_name, " | ", size, " | ", sugar_g, " g sugar | ", total_fat_g, " g fat"),
      T ~ NA_character_
    ),
    point = case_when(
      !is.na(label) ~ "4",
      T ~ "3"
    )
  ) %>%
  ungroup()  %>%
  mutate(label = case_when(
    caffeine_mg == max(caffeine_mg) ~ paste0("MOST CAFFEINATED BEVERAGE IN STARBUCKS\n", product_name, " | ", size, " | ", sugar_g, " g sugar | ", total_fat_g, " g fat"),
    T ~ label
  ))

sb_plot <- sb_caf_rank  %>%
  ggplot(aes(x = calories, y = caffeine_mg, color = calories, fill = calories)) +
  geom_point(aes(shape = point, size = point), alpha = 0.55) +
  facet_wrap(~milk) +
  theme_void(base_size = 14, base_family = "Varela Round") +
  theme(
    plot.background = element_rect(fill = "#FFEDE1"),
    text = element_text(color = "#50270A"),
    axis.title.y = element_text(color = "#50270A", angle = 90),
    axis.title.x = element_text(color = "#50270A"),
    axis.text.y = element_text(color = "#50270A"),
    axis.text.x = element_text(color = "#50270A"),
    legend.position = "bottom",
    legend.margin = margin(10, 10, 10, 10),
    panel.spacing = unit(3, "lines"),
    plot.margin = margin(15, 15, 15, 15),
    plot.caption = element_text(hjust = 0)
  ) +
  ggrepel::geom_text_repel(mapping = aes(label = label), color = "#00704A", segment.curvature = -0.6,
                           nudge_x = 500, nudge_y = 20, point.size = 10, fontface = "bold",
                           segment.linetype = 6, direction = "y", hjust = "left", size = 2) +
  scale_color_gradientn(colours = hermitage_palette("faberge", "continuous", n = 71)) +
  scale_fill_gradientn(colours = hermitage_palette("faberge", "continuous", n = 71)) +
  guides(color = "none", shape = "none", size = "none", fill = guide_colorbar(title = "Energy, kcal")) +
  labs(title = "Caffeinated & Not So Sweet Hot Lattes You Can Get in Starbucks",
       subtitle = "by milk type",
       y = "caffeine, mg", x = "energy, kcal",
       caption = "Source | Starbucks data | for TidyTuesday\nplot | Elena Dudukina | @evpatora\ncolors | package{hermitage}\nIced lattes and any lattes with > 15 g of sugar are not labeleld")

sb_plot
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 1102 rows containing missing values (geom_text_repel).
ggsave(sb_plot, filename = "starbucks.jpeg", dpi = 400, units = "cm", width = 30, height = 31, path = path)
## Warning: Using size for a discrete variable is not advised.
## Warning: Removed 1102 rows containing missing values (geom_text_repel).
Elena Dudukina
Elena Dudukina
Clinical Specialist and PhD student

I am interested in women’s health, reproductive epidemiology, pharmacoepidemiology, causal inference, directed acyclic graphs, and R stats.

Related