1 Preliminary Work

library("tidyverse")

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

2 Load Data

Have a look at variables and scale levels:

data_toy_properties <- tribble(
  ~variable_name,   ~values,                              ~scale_level,                ~quantity1,   ~quantity2, ~r_type,
  "austrian",       "TRUE and FALSE",                     "nominal",                   "discrete",   2,          "logical",
  "marital_status", "married, single, divorced, widowed", "nominal",                   "discrete",   4,          "factor",
  "name",           "string",                             "nominal",                   "discrete",   Inf,        "character",
  "?",              "?",                                  "nominal",                   "continuous", Inf,        "?",
  "age_cut",        "child, adolescent, adult",           "ordinal",                   "discrete",   3,          "ordered factor",
  "?",              "?",                                  "ordinal",                   "continuous", Inf,        "?",
  "year_birth",     "2023, 2024, etc.",                   "metric (interval-scaled)",  "discrete",   Inf,        "integer",
  "temperature",    "[20, 50] °C",                        "metric (interval-scaled)",  "continuous", Inf,        "double",
  "test_score",     "{0, 1, ..., 100} points",            "metric (ratio-scaled)",     "discrete",   101,        "integer",
  "height",         "[0, 300] cm",                        "metric (ratio-scaled)",     "continuous", Inf,        "double"
)

data_toy_properties

## # A tibble: 10 × 6
##    variable_name  values                  scale_level quantity1 quantity2 r_type
##    <chr>          <chr>                   <chr>       <chr>         <dbl> <chr> 
##  1 austrian       TRUE and FALSE          nominal     discrete          2 logic…
##  2 marital_status married, single, divor… nominal     discrete          4 factor
##  3 name           string                  nominal     discrete        Inf chara…
##  4 ?              ?                       nominal     continuo…       Inf ?     
##  5 age_cut        child, adolescent, adu… ordinal     discrete          3 order…
##  6 ?              ?                       ordinal     continuo…       Inf ?     
##  7 year_birth     2023, 2024, etc.        metric (in… discrete        Inf integ…
##  8 temperature    [20, 50] °C             metric (in… continuo…       Inf double
##  9 test_score     {0, 1, ..., 100} points metric (ra… discrete        101 integ…
## 10 height         [0, 300] cm             metric (ra… continuo…       Inf double

Or more compact:

data_toy_properties_wide_1 <- data_toy_properties |>
  select(variable_name, scale_level, quantity1) |>
  pivot_wider(
    names_from = quantity1, # take values from quantity1 (discrete and continuous) and make columns
    values_from = variable_name, # these values are inserted in the new columns discrete and continuous
    # there is more than one value per cell to insert, therefore a function is used in each cell
    values_fn = ~ paste(.x, collapse = ", ") # short form of anonymous function
    # values_fn = function(x) paste(x, collapse = ", ")  # long form of anonymous function
  ) |>
  rename(
    "scale level \\ quantity" = scale_level # rename column scale_level
  )

data_toy_properties_wide_1

## # A tibble: 4 × 3
##   `scale level \\ quantity` discrete                       continuous 
##   <chr>                     <chr>                          <chr>      
## 1 nominal                   austrian, marital_status, name ?          
## 2 ordinal                   age_cut                        ?          
## 3 metric (interval-scaled)  year_birth                     temperature
## 4 metric (ratio-scaled)     test_score                     height

The following is the “inverse” map with pivot_longer. pivot_longer is used often to prepare data for plots.

data_toy_properties_wide_1 |>
  pivot_longer(
    cols = c("discrete", "continuous"), # chose columns
    names_to = "quantity_1", # the names (discrete and continuous) will be stored in new variable quantity_1
    values_to = "variable_name" # the values (austrian, marital_status, name)
  ) |>
  separate_rows(variable_name, sep = "\\s*,\\s*") |> # to get for e.g. austrian, marital_status, name 3 rows instead of one
  rename("scale_level" = "scale level \\ quantity") |> # rename column name
  select(variable_name, scale_level, quantity_1) # new ordering

## # A tibble: 10 × 3
##    variable_name  scale_level              quantity_1
##    <chr>          <chr>                    <chr>     
##  1 austrian       nominal                  discrete  
##  2 marital_status nominal                  discrete  
##  3 name           nominal                  discrete  
##  4 ?              nominal                  continuous
##  5 age_cut        ordinal                  discrete  
##  6 ?              ordinal                  continuous
##  7 year_birth     metric (interval-scaled) discrete  
##  8 temperature    metric (interval-scaled) continuous
##  9 test_score     metric (ratio-scaled)    discrete  
## 10 height         metric (ratio-scaled)    continuous

And how it is used in R:

data_toy_properties |>
  select(r_type, scale_level, quantity1) |>
  pivot_wider(
    names_from = quantity1, # take values from quantitiy1 (discrecte and continuous) and make columns
    values_from = r_type, # these values are inserted in the new columns discrete and continuous
    # there is more than one value per cell to insert, therefore a function is used in each cell
    values_fn = ~ paste(.x, collapse = ", ") # short form of anonymous function
    # values_fn = function(x) paste(x, collapse = ", ")  # long form of anonymous function
  ) |>
  rename(
    "scale level \\ quantity" = scale_level
  )

## # A tibble: 4 × 3
##   `scale level \\ quantity` discrete                   continuous
##   <chr>                     <chr>                      <chr>     
## 1 nominal                   logical, factor, character ?         
## 2 ordinal                   ordered factor             ?         
## 3 metric (interval-scaled)  integer                    double    
## 4 metric (ratio-scaled)     integer                    double

Read data with warning, because we do not specify the column types:

data_toy_temp <- read_csv("toy.csv")

## Rows: 20 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): name, marital_status, age_cat
## dbl (4): year_birth, temperature, test_score, height
## lgl (1): austrian
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data_toy_temp |> head(3)

## # A tibble: 3 × 8
##   name  austrian marital_status year_birth age_cat temperature test_score height
##   <chr> <lgl>    <chr>               <dbl> <chr>         <dbl>      <dbl>  <dbl>
## 1 Franz FALSE    divorced             1990 adult          37           NA   180.
## 2 Sepp  FALSE    single               2009 adoles…        38.5         80   160.
## 3 Maria TRUE     single               2005 adult          36            5   159.

Store values for marital_status:

marital_status_values <- data_toy_temp |>
  pull(marital_status) |>
  unique()
marital_status_values

## [1] "divorced" "single"   "married"  "widowed"

Store values for age.cat:

age_cat_values <- data_toy_temp |>
  pull(age_cat) |>
  unique()
age_cat_values

## [1] "adult"      "adolescent" "child"      NA

Change ordering (maybe it was already ok) and erase NA

age_cat_values <- age_cat_values[c(3, 2, 1)]
age_cat_values

## [1] "child"      "adolescent" "adult"

Read data again:

data_coltypes <- cols(
  name = col_character(),
  austrian = col_logical(),
  marital_status = col_factor(marital_status_values),
  year_birth = col_integer(),
  age_cat = col_factor(levels = age_cat_values, ordered = TRUE),
  temperature = col_double(),
  test_score = col_integer()
)

data_toy <- read_csv("toy.csv", col_types = data_coltypes)

data_toy |>
  head(3)

## # A tibble: 3 × 8
##   name  austrian marital_status year_birth age_cat temperature test_score height
##   <chr> <lgl>    <fct>               <int> <ord>         <dbl>      <int>  <dbl>
## 1 Franz FALSE    divorced             1990 adult          37           NA   180.
## 2 Sepp  FALSE    single               2009 adoles…        38.5         80   160.
## 3 Maria TRUE     single               2005 adult          36            5   159.

data_toy |>
  glimpse()

## Rows: 20
## Columns: 8
## $ name           <chr> "Franz", "Sepp", "Maria", "Georg", "Karl", "Ulrike", "U…
## $ austrian       <lgl> FALSE, FALSE, TRUE, TRUE, FALSE, TRUE, TRUE, TRUE, FALS…
## $ marital_status <fct> divorced, single, single, single, married, married, div…
## $ year_birth     <int> 1990, 2009, 2005, 2019, 1930, 1980, 1995, 1999, 2005, 2…
## $ age_cat        <ord> adult, adolescent, adult, child, adult, adult, adult, a…
## $ temperature    <dbl> 37.0, 38.5, 36.0, 35.4, 39.0, 38.0, 36.7, 37.9, 36.2, 3…
## $ test_score     <int> NA, 80, 5, 30, 1, 9, 50, 30, 80, 90, 70, 77, 55, 99, 80…
## $ height         <dbl> 180.4, 160.5, 158.9, 130.2, 174.3, 172.1, 159.5, 182.5,…

Levels of all factor variables:

data_toy |>
  select(where(is.factor)) |>
  summary()

##   marital_status       age_cat  
##  divorced:5      child     : 1  
##  single  :8      adolescent: 1  
##  married :5      adult     :17  
##  widowed :2      NA's      : 1

Levels of all ordered variables:

data_toy |>
  select(where(is.ordered)) |>
  summary()

##        age_cat  
##  child     : 1  
##  adolescent: 1  
##  adult     :17  
##  NA's      : 1

Full summary:

data_toy |>
  summary()

##      name            austrian        marital_status   year_birth  
##  Length:20          Mode :logical   divorced:5      Min.   :1930  
##  Class :character   FALSE:8         single  :8      1st Qu.:1983  
##  Mode  :character   TRUE :9         married :5      Median :1999  
##                     NA's :3         widowed :2      Mean   :1990  
##                                                     3rd Qu.:2002  
##                                                     Max.   :2019  
##                                                     NA's   :1     
##        age_cat    temperature      test_score        height     
##  child     : 1   Min.   :35.40   Min.   : 1.00   Min.   :130.2  
##  adolescent: 1   1st Qu.:36.42   1st Qu.:30.00   1st Qu.:166.5  
##  adult     :17   Median :37.45   Median :50.00   Median :171.2  
##  NA's      : 1   Mean   :37.44   Mean   :52.41   Mean   :170.5  
##                  3rd Qu.:38.42   3rd Qu.:80.00   3rd Qu.:178.5  
##                  Max.   :39.50   Max.   :99.00   Max.   :192.5  
##                                  NA's   :3       NA's   :3

3 Some Boxplots

3.1 Simple boxplot

year_birth has na-values hence warnings:

data_toy |> # data
  ggplot(aes(y = year_birth)) + # aesthetics (define what is on y axis) is used for all geom objects
  geom_boxplot() # define geom object (plot type here boxplot)

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

We can define aesthetics in geom_boxplot instead of ggplot. If we have more geom objects, then this can be necessary. Here it makes no difference.

data_toy |> # data
  ggplot() + # empty command
  geom_boxplot(aes(y = year_birth)) # aesthetics is defined only for this geom object

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

To avoid warning, we could use filter to exclude rows where year_birth is NA. Or we use na.rm = TRUE in ggplot to do the same.

data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(na.rm = TRUE)

Change dimensions in code chunk with e.g. {r, fig.width=1, fig.height=1.2} (the real text size stays the same at 11pt). Be careful: Here in r chunk we can only use inch not cm. So if we want to get the same text/plot ratio in e.g. pdf, we should also use inch later in ggsave.

data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(na.rm = TRUE)

Or with {r, fig.width=2, fig.height=2.4}

data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(na.rm = TRUE)

Save last plot. If we use width and height we change again the text/plot ratio because the textsize stays constant at 11pt. If we want to have the same text/plot ratio as in above r chunk, we should use inch in ggsave. Formats png and jpg are no vector graphics, I prefer pdf and svg in most cases.

ggsave("my_first_plot_a.pdf", width = 2, height = 2.4, units = "in") # compare text/plot ratio, should be as in last plot
ggsave("my_first_plot_b.pdf", width = 1, height = 1.2, units = "in") # compare text/plot ratio, should be as in plot above last plot
ggsave("my_first_plot_c.pdf", width = 2, height = 2.4, units = "in", dpi = 100) # compare file size, does not change for pdf (vector graphics) so dpi is unnecessary
ggsave("my_first_plot_d.pdf", width = 2, height = 2.4, units = "in", dpi = 600) # compare file size, does not change for pdf (vector graphics) so dpi is unnecessary
ggsave("my_first_plot_e.png", width = 2, height = 2.4, units = "in", dpi = 100) # here dpi makes a difference
ggsave("my_first_plot_f.png", width = 2, height = 2.4, units = "in", dpi = 600) # here dpi makes a difference compare file size with plot e

Or change text size for (ideally all) plots in document:

data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(na.rm = TRUE) +
  theme_grey(base_size = 16)

ggsave("my_first_plot_g.pdf", width = 2, height = 2.5, units = "in")

It is also possible to store the plot as variable:

plot1 <- data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(na.rm = TRUE)

Next we can show the plot:

plot1

Modify the plot e.g. theme:

plot1 +
  # theme_classic()
  # theme_dark()
  # theme_minimal()
  # theme_grey()
  theme_bw()

And of course save the plot:

ggsave("my_first_plot_h.pdf", plot1, width = 2, height = 2.4, units = "in")

3.2 Colors and transparency

fh1 <- rgb(r = 97 / 255, g = 164 / 255, b = 215 / 255) # fh lightblue
fh1a <- rgb(r = 97 / 255, g = 164 / 255, b = 215 / 255, alpha = 0.15) # transparancy: fh1 15% and background (maybe white) 85%
factor <- 0.15
fh1b <- rgb(t(col2rgb(fh1) / 255 * factor + 1 * (1 - factor))) # fh1 15% and white 85%
fh2 <- rgb(r = 22 / 255, g = 48 / 255, b = 114 / 255) # fh darkblue

ggplot() +
  geom_rect(aes(xmin = 0, xmax = 1, ymin = 0, ymax = 1), fill = fh1) + # fh1
  geom_rect(aes(xmin = 1, xmax = 2, ymin = 0, ymax = 1), fill = fh1a) + # fh1a, grids are darker, because background is gray and not white
  geom_rect(aes(xmin = 2, xmax = 3, ymin = 0, ymax = 1), fill = fh1b) + # fh1b, no transparency, so we do not see the grids
  geom_rect(aes(xmin = 3, xmax = 4, ymin = 0, ymax = 1), fill = fh2) + # fh2
  theme_bw() # it is important to use white background, otherwise transparancy of fh1a is different, see grid lines

3.3 Some properties

data_toy |>
  ggplot(aes(y = year_birth)) +
  geom_boxplot(
    color = fh2, # lines are plotted in fh2
    fill = fh1, # areas are plotted in fh1
    linewidth = 1, # width of lines
    median.linewidth = 1, # with of median line
    outlier.shape = 23, # shape of outlier
    outlier.size = 3, # size of outlier
    outlier.stroke = 1, # width of outlier lines
    na.rm = TRUE
  ) +
  labs(
    title = "Year of Birth",
    y = ""
  ) +
  scale_x_continuous(breaks = NULL) + # This ensures no x-axis labels or ticks
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  theme(
    panel.background = element_rect(fill = fh1b), # background color
    panel.grid.major.y = element_line(color = "black", linetype = "solid", linewidth = 0.1), # major y-grid
    panel.grid.minor.y = element_line(color = "black", linetype = "dashed", linewidth = 0.05), # minor y-grid
    # panel.grid.minor.y = element_blank(),  # or if we do not like to see minor grid
    text = element_text(face = "bold", size = 11), # font properties
    axis.ticks = element_blank() # no ticks e.g. right of values 1940, 1960, ... on y-axis
  )

3.4 More than one geom in one plot

We could store the above plot as variable e.g. plot2 and reuse it here. but it can make a difference which geom object (violin, boxplot) and gridlines etc. are used first.

data_toy |>
  ggplot(aes(x = 0, y = year_birth)) + # geom_violin and geom_point need x value
  geom_violin( # violin plot
    linewidth = 1,
    color = fh2,
    fill = fh1,
    alpha = 0.3,
    width = 4.5,
    na.rm = TRUE
  ) +
  geom_boxplot( # boxplot
    color = fh2,
    fill = fh1,
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    outlier.stroke = 1,
    na.rm = TRUE
  ) +
  stat_boxplot( # error bars
    geom = "errorbar",
    color = fh2,
    linewidth = 1,
    na.rm = TRUE
  ) +
  geom_point( # data points
    position = position_jitter(width = 0.2), # jittered is used (randomly x value)
    color = fh2,
    na.rm = TRUE
  ) +
  labs(
    title = "Year of Birth",
    x = "",
    y = ""
  ) +
  scale_x_continuous(
    breaks = NULL # This ensures no x-axis labels or ticks
  ) +
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  theme(
    panel.background = element_rect(fill = fh1b), # background color
    panel.grid.major.y = element_line(color = "black", linetype = "solid", linewidth = 0.1), # major y-grid
    panel.grid.minor.y = element_line(color = "black", linetype = "dashed", linewidth = 0.05), # minor y-grid
    # panel.grid.minor.y = element_blank(),  # or if we do not like to see minor grid
    text = element_text(face = "bold", size = 11), # font properties
    axis.ticks = element_blank() # no ticks e.g. right of values 1940, 1960, ... on y-axis
  )

3.5 Plot by factor variable

Plot by marital_status

data_toy |>
  ggplot(aes(x = marital_status, y = year_birth)) + # we get 4 boxplot because marital_status has 4 values
  geom_boxplot(
    color = fh2,
    fill = fh1,
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    outlier.stroke = 1,
    na.rm = TRUE
  ) +
  stat_boxplot(
    geom = "errorbar",
    color = fh2,
    linewidth = 1,
    na.rm = TRUE
  ) +
  labs(
    title = "Year of Birth\nBy marital status",
    x = "",
    y = ""
  ) +
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  theme(
    panel.background = element_rect(fill = fh1b),
    panel.grid.major.y = element_line(color = "black", linetype = "solid", linewidth = 0.1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_line(color = "black", linetype = "dashed", linewidth = 0.05),
    text = element_text(face = "bold", size = 11),
    axis.ticks = element_blank()
  )

3.6 Using variable for property

We again plot by marital_status, but furthermore we use this variable for color and fill. There is no real advantage in using different colors here for the 3 boxes.

factor2 <- 0.3
darkred_alpha <- rgb(t(col2rgb("darkred") / 255 * factor2 + 1 * (1 - factor2)))
darkgreen_alpha <- rgb(t(col2rgb("darkgreen") / 255 * factor2 + 1 * (1 - factor2)))
darkblue_alpha <- rgb(t(col2rgb("darkblue") / 255 * factor2 + 1 * (1 - factor2)))
gray42_alpha <- rgb(t(col2rgb("gray42") / 255 * factor2 + 1 * (1 - factor2)))

data_toy |>
  ggplot(aes(x = marital_status, y = year_birth)) +
  geom_boxplot(aes(color = marital_status, fill = marital_status), # each boxplot gets its own color, we do not use default colors, therefore we use scale_color_manual and scale_fill_manual later
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    outlier.stroke = 1,
    na.rm = TRUE
  ) +
  stat_boxplot(aes(color = marital_status),
    geom = "errorbar",
    linewidth = 1,
    na.rm = TRUE
  ) +
  labs(
    title = "Year of Birth\nBy marital status",
    x = "",
    y = ""
  ) +
  scale_color_manual(values = c("single" = "darkred", "married" = "darkblue", "divorced" = "darkgreen", "widowed" = "gray42")) + # use our own color selection for color
  scale_fill_manual(values = c("single" = darkred_alpha, "married" = darkblue_alpha, "divorced" = darkgreen_alpha, "widowed" = gray42_alpha)) + # use our own color selection for fill
  theme(
    legend.position = "none", # legend is not necessary here
    axis.ticks = element_blank()
  )

3.7 Using facet grid

Here we use marital_status again:

data_toy |>
  ggplot(aes(y = year_birth)) + # we do not use x="marital_status" because we use facets
  geom_boxplot(aes(color = marital_status, fill = marital_status),
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    na.rm = TRUE
  ) +
  stat_boxplot(aes(color = marital_status),
    geom = "errorbar",
    linewidth = 1,
    na.rm = TRUE
  ) +
  labs(
    title = "Geburtsjahr\nnach Familienstand",
    x = "",
    y = ""
  ) +
  facet_grid(. ~ marital_status, # facet creates again 4 boxplots
    labeller = as_labeller(c("single" = "ledig", "married" = "verheiratet", "divorced" = "geschieden", "widowed" = "verwitwet")) # be careful here!
  ) +
  scale_x_continuous(
    breaks = NULL # This ensures no x-axis labels or ticks
  ) +
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  scale_color_manual(values = c("single" = "darkred", "married" = "darkblue", "divorced" = "darkgreen", "widowed" = "gray42")) + # use our own color selection for color
  scale_fill_manual(values = c("single" = darkred_alpha, "married" = darkblue_alpha, "divorced" = darkgreen_alpha, "widowed" = gray42_alpha)) + # use our own color selection for fill
  theme(
    legend.position = "none", # we do not need legend here
    axis.ticks = element_blank() # no ticks on x-axis
  )

But we can use another factor to get a “matrix”. Here we use austrian for rows and marital_status for columns.

data_toy |>
  filter(!is.na(austrian)) |> # we do not want to get a row for austrian == NA
  ggplot(aes(y = year_birth)) + # we do not use x="marital_status" because we use facet
  geom_boxplot(aes(color = marital_status, fill = marital_status),
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    na.rm = TRUE
  ) +
  stat_boxplot(aes(color = marital_status),
    geom = "errorbar",
    linewidth = 1,
    na.rm = TRUE
  ) +
  labs(
    title = "Geburtsjahr\nnach Staatsbürgerschaft und Familienstand",
    x = "",
    y = ""
  ) +
  facet_grid(austrian ~ marital_status,
    labeller = as_labeller(c(
      "single" = "ledig", "married" = "verheiratet", "divorced" = "geschieden", "widowed" = "verwitwet", # be careful here
      "TRUE" = "Österreicher/in", "FALSE" = "kein/e Österreicher/in" # be careful here
    ))
  ) +
  scale_x_continuous(
    breaks = NULL # This ensures no x-axis labels or ticks
  ) +
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  scale_color_manual(values = c("single" = "darkred", "married" = "darkblue", "divorced" = "darkgreen", "widowed" = "gray42")) + # use our own color selection for color
  scale_fill_manual(values = c("single" = darkred_alpha, "married" = darkblue_alpha, "divorced" = darkgreen_alpha, "widowed" = gray42_alpha)) + # use our own color selection for fill
  theme(
    legend.position = "none", # we do not need legend here
    axis.ticks = element_blank() # no ticks on x-axis
  )

Maybe we want to use different colors for Austrian as well. For that we define a new variable in our data_toy.

# for demonstration only
data_toy |>
  mutate(
    color = interaction(marital_status, austrian, sep = "_")
  ) |>
  distinct(color)

## # A tibble: 8 × 1
##   color         
##   <fct>         
## 1 divorced_FALSE
## 2 single_FALSE  
## 3 single_TRUE   
## 4 married_FALSE 
## 5 married_TRUE  
## 6 divorced_TRUE 
## 7 <NA>          
## 8 widowed_TRUE

factor2 <- 0.3 # blend with white
darkred_alpha <- rgb(t(col2rgb("darkred") / 255 * factor2 + 1 * (1 - factor2)))
darkgreen_alpha <- rgb(t(col2rgb("darkgreen") / 255 * factor2 + 1 * (1 - factor2)))
darkblue_alpha <- rgb(t(col2rgb("darkblue") / 255 * factor2 + 1 * (1 - factor2)))
gray42_alpha <- rgb(t(col2rgb("gray42") / 255 * factor2 + 1 * (1 - factor2)))

factor3 <- 0.5 # blend with black
darkred_alpha2 <- rgb(t(col2rgb("darkred") / 255 * factor3 + 0 * (1 - factor3)))
darkgreen_alpha2 <- rgb(t(col2rgb("darkgreen") / 255 * factor3 + 0 * (1 - factor3)))
darkblue_alpha2 <- rgb(t(col2rgb("darkblue") / 255 * factor3 + 0 * (1 - factor3)))
gray42_alpha2 <- rgb(t(col2rgb("gray42") / 255 * factor3 + 0 * (1 - factor3)))

# blend with white
darkred_alpha3 <- rgb(t(col2rgb(darkred_alpha2) / 255 * factor2 + 1 * (1 - factor2)))
darkgreen_alpha3 <- rgb(t(col2rgb(darkgreen_alpha) / 255 * factor2 + 1 * (1 - factor2)))
darkblue_alpha3 <- rgb(t(col2rgb(darkblue_alpha2) / 255 * factor2 + 1 * (1 - factor2)))
gray42_alpha3 <- rgb(t(col2rgb(gray42_alpha2) / 255 * factor2 + 1 * (1 - factor2)))

data_toy |>
  filter(!is.na(austrian)) |>
  mutate(color = interaction(marital_status, austrian, sep = "_")) |> # add the color e.g. single_FALSE
  ggplot(aes(y = year_birth)) + # we do not use x = "marital_status" because we use facet
  geom_boxplot(aes(color = color, fill = color),
    linewidth = 1,
    median.linewidth = 1,
    outlier.shape = 23,
    outlier.size = 3,
    na.rm = TRUE
  ) +
  stat_boxplot(aes(color = color),
    geom = "errorbar",
    linewidth = 1,
    na.rm = TRUE
  ) +
  labs(
    title = "Geburtsjahr\nnach Staatsbürgerschaft und Familienstand",
    x = "",
    y = ""
  ) +
  facet_grid(austrian ~ marital_status,
    labeller = as_labeller(c(
      "single" = "ledig", "married" = "verheiratet", "divorced" = "geschieden", "widowed" = "verwitwet", # be careful here
      "TRUE" = "Österreicher/in", "FALSE" = "kein/e Österreicher/in" # be careful here
    ))
  ) +
  scale_x_continuous(
    breaks = NULL # This ensures no x-axis labels or ticks
  ) +
  scale_y_continuous( # control the position of breaks, normally I keep standard values
    breaks = 1920 + (0:4) * 20,
    minor_breaks = 1930 + (0:4) * 20
  ) +
  scale_color_manual(values = c(
    "single_FALSE" = "darkred", "single_TRUE" = darkred_alpha2,
    "married_FALSE" = "darkblue", "married_TRUE" = darkblue_alpha2,
    "divorced_FALSE" = "darkgreen", "divorced_TRUE" = darkgreen_alpha2,
    "widowed_FALSE" = "gray42", "widowed_TRUE" = gray42_alpha2
  )) + # use our own color selection for color
  scale_fill_manual(values = c(
    "single_FALSE" = darkred_alpha, "single_TRUE" = darkred_alpha3,
    "married_FALSE" = darkblue_alpha, "married_TRUE" = darkblue_alpha3,
    "divorced_FALSE" = darkgreen_alpha, "divorced_TRUE" = darkgreen_alpha3,
    "widowed_FALSE" = gray42_alpha, "widowed_TRUE" = gray42_alpha3
  )) + # use our own color selection for fill
  theme(
    legend.position = "none", # we do not need legend here
    axis.ticks = element_blank() # no ticks on x-axis
  )

GGPlot2 Introduction

Arno Kimeswenger