Data transformation with dplyr

Author

Roland Krause/Milena Zizovic

Published

February 7, 2023

Aims

This tutorial allows you to explore dplyr functionality based on the individual lectures. Every question can be answered with a combination of pipes (|>). Do not use temporary variables and any statements outside of the range of the tidyverse.

Part one - Base dplyr

Import the data from the website.

Assign to the name judgments

Solution
judgments <- readr::read_tsv("https://biostat2.uni.lu/practicals/data/judgments.tsv", 
                             show_col_types = FALSE)

Use glimpse() to identify columns and column types in judgments.

Select all columns that refer to the STAI questionnaire.

judgments |> 
  select(starts_with("STAI") )
# A tibble: 188 × 42
   STAI_pre_1_1 STAI_pre_1_2 STAI_pre_1_3 STAI_pre_1_4 STAI_pre_1_5 STAI_pre_1_6
          <dbl>        <dbl>        <dbl>        <dbl>        <dbl>        <dbl>
 1            2            1            2            2            2            2
 2            3            2            3            1            3            2
 3            4            3            3            3            4            2
 4            2            2            2            2            3            1
 5            1            1            1            1            2            1
 6            2            2            1            1            2            1
 7            2            2            1            1            2            1
 8            1            1            1            1            1            1
 9            2            2            1            1            2            1
10            4            2            3            3            3            1
# ℹ 178 more rows
# ℹ 36 more variables: STAI_pre_1_7 <dbl>, STAI_pre_2_1 <dbl>,
#   STAI_pre_2_2 <dbl>, STAI_pre_2_3 <dbl>, STAI_pre_2_4 <dbl>,
#   STAI_pre_2_5 <dbl>, STAI_pre_2_6 <dbl>, STAI_pre_2_7 <dbl>,
#   STAI_pre_3_1 <dbl>, STAI_pre_3_2 <dbl>, STAI_pre_3_3 <dbl>,
#   STAI_pre_3_4 <dbl>, STAI_pre_3_5 <dbl>, STAI_pre_3_6 <dbl>,
#   STAI_post_1_1 <dbl>, STAI_post_1_2 <dbl>, STAI_post_1_3 <dbl>, …

Retrieve all subjects younger than 20 which are in the stress group.

The column for the group is condition.

judgments |> 
  filter(age < 20, condition == "stress")
# A tibble: 58 × 158
   start_date end_date  finished condition subject gender   age mood_pre
   <chr>      <chr>        <dbl> <chr>       <dbl> <chr>  <dbl>    <dbl>
 1 11/3/2014  11/3/2014        1 stress          1 female    19       59
 2 11/3/2014  11/3/2014        1 stress          3 female    19       22
 3 11/3/2014  11/3/2014        1 stress         13 female    19       30
 4 11/3/2014  11/3/2014        1 stress         18 female    19       55
 5 11/3/2014  11/3/2014        1 stress         14 female    18       53
 6 11/3/2014  11/3/2014        1 stress         17 female    18       79
 7 11/3/2014  11/3/2014        1 stress         15 female    17       70
 8 11/3/2014  11/3/2014        1 stress         25 female    19       60
 9 11/3/2014  11/3/2014        1 stress         22 female    18       13
10 11/3/2014  11/3/2014        1 stress         32 female    19       74
# ℹ 48 more rows
# ℹ 150 more variables: mood_post <dbl>, STAI_pre_1_1 <dbl>,
#   STAI_pre_1_2 <dbl>, STAI_pre_1_3 <dbl>, STAI_pre_1_4 <dbl>,
#   STAI_pre_1_5 <dbl>, STAI_pre_1_6 <dbl>, STAI_pre_1_7 <dbl>,
#   STAI_pre_2_1 <dbl>, STAI_pre_2_2 <dbl>, STAI_pre_2_3 <dbl>,
#   STAI_pre_2_4 <dbl>, STAI_pre_2_5 <dbl>, STAI_pre_2_6 <dbl>,
#   STAI_pre_2_7 <dbl>, STAI_pre_3_1 <dbl>, STAI_pre_3_2 <dbl>, …

Arrange all observations by STAI_pre so that the subject with the lowest stress level is on top.

What is the subject in question?

judgments |> 
 arrange(STAI_pre)
# A tibble: 188 × 158
   start_date end_date  finished condition subject gender   age mood_pre
   <chr>      <chr>        <dbl> <chr>       <dbl> <chr>  <dbl>    <dbl>
 1 11/6/2014  11/6/2014        1 stress        142 female    22       96
 2 11/3/2014  11/3/2014        1 control         9 male      20      100
 3 11/7/2014  11/7/2014        1 stress        162 male      19       77
 4 11/3/2014  11/3/2014        1 control        39 male      20       87
 5 11/5/2014  11/5/2014        1 control       105 female    21       93
 6 11/7/2014  11/7/2014        1 control       176 female    21       75
 7 11/7/2014  11/7/2014        1 control       179 male      23       66
 8 11/4/2014  11/4/2014        1 control        64 female    18       76
 9 11/6/2014  11/6/2014        1 control       143 female    17       82
10 11/5/2014  11/5/2014        1 control       106 female    19       83
# ℹ 178 more rows
# ℹ 150 more variables: mood_post <dbl>, STAI_pre_1_1 <dbl>,
#   STAI_pre_1_2 <dbl>, STAI_pre_1_3 <dbl>, STAI_pre_1_4 <dbl>,
#   STAI_pre_1_5 <dbl>, STAI_pre_1_6 <dbl>, STAI_pre_1_7 <dbl>,
#   STAI_pre_2_1 <dbl>, STAI_pre_2_2 <dbl>, STAI_pre_2_3 <dbl>,
#   STAI_pre_2_4 <dbl>, STAI_pre_2_5 <dbl>, STAI_pre_2_6 <dbl>,
#   STAI_pre_2_7 <dbl>, STAI_pre_3_1 <dbl>, STAI_pre_3_2 <dbl>, …

Abbreviate the gender column such that only the first character remains

judgments |> 
  mutate(gender = str_sub(gender, 1,1))
# A tibble: 188 × 158
   start_date end_date  finished condition subject gender   age mood_pre
   <chr>      <chr>        <dbl> <chr>       <dbl> <chr>  <dbl>    <dbl>
 1 11/3/2014  11/3/2014        1 control         2 f         24       81
 2 11/3/2014  11/3/2014        1 stress          1 f         19       59
 3 11/3/2014  11/3/2014        1 stress          3 f         19       22
 4 11/3/2014  11/3/2014        1 stress          4 f         22       53
 5 11/3/2014  11/3/2014        1 control         7 f         22       48
 6 11/3/2014  11/3/2014        1 stress          6 f         22       73
 7 11/3/2014  11/3/2014        1 control         5 f         18       NA
 8 11/3/2014  11/3/2014        1 control         9 m         20      100
 9 11/3/2014  11/3/2014        1 stress         16 f         21       67
10 11/3/2014  11/3/2014        1 stress         13 f         19       30
# ℹ 178 more rows
# ℹ 150 more variables: mood_post <dbl>, STAI_pre_1_1 <dbl>,
#   STAI_pre_1_2 <dbl>, STAI_pre_1_3 <dbl>, STAI_pre_1_4 <dbl>,
#   STAI_pre_1_5 <dbl>, STAI_pre_1_6 <dbl>, STAI_pre_1_7 <dbl>,
#   STAI_pre_2_1 <dbl>, STAI_pre_2_2 <dbl>, STAI_pre_2_3 <dbl>,
#   STAI_pre_2_4 <dbl>, STAI_pre_2_5 <dbl>, STAI_pre_2_6 <dbl>,
#   STAI_pre_2_7 <dbl>, STAI_pre_3_1 <dbl>, STAI_pre_3_2 <dbl>, …

Create a new STAI_pre_category column.

Use case_when() or case_match() to categorize values in STAI_pre as “low”, “normal” or “high”. For values < 25 in STAI_pre assign “low”, for values > 65 assign “high”, and for all other values assign “normal”.

To easily see the new column, use relocate() to move it to the first position of the dataframe.

judgments |>
  mutate(STAI_pre_category = case_match(
    STAI_pre, 
    c(0:24)  ~ "low",
    c(25:65) ~ "normal",
    .default = "high")) |>
  relocate(STAI_pre_category)
# A tibble: 188 × 159
   STAI_pre_category start_date end_date finished condition subject gender   age
   <chr>             <chr>      <chr>       <dbl> <chr>       <dbl> <chr>  <dbl>
 1 normal            11/3/2014  11/3/20…        1 control         2 female    24
 2 normal            11/3/2014  11/3/20…        1 stress          1 female    19
 3 normal            11/3/2014  11/3/20…        1 stress          3 female    19
 4 normal            11/3/2014  11/3/20…        1 stress          4 female    22
 5 normal            11/3/2014  11/3/20…        1 control         7 female    22
 6 normal            11/3/2014  11/3/20…        1 stress          6 female    22
 7 normal            11/3/2014  11/3/20…        1 control         5 female    18
 8 low               11/3/2014  11/3/20…        1 control         9 male      20
 9 normal            11/3/2014  11/3/20…        1 stress         16 female    21
10 normal            11/3/2014  11/3/20…        1 stress         13 female    19
# ℹ 178 more rows
# ℹ 151 more variables: mood_pre <dbl>, mood_post <dbl>, STAI_pre_1_1 <dbl>,
#   STAI_pre_1_2 <dbl>, STAI_pre_1_3 <dbl>, STAI_pre_1_4 <dbl>,
#   STAI_pre_1_5 <dbl>, STAI_pre_1_6 <dbl>, STAI_pre_1_7 <dbl>,
#   STAI_pre_2_1 <dbl>, STAI_pre_2_2 <dbl>, STAI_pre_2_3 <dbl>,
#   STAI_pre_2_4 <dbl>, STAI_pre_2_5 <dbl>, STAI_pre_2_6 <dbl>,
#   STAI_pre_2_7 <dbl>, STAI_pre_3_1 <dbl>, STAI_pre_3_2 <dbl>, …

Rescale values in the REI group

Divide all entries in the REI questionnaire by 5, the maximal value, so the values will be between 0 and 1.

across() allows modification of multiple columns in one go.

judgments |> 
  mutate(across(contains("REI"), 
                \(x) x / 5)) |>
  relocate(contains("REI"))
# A tibble: 188 × 158
   REI_1 REI_2 REI_3 REI_4 REI_5 REI_6 REI_7 REI_8 REI_9 REI_10 REI_11 REI_12
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>  <dbl>
 1   1     0.8   1     0.8   0.8   1     0.6   0.8   0.6    0.8    1      1  
 2  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
 3  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
 4  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
 5   0.6   0.6   0.6   0.6   0.8   0.6   0.6   0.6   0.6    0.6    0.6   NA  
 6  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
 7   0.8   0.8   0.8   0.8   0.8   0.8   0.6   0.8   0.8    0.8    0.8    0.6
 8   0.8   1     1     1     1     1     1     1     0.6    0.8    1      0.8
 9  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
10  NA    NA    NA    NA    NA    NA    NA    NA    NA     NA     NA     NA  
# ℹ 178 more rows
# ℹ 146 more variables: REI_13 <dbl>, REI_14 <dbl>, REI_15 <dbl>, REI_16 <dbl>,
#   REI_17 <dbl>, REI_18 <dbl>, REI_19 <dbl>, REI_20 <dbl>, REI_21 <dbl>,
#   REI_22 <dbl>, REI_23 <dbl>, REI_24 <dbl>, REI_25 <dbl>, REI_26 <dbl>,
#   REI_27 <dbl>, REI_28 <dbl>, REI_29 <dbl>, REI_30 <dbl>, REI_31 <dbl>,
#   REI_32 <dbl>, REI_33 <dbl>, REI_34 <dbl>, REI_35 <dbl>, REI_36 <dbl>,
#   REI_37 <dbl>, REI_38 <dbl>, REI_39 <dbl>, REI_40 <dbl>, …


Part two - Grouping and summarizing

Use judgments to compute basic statistics for all moral dilemma columns considering the conditions.

  1. Compute the mean, the median, the standard deviation as well as min and max values.
  2. Find meaningful short names for the functions such as med for median().
  3. Assign judgments_condition_stats to the results.
judgments |> 
  group_by(condition) |> 
  summarize(across(starts_with("moral_dilemma"),
                   list(
                     mean = mean,
                     sd = sd,
                     med = median ,
                     min = min,
                     max = max
                   ))) -> judgments_condition_stats

Sort by groups

  1. Find the number of subjects by age, gender and condition, e.g. how many 20 years of age females are in the stress group.
  2. Sort the resulting tibble such that the condition that contains the most populous group is sorted first (i.e. stress or control appear together).
  3. Ensure that the resulting tibble does not contain groups.
judgments |> 
  group_by( condition, gender, age) |> 
  summarize(n = n()) |> 
  arrange(desc(n), .by_group = TRUE) |> 
  ungroup()
`summarise()` has grouped output by 'condition', 'gender'. You can override
using the `.groups` argument.
# A tibble: 33 × 4
   condition gender   age     n
   <chr>     <chr>  <dbl> <int>
 1 control   female    18    25
 2 control   female    19    17
 3 control   female    21     7
 4 control   female    20     4
 5 control   female    22     4
 6 control   female    23     3
 7 control   female    17     2
 8 control   female    24     2
 9 control   female    26     1
10 control   male      18     7
# ℹ 23 more rows


Part three - joins and pivots

Judgements reprise

Pivoting for readability

Take the results from above that are available as judgments_condition_stats and bring the data into a more readable format.

  1. Make the table longer. After this step, your tibble should contain three columns:
  • The condition
  • the name of the moral dilemma and stats (in one column)
  • the values of the stats by moral dilemma.
  1. Split the moral dilemma and stats column. You can use separate_wider_delim or mutate to create a column that contains the moral dilemma (dog, trolley, etc) and the stats (median, min, etc.).
  2. Make the table wider again and move the stats to individual columns.

The final table should look something like this.

condition moral_dilemma mean sd
control dog 7.24 2.23
stress wallet 7.4 2.01
judgments_condition_stats |> 
  pivot_longer(cols = contains("moral_dilemma"),
               names_to = "dilemma",
               names_prefix = "moral_dilemma_",
               values_to = "stat_values") |> 
  separate_wider_delim(cols = dilemma, 
                       names = c("moral", "stat"),
                       delim = "_") |> 
  pivot_wider(names_from = stat,
              values_from = stat_values) |> 
  mutate(across(c(mean, sd), \(x) round(x, 2)))
# A tibble: 14 × 7
   condition moral    mean    sd   med   min   max
   <chr>     <chr>   <dbl> <dbl> <dbl> <dbl> <dbl>
 1 control   dog      7.24  2.23     8     1     9
 2 control   wallet   6.87  2.16     8     1     9
 3 control   plane    6.77  2.36     8     1     9
 4 control   resume   6.73  1.78     7     1     9
 5 control   kitten   7.74  1.66     8     2     9
 6 control   trolley  3.32  1.69     3     1     8
 7 control   control  6.41  2.02     7     1     9
 8 stress    dog      7.45  2.11     8     1     9
 9 stress    wallet   7.4   1.86     8     2     9
10 stress    plane    7.22  1.96     8     1     9
11 stress    resume   7.1   1.63     7     2     9
12 stress    kitten   8.06  1.55     9     2     9
13 stress    trolley  3.81  1.78     4     1     9
14 stress    control  6.27  2.12     7     1     9

Genetic variants

The tribble below contains changes of the sequence of a gene. The format in the input is the expected sequence (the reference allele), the position and the variant, commonly called alternative allele.

In T6G, T is the reference allele, 6 is the position (along the gene) and G is the variant allele.

variants <- tribble(
  ~sampleid, ~var1, ~var2, ~var3,
  "S1", "A3T", "T5G", "T6G",
  "S2", "A3G", "T5G", NA,
  "S3", "A3T", "T6C", "G10C",
  "S4", "A3T", "T6C", "G10C"
)

Clean the table of genetic variants such that all variants appear as a column labeled by their position.

The table should look something like this.

sampleid 3 5 6
S1 T G G
S2 G G NA
S3
variants |> 
pivot_longer(cols = -sampleid,  
             names_to = "varcol", 
             values_to = "mutation") |> 
  mutate(pos = str_extract(mutation, "\\d+")) |> 
  mutate(maf = str_extract(mutation, "\\w$")) |> 
  select(sampleid, pos, maf) |> 
  pivot_wider(id_cols = everything(), 
              names_from = pos, 
              values_from = maf)
# A tibble: 4 × 6
  sampleid `3`   `5`   `6`   `NA`  `10` 
  <chr>    <chr> <chr> <chr> <chr> <chr>
1 S1       T     G     G     <NA>  <NA> 
2 S2       G     G     <NA>  <NA>  <NA> 
3 S3       T     <NA>  C     <NA>  C    
4 S4       T     <NA>  C     <NA>  C    

Select relevant variants

In this table the variants are labeled according to their effect on stability of the gene product.

variant_significance <- tribble(
  ~variant, ~significance,
  "A3T", "unknown",
  "A3G", "damaging",
  "T5G", "benign",
  "T6G", "damaging",
  "T6C", "benign",
  "G10C", "unknown"
)

Identify the subjects in the table variants that carry variants labeled as damaging in variant_significance table.

The final output should be vector of sample ids.

variants |>
  pivot_longer(
    cols = -sampleid,
    names_to = "varcol", 
    values_to = "mutation"
    ) |> 
  inner_join(variant_significance, 
             by = c("mutation" = "variant")) |> 
  filter(significance == "damaging") |> 
  pull(sampleid)
[1] "S1" "S2"

You can use several join flavours and the %in% operator to achieve the same result.

Try using semi_join and the is.element() function!

# is.element solution
variants |>
  pivot_longer(cols = -sampleid,
               names_to = "varcol",
               values_to = "mutation") |>
  filter(is.element(
    mutation ,
    variant_significance |>
      filter(significance == "damaging") |>
      pull(variant))) |>
  pull(sampleid)
[1] "S1" "S2"
# semi join
variants |>
  pivot_longer(
    cols = -sampleid,
    names_to = "varcol", 
    values_to = "mutation"
    ) |> 
  semi_join(variant_significance |>
             filter(significance == "damaging"),
             join_by(mutation == variant)) |> 
  pull(sampleid)
[1] "S1" "S2"