AMEC 2026 | Indiana University School of Medicine

R for SNMA

Mirindi Kabangu

Instructor

Mirindi Kabangu

Scope

Overview

The “Real-World” Medical Researcher Problem

You are a resident on a busy clinical rotation. You’ve been handed a messy Vital Signs log from a 3-patient pilot study.

  • The Data:
  • The Goal:
    • Download the CSV
    • Split blood pressure into two columns
    • Remove the “F” from Temperature
    • Create a “Flag” column for any patient with a systolic BP ≥ 140
    • Make a bar chart of all the Systolic BPs
  • You have 2 minutes

# Show the raw vitals log as received: bp and temp are still character columns
print(messy_vitals)
# A tibble: 3 × 3
  patient           bp     temp  
  <chr>             <chr>  <chr> 
1 Gemma Dunn        128/78 98.6F 
2 Priscilla Holding 148/92 101.2F
3 Aniya Warner      115/73 97.5F 

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia"))
# A tibble: 3 × 4
  patient           sys   dia   temp  
  <chr>             <chr> <chr> <chr> 
1 Gemma Dunn        128   78    98.6F 
2 Priscilla Holding 148   92    101.2F
3 Aniya Warner      115   73    97.5F 

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia")) |>
  # Drop the trailing "F" unit and keep the temperature as a number
  mutate(temp = parse_number(temp))
# A tibble: 3 × 4
  patient           sys   dia    temp
  <chr>             <chr> <chr> <dbl>
1 Gemma Dunn        128   78     98.6
2 Priscilla Holding 148   92    101. 
3 Aniya Warner      115   73     97.5

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia")) |>
  # Drop the trailing "F" unit and keep the temperature as a number
  mutate(temp = parse_number(temp)) |>
  # The split pieces are still text; convert both pressures to numeric
  mutate(across(c(sys, dia), as.numeric))
# A tibble: 3 × 4
  patient             sys   dia  temp
  <chr>             <dbl> <dbl> <dbl>
1 Gemma Dunn          128    78  98.6
2 Priscilla Holding   148    92 101. 
3 Aniya Warner        115    73  97.5

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia")) |>
  # Drop the trailing "F" unit and keep the temperature as a number
  mutate(temp = parse_number(temp)) |>
  # The split pieces are still text; convert both pressures to numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Clinical flag: TRUE when systolic pressure is 140 or higher
  mutate(flag = sys >= 140)
# A tibble: 3 × 5
  patient             sys   dia  temp flag 
  <chr>             <dbl> <dbl> <dbl> <lgl>
1 Gemma Dunn          128    78  98.6 FALSE
2 Priscilla Holding   148    92 101.  TRUE 
3 Aniya Warner        115    73  97.5 FALSE

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia")) |>
  # Drop the trailing "F" unit and keep the temperature as a number
  mutate(temp = parse_number(temp)) |>
  # The split pieces are still text; convert both pressures to numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Clinical flag: TRUE when systolic pressure is 140 or higher
  mutate(flag = sys >= 140) |>
  # One bar per patient, height = systolic BP, coloured by the flag
  ggplot(aes(x = patient, y = sys, fill = flag)) +
  geom_col()

messy_vitals |>
  # Split "128/78"-style readings into systolic and diastolic columns.
  # separate_wider_delim() replaces the superseded separate() and raises an
  # error (rather than padding with NA) if a reading does not split into
  # exactly two pieces.
  separate_wider_delim(bp, delim = "/", names = c("sys", "dia")) |>
  # Drop the trailing "F" unit and keep the temperature as a number
  mutate(temp = parse_number(temp)) |>
  # The split pieces are still text; convert both pressures to numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Clinical flag: TRUE when systolic pressure is 140 or higher
  mutate(flag = sys >= 140) |>
  # One bar per patient, height = systolic BP, coloured by the flag
  ggplot(aes(x = patient, y = sys, fill = flag)) +
  geom_col() +
  # Name the colours so TRUE is always crimson, even if no FALSE rows exist
  # (unnamed values are assigned by position and would paint TRUE gray).
  scale_fill_manual(
    values = c("FALSE" = "gray", "TRUE" = "#990000") # IU Crimson!
  ) +
  theme_minimal()

Another Challenge

You are a resident on a busy clinical rotation. You’ve been handed a messy set of cancer treatment data.

  • The Data
  • The Goal:
    • Download the CSV
    • Calculate the mean and standard deviation for each row
    • Calculate the p-values

library(gtsummary)

# Use the 'trial' clinical dataset that ships with gtsummary
gtsummary::trial |>
  # Keep only the four columns that should appear in the table
  select(trt, age, grade, response) |>
  # Create the summary table with all default settings
  tbl_summary()
| Characteristic | N = 200¹ |
|---|---|
| Chemotherapy Treatment | |
|     Drug A | 98 (49%) |
|     Drug B | 102 (51%) |
| Age | 47 (38, 57) |
|     Unknown | 11 |
| Grade | |
|     I | 68 (34%) |
|     II | 68 (34%) |
|     III | 64 (32%) |
| Tumor Response | 61 (32%) |
|     Unknown | 7 |

¹ n (%); Median (Q1, Q3)

library(gtsummary)

# Summarise the 'trial' dataset that ships with gtsummary
gtsummary::trial |>
  select(trt, age, grade, response) |>
  # Build the table with one column per treatment group
  tbl_summary(by = trt)
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ |
|---|---|---|
| Age | 46 (37, 60) | 48 (39, 56) |
|     Unknown | 7 | 4 |
| Grade | | |
|     I | 35 (36%) | 33 (32%) |
|     II | 32 (33%) | 36 (35%) |
|     III | 31 (32%) | 33 (32%) |
| Tumor Response | 28 (29%) | 33 (34%) |
|     Unknown | 3 | 4 |

¹ Median (Q1, Q3); n (%)

library(gtsummary)

# Summarise the 'trial' dataset that ships with gtsummary
gtsummary::trial |>
  select(trt, age, grade, response) |>
  tbl_summary(
    # One column per treatment group
    by = trt,
    # Report continuous variables as mean (SD)
    statistic = all_continuous() ~ "{mean} ({sd})"
  )
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ |
|---|---|---|
| Age | 47 (15) | 47 (14) |
|     Unknown | 7 | 4 |
| Grade | | |
|     I | 35 (36%) | 33 (32%) |
|     II | 32 (33%) | 36 (35%) |
|     III | 31 (32%) | 33 (32%) |
| Tumor Response | 28 (29%) | 33 (34%) |
|     Unknown | 3 | 4 |

¹ Mean (SD); n (%)

library(gtsummary)

# Summarise the 'trial' dataset that ships with gtsummary
gtsummary::trial |>
  select(trt, age, grade, response) |>
  tbl_summary(
    # One column per treatment group
    by = trt,
    # Report continuous variables as mean (SD)
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) |>
  # Compare the groups; the tests used are listed in the table footnote
  add_p()
| Characteristic | Drug A N = 98¹ | Drug B N = 102¹ | p-value² |
|---|---|---|---|
| Age | 47 (15) | 47 (14) | 0.7 |
|     Unknown | 7 | 4 | |
| Grade | | | 0.9 |
|     I | 35 (36%) | 33 (32%) | |
|     II | 32 (33%) | 36 (35%) | |
|     III | 31 (32%) | 33 (32%) | |
| Tumor Response | 28 (29%) | 33 (34%) | 0.5 |
|     Unknown | 3 | 4 | |

¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

library(gtsummary)

# Summarise the 'trial' dataset that ships with gtsummary
gtsummary::trial |>
  select(trt, age, grade, response) |>
  tbl_summary(
    # One column per treatment group
    by = trt,
    # Report continuous variables as mean (SD)
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) |>
  # Compare the groups; the tests used are listed in the table footnote
  add_p() |>
  # Add an "Overall" column that pools both treatment groups
  add_overall()
| Characteristic | Overall N = 200¹ | Drug A N = 98¹ | Drug B N = 102¹ | p-value² |
|---|---|---|---|---|
| Age | 47 (14) | 47 (15) | 47 (14) | 0.7 |
|     Unknown | 11 | 7 | 4 | |
| Grade | | | | 0.9 |
|     I | 68 (34%) | 35 (36%) | 33 (32%) | |
|     II | 68 (34%) | 32 (33%) | 36 (35%) | |
|     III | 64 (32%) | 31 (32%) | 33 (32%) | |
| Tumor Response | 61 (32%) | 28 (29%) | 33 (34%) | 0.5 |
|     Unknown | 7 | 3 | 4 | |

¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

library(gtsummary)

# Summarise the 'trial' dataset that ships with gtsummary
gtsummary::trial |>
  select(trt, age, grade, response) |>
  tbl_summary(
    # One column per treatment group
    by = trt,
    # Reader-friendly row labels in place of the dataset's own labels
    label = list(
      age ~ "Patient Age",
      grade ~ "Tumor Grade",
      response ~ "Treatment Response"
    ),
    # Report continuous variables as mean (SD)
    statistic = all_continuous() ~ "{mean} ({sd})"
  ) |>
  # Compare the groups; the tests used are listed in the table footnote
  add_p() |>
  # Add an "Overall" column that pools both treatment groups
  add_overall() |>
  # Presentation polish: bold row labels and a bold "Variable" header
  bold_labels() |>
  modify_header(label = "**Variable**")
| Variable | Overall N = 200¹ | Drug A N = 98¹ | Drug B N = 102¹ | p-value² |
|---|---|---|---|---|
| Patient Age | 47 (14) | 47 (15) | 47 (14) | 0.7 |
|     Unknown | 11 | 7 | 4 | |
| Tumor Grade | | | | 0.9 |
|     I | 68 (34%) | 35 (36%) | 33 (32%) | |
|     II | 68 (34%) | 32 (33%) | 36 (35%) | |
|     III | 64 (32%) | 31 (32%) | 33 (32%) | |
| Treatment Response | 61 (32%) | 28 (29%) | 33 (34%) | 0.5 |
|     Unknown | 7 | 3 | 4 | |

¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

Your Turn

Exercise 1: Isolate High-Grade Tumors

The blood_storage dataset contains a Grade variable (1-4). Filter the data to show only the most aggressive tumors (Grade 4) and assign it to high_grade_cancers.

Tip

# Keep only the Grade 4 rows and store them for later use
high_grade_cancers <- blood_storage |>
  filter(Grade == 4)

# Count how many patients are in the filtered subset
nrow(high_grade_cancers)

Exercise 2: Defining Clinical Risk

A PSA (Prostate Specific Antigen) level \(\ge\) 10 is often considered high risk. Use mutate() to create a new column called high_psa that is TRUE if PreopPSA is 10 or greater.

Tip

# Add a logical column that is TRUE when the pre-operative PSA is 10 or more
psa_analysis <- blood_storage |>
  mutate(high_psa = PreopPSA >= 10)

# Preview the first rows to confirm the new column
head(psa_analysis)

Exercise 3: Complex Data Preparation

The PI wants to look at younger patients (Age \(\le\) 60) who had a cancer recurrence (Recurrence == 1). Select only those patients and calculate their average PSA.

Tip

# Restrict to patients aged 60 or younger with a recurrence, then average
# their pre-operative PSA (na.rm = TRUE skips missing PSA values)
summary_stats <- blood_storage |>
  filter(Age <= 60, Recurrence == 1) |>
  summarize(mean_psa = mean(PreopPSA, na.rm = TRUE))

# Print the one-row result
summary_stats