AMEC 2026 | Indiana University School of Medicine

R for SNMA

Mirindi Kabangu

Instructor

Mirindi Kabangu

Scope

Overview

The “Real-World” Medical Researcher Problem

You are a resident on a busy clinical rotation. You’ve been handed a messy Vital Signs log from a 3-patient pilot study.

The Data:
The Goal:
- Download the CSV
- Split blood pressure into two columns
- Remove the “F” from Temperature
- Create a “Flag” column for any patient with a systolic BP ≥ 140
- Make a bar chart of all the Systolic BPs
You have 2 minutes

# Look at the raw data first: bp and temp are stored as text
print(messy_vitals)

# A tibble: 3 × 3
  patient           bp     temp  
  <chr>             <chr>  <chr> 
1 Gemma Dunn        128/78 98.6F 
2 Priscilla Holding 148/92 101.2F
3 Aniya Warner      115/73 97.5F

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/")

# A tibble: 3 × 4
  patient           sys   dia   temp  
  <chr>             <chr> <chr> <chr> 
1 Gemma Dunn        128   78    98.6F 
2 Priscilla Holding 148   92    101.2F
3 Aniya Warner      115   73    97.5F

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/") |>
  # Clean Temp units: parse_number() drops the trailing "F"
  # and returns a number
  mutate(temp = parse_number(temp))

# A tibble: 3 × 4
  patient           sys   dia    temp
  <chr>             <chr> <chr> <dbl>
1 Gemma Dunn        128   78     98.6
2 Priscilla Holding 148   92    101. 
3 Aniya Warner      115   73     97.5

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/") |>
  # Clean Temp units: parse_number() drops the trailing "F"
  # and returns a number
  mutate(temp = parse_number(temp)) |>
  # Convert types: sys and dia are still text, so make them numeric
  mutate(across(c(sys, dia), as.numeric))

# A tibble: 3 × 4
  patient             sys   dia  temp
  <chr>             <dbl> <dbl> <dbl>
1 Gemma Dunn          128    78  98.6
2 Priscilla Holding   148    92 101. 
3 Aniya Warner        115    73  97.5

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/") |>
  # Clean Temp units: parse_number() drops the trailing "F"
  # and returns a number
  mutate(temp = parse_number(temp)) |>
  # Convert types: sys and dia are still text, so make them numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Add the Clinical Flag: TRUE when systolic BP is 140 or higher
  mutate(flag = sys >= 140)

# A tibble: 3 × 5
  patient             sys   dia  temp flag 
  <chr>             <dbl> <dbl> <dbl> <lgl>
1 Gemma Dunn          128    78  98.6 FALSE
2 Priscilla Holding   148    92 101.  TRUE 
3 Aniya Warner        115    73  97.5 FALSE

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/") |>
  # Clean Temp units: parse_number() drops the trailing "F"
  # and returns a number
  mutate(temp = parse_number(temp)) |>
  # Convert types: sys and dia are still text, so make them numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Add the Clinical Flag: TRUE when systolic BP is 140 or higher
  mutate(flag = sys >= 140) |>
  # Instant Visualization: one bar per patient, filled by the flag
  # (ggplot layers are added with +, not |>)
  ggplot(aes(x = patient, y = sys, fill = flag)) +
  geom_col()

messy_vitals |>
  # Separate BP into two numbers: "128/78" becomes sys = "128", dia = "78"
  # (both new columns are still text at this point)
  separate(bp, into = c("sys", "dia"), sep = "/") |>
  # Clean Temp units: parse_number() drops the trailing "F"
  # and returns a number
  mutate(temp = parse_number(temp)) |>
  # Convert types: sys and dia are still text, so make them numeric
  mutate(across(c(sys, dia), as.numeric)) |>
  # Add the Clinical Flag: TRUE when systolic BP is 140 or higher
  mutate(flag = sys >= 140) |>
  # Instant Visualization: one bar per patient, filled by the flag
  # (ggplot layers are added with +, not |>)
  ggplot(aes(x = patient, y = sys, fill = flag)) +
  geom_col() +
  # Name the colors so flagged patients are always crimson, even when
  # only one of TRUE/FALSE appears in the data
  scale_fill_manual(
    values = c("FALSE" = "gray", "TRUE" = "#990000") # IU Crimson!
  ) +
  theme_minimal()

Another Challenge

You are a resident on a busy clinical rotation. You’ve been handed a messy set of cancer treatment data.

The Data
The Goal:
- Download the CSV
- Calculate the mean and standard deviation for each continuous variable
- Calculate the p-values

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table
  # (defaults shown in the footnote: n (%) and median (Q1, Q3))
  tbl_summary()

Characteristic	N = 200¹
Chemotherapy Treatment
Drug A	98 (49%)
Drug B	102 (51%)
Age	47 (38, 57)
Unknown	11
Grade
I	68 (34%)
II	68 (34%)
III	64 (32%)
Tumor Response	61 (32%)
Unknown	7
¹ n (%); Median (Q1, Q3)

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table, one column per treatment group
  tbl_summary(
    by = trt # Split by treatment group
  )

Characteristic	Drug A N = 98¹	Drug B N = 102¹
Age	46 (37, 60)	48 (39, 56)
Unknown	7	4
Grade
I	35 (36%)	33 (32%)
II	32 (33%)	36 (35%)
III	31 (32%)	33 (32%)
Tumor Response	28 (29%)	33 (34%)
Unknown	3	4
¹ Median (Q1, Q3); n (%)

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table
  tbl_summary(
    by = trt, # Split by treatment group
    # Report mean (SD) for continuous variables instead of the
    # default median (Q1, Q3)
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  )

Characteristic	Drug A N = 98¹	Drug B N = 102¹
Age	47 (15)	47 (14)
Unknown	7	4
Grade
I	35 (36%)	33 (32%)
II	32 (33%)	36 (35%)
III	31 (32%)	33 (32%)
Tumor Response	28 (29%)	33 (34%)
Unknown	3	4
¹ Mean (SD); n (%)

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table
  tbl_summary(
    by = trt, # Split by treatment group
    # Report mean (SD) for continuous variables instead of the
    # default median (Q1, Q3)
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  ) |>
  # Add statistical comparison (p-values) between the treatment groups;
  # the table footnote names the tests that were used
  add_p()

Characteristic	Drug A N = 98¹	Drug B N = 102¹	p-value²
Age	47 (15)	47 (14)	0.7
Unknown	7	4
Grade			0.9
I	35 (36%)	33 (32%)
II	32 (33%)	36 (35%)
III	31 (32%)	33 (32%)
Tumor Response	28 (29%)	33 (34%)	0.5
Unknown	3	4
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table
  tbl_summary(
    by = trt, # Split by treatment group
    # Report mean (SD) for continuous variables instead of the
    # default median (Q1, Q3)
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  ) |>
  # Add statistical comparison (p-values) between the treatment groups;
  # the table footnote names the tests that were used
  add_p() |>
  # Add a "Total" column summarising all patients together
  add_overall()

Characteristic	Overall N = 200¹	Drug A N = 98¹	Drug B N = 102¹	p-value²
Age	47 (14)	47 (15)	47 (14)	0.7
Unknown	11	7	4
Grade				0.9
I	68 (34%)	35 (36%)	33 (32%)
II	68 (34%)	32 (33%)	36 (35%)
III	64 (32%)	31 (32%)	33 (32%)
Tumor Response	61 (32%)	28 (29%)	33 (34%)	0.5
Unknown	7	3	4
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

library(gtsummary)

# Use the built-in 'trial' clinical dataset
gtsummary::trial |>
  # Keep only the four variables we want in the table
  select(trt, age, grade, response) |>
  # Create the summary table
  tbl_summary(
    by = trt, # Split by treatment group
    # Replace the raw column names with reader-friendly row labels
    label = list(
      age ~ "Patient Age",
      grade ~ "Tumor Grade",
      response ~ "Treatment Response"
    ),
    # Report mean (SD) for continuous variables instead of the
    # default median (Q1, Q3)
    statistic = list(all_continuous() ~ "{mean} ({sd})")
  ) |>
  # Add statistical comparison (p-values) between the treatment groups;
  # the table footnote names the tests that were used
  add_p() |>
  # Add a "Total" column summarising all patients together
  add_overall() |>
  # Make it look professional: bold the variable labels and
  # rename the first column header to "Variable"
  bold_labels() |>
  modify_header(label = "**Variable**")

Variable	Overall N = 200¹	Drug A N = 98¹	Drug B N = 102¹	p-value²
Patient Age	47 (14)	47 (15)	47 (14)	0.7
Unknown	11	7	4
Tumor Grade				0.9
I	68 (34%)	35 (36%)	33 (32%)
II	68 (34%)	32 (33%)	36 (35%)
III	64 (32%)	31 (32%)	33 (32%)
Treatment Response	61 (32%)	28 (29%)	33 (34%)	0.5
Unknown	7	3	4
¹ Mean (SD); n (%)
² Wilcoxon rank sum test; Pearson’s Chi-squared test

Your Turn

Exercise 1: Isolate High-Grade Tumors

The blood_storage dataset contains a Grade variable (1-4). Filter the data to show only the most aggressive tumors (Grade 4) and assign it to high_grade_cancers.

Exercise 2: Defining Clinical Risk

A PSA (Prostate Specific Antigen) level \(\ge\) 10 is often considered high risk. Use mutate() to create a new column called high_psa that is TRUE if PreopPSA is 10 or greater.

Tip

# Add a logical column that is TRUE when PreopPSA is 10 or greater
psa_analysis <- blood_storage |> 
  mutate(high_psa = PreopPSA >= 10)

# Preview the first rows to check the new column
head(psa_analysis)

Exercise 3: Complex Data Preparation

The PI wants to look at younger patients (Age \(\le\) 60) who had a cancer recurrence (Recurrence == 1). Filter the data to only those patients and calculate their average PSA.

Tip

# Younger patients (Age <= 60) with a recurrence: what is their mean PSA?
summary_stats <- blood_storage |> 
  # Conditions separated by a comma must both be true for a row to be kept
  filter(Age <= 60, Recurrence == 1) |> 
  # na.rm = TRUE ignores missing PSA values instead of returning NA
  summarize(mean_psa = mean(PreopPSA, na.rm = TRUE))

# Print the result
summary_stats