Automatic Subgrouped Summaries (Table 1) in R
The so-called Table 1 often contains summaries of some descriptive variables of the units of the datasets. Table 2 then shows the results of some model fit. (And the cleverly named “Table 2 fallacy” means that you wrongly interpret all parameters from your model causally.)
It is always best to use proper packages to do things for you but how could you write a function to make that table automatically? A good general solution could definitely speed things up a lot. Here’s an example.
# First have a function that computes every summary you can imagine
# on all of the variables in your dataset.
summarize_variables <- function(data,
# It's good to keep ID variables on board
# even though you need to ignore them all the time.
id_var_names,
# Metadata for variables includes
# units of measurement and
# human-readable names to use in the table.
variable_meta_file,
name_variable,
unit_variable,
description_variable) {
id_presence <- data %>%
summarize_ids(id_var_names = id_var_names)
missingness <- data %>%
summarize_missingness(id_var_names = id_var_names)
numerics <- data %>%
summarize_numerics(
id_var_names = id_var_names,
variable_meta_file = variable_meta_file,
name_variable = {{ name_variable }},
unit_variable = {{ unit_variable }}
)
discretes <- data %>%
summarize_discretes(id_var_names = id_var_names)
summaries <- bind_rows(id_presence, missingness, discretes, numerics) %>%
arrange(variable, unit, measure)
return(summaries)
}
# One function deals with different kinds of values.
summarize_ids <- function(data, id_var_names) {
summary_functions <- list(
count = function(x) { n_unique(x) },
proportion = function(x) { n_unique(x) / length(x) },
percentage = function(x) { (n_unique(x) / length(x)) * 100 }
)
summary <- data %>%
select(any_of(id_var_names)) %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "value"
) %>%
group_by(variable) %>%
summarize(
across(value, summary_functions, .names = "{.fn}")
) %>%
pivot_longer(
cols = -variable,
names_to = "measure",
values_to = "value"
) %>%
mutate(unit = "Identity") %>%
select(variable, unit, measure, value)
return(summary)
}
summarize_missingness <- function(data, id_var_names) {
summary_functions <- list(
count = function(x) { sum(x) },
proportion = function(x) { sum(x) / length(x) },
percentage = function(x) { (sum(x) / length(x)) * 100 }
)
summary <- data %>%
select(-any_of(id_var_names)) %>%
mutate(across(
everything(),
is.na
)) %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "value"
) %>%
group_by(variable) %>%
summarize(
across(value, summary_functions, .names = "{.fn}")
) %>%
pivot_longer(
cols = -variable,
names_to = "measure",
values_to = "value"
) %>%
mutate(unit = "Missing") %>%
select(variable, unit, measure, value)
return(summary)
}
summarize_numerics <- function(data,
id_var_names,
variable_meta_file,
name_variable,
unit_variable) {
has_numerics <- data %>%
map_lgl(is.numeric) %>%
any()
if (has_numerics == FALSE) {
empty_result <- tibble::tibble(
variable = character(),
unit = character(),
measure = character(),
value = double(),
.rows = 0
)
return(empty_result)
} else {
# One can add summary functions to the list if something is missing.
summary_functions <- list(
median = function(x) { median(x, na.rm = TRUE) },
mean = function(x) { mean(x, na.rm = TRUE) },
sd = function(x) { sd(x, na.rm = TRUE) },
variance = function(x) { var(x, na.rm = TRUE) },
minimum = function(x) { min(x, na.rm = TRUE) },
maximum = function(x) { max(x, na.rm = TRUE) },
type8_25_centile = function(x) {
quantile(x, probs = c(0.25), na.rm = TRUE, type = 8)
},
type8_75_centile = function(x) {
quantile(x, probs = c(0.75), na.rm = TRUE, type = 8)
},
sum = function(x) { sum(x, na.rm = TRUE) },
interquantile_range = function(x) { stats::IQR(x, na.rm = TRUE, type = 7) },
mad = function(x) {
stats::mad(x, center = median(x, na.rm = TRUE), na.rm = TRUE)
}
)
units <- variable_meta_file %>%
read_csv(col_types = cols(.default = "c")) %>%
select(variable = {{ name_variable }}, unit = {{ unit_variable }}) %>%
drop_na() %>%
distinct()
variables_in_data <- data %>%
select(where(is.numeric)) %>%
colnames()
variables_in_units <- units %>%
pull(variable)
all_have_units <- all(variables_in_data %in% variables_in_units)
# Units should be clear and exist in variable metadata.
if (not(all_have_units)) {
vars_without_units <- variables_in_data %>%
magrittr::extract(!(variables_in_data %in% variables_in_units))
stop_message <- str_c(
"Units not found for ",
length(vars_without_units),
" variables in data."
)
cat(
"Units missing in metadata for...\n",
str_c(vars_without_units, collapse = ", "),
"\n"
)
stop(stop_message)
}
summary <- data %>%
select(-any_of(id_var_names)) %>%
select_if(is.numeric) %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "value"
) %>%
filter(not(is.na(value))) %>%
group_by(variable) %>%
summarize(
across(value, summary_functions, .names = "{.fn}")
) %>%
pivot_longer(
cols = -variable,
names_to = "measure",
values_to = "value"
) %>%
left_join(units, by = "variable") %>%
select(variable, unit, measure, value)
return(summary)
}
}
summarize_discretes <- function(data, id_var_names) {
assert_that("data.frame" %in% class(data))
assert_that(is.character(id_var_names))
assert_that(length(id_var_names) > 0)
is_discrete <- function(x) {
!is.numeric(x)
}
has_discretes <- data %>%
select(-any_of(id_var_names)) %>%
map_lgl(is_discrete) %>%
any()
if (has_discretes == FALSE) {
empty_result <- tibble::tibble(
variable = character(),
unit = character(),
measure = character(),
value = double(),
.rows = 0
)
return(empty_result)
} else {
summary <- data %>%
select(-any_of(id_var_names)) %>%
select(where(is_discrete)) %>%
pivot_longer(
cols = everything(),
names_to = "variable",
values_to = "value"
) %>%
filter(!is.na(value)) %>%
group_by(variable, value) %>%
summarise(count = n(), .groups = "drop_last") %>%
mutate(
proportion = count / sum(count),
percentage = proportion * 100,
odds = proportion / (1 - proportion)
) %>%
ungroup() %>%
rename(unit = value) %>%
pivot_longer(
cols = count:odds,
names_to = "measure",
values_to = "value"
) %>%
select(variable, unit, measure, value)
return(summary)
}
}
# Then one can apply the summarizer by subgroup with this function.
summarize_subgroups <- function(data,
id_var_names,
subgroup_variable,
variable_meta_file,
name_variable,
unit_variable,
description_variable) {
assert_that("data.frame" %in% class(data))
assert_that(is.character(id_var_names))
assert_that(length(id_var_names) > 0)
assert_that(is.character(variable_meta_file))
assert_that(length(variable_meta_file) == 1)
subgroup_summaries <- data %>%
nest(data = -{{ subgroup_variable }}) %>%
mutate(summary = map(
data,
summarize_variables,
id_var_names = id_var_names,
variable_meta_file = variable_meta_file,
name_variable = {{ name_variable }},
unit_variable = {{ unit_variable }},
description_variable = {{ description_variable }}
)) %>%
select({{ subgroup_variable }}, summary) %>%
unnest(summary)
return(subgroup_summaries)
}
# Then we need to the summaries and present them
# in the Table 1 style. This requires lots of annoying string-formatting.
tabulate_summary <- function(data,
id_var_names,
subgroup_variable,
subgroups,
variable_meta_file,
name_variable,
unit_variable,
description_variable) {
assert_that("data.frame" %in% class(data))
assert_that(is.character(id_var_names))
assert_that(length(id_var_names) > 0)
assert_that(is.character(subgroups))
assert_that(length(subgroups) > 0)
assert_that(is.character(variable_meta_file))
assert_that(length(variable_meta_file) == 1)
format_summary <- function(summary_table,
id_var_names,
subgroups,
variable_meta_file,
name_variable,
description_variable) {
format_value_col <- function(summary_table) {
assert_that(ncol(summary_table) == 5)
# I just pick these measures as they are generally enough.
measure_ordering <- c("median", "minimum", "maximum", "count", "percentage")
summary_table <- summary_table %>%
filter(measure %in% measure_ordering) %>%
# Make pieces for "median (min–max)" and "count (percentage%)".
mutate(value = clean_number(value)) %>%
mutate(value = str_trim(value, side = "both")) %>%
mutate(value = case_when(
measure == "maximum" ~ str_c(value, ")", sep = ""),
measure == "minimum" ~ str_c("(", value, " –", sep = ""),
measure == "percentage" ~ str_c("(", value, "%)", sep = ""),
TRUE ~ value
)) %>%
# Arrange for concatenation.
mutate(measure = factor(measure, levels = measure_ordering)) %>%
arrange(subgroup, variable, unit, measure) %>%
mutate(measure = as.character(measure)) %>%
# Glue strings.
group_by(subgroup, variable, unit) %>%
summarize(
measure = str_c(measure, collapse = " "),
value = str_c(value, collapse = " "),
.groups = "drop"
) %>%
mutate(value = str_replace(value, " – ", "–")) %>%
select(subgroup, variable, unit, measure, value)
return(summary_table)
}
pivot_subgroups <- function(summary_table, id_var_names, subgroups) {
assert_that(ncol(summary_table) == 5)
summary_table <- summary_table %>%
pivot_wider(names_from = "subgroup", values_from = "value") %>%
select(variable, unit, measure, all_of(subgroups)) %>%
mutate(is_id = variable %in% id_var_names) %>%
arrange(is_id, variable, desc(unit), measure) %>%
select(-is_id)
return(summary_table)
}
format_variable_col <- function(summary_table,
variable_meta_file,
name_variable,
description_variable) {
assert_that(ncol(summary_table) == (3 + length(subgroups)))
variables_in_data <- summary_table %>%
pull("variable") %>%
unique()
description_table <- variable_meta_file %>%
read_csv(col_types = cols(.default = "c")) %>%
select(
variable = {{ name_variable }},
description = {{ description_variable }}
) %>%
filter(variable %in% variables_in_data) %>%
drop_na() %>%
distinct()
vars_with_descriptions <- description_table %>%
pull(variable)
# All variables should have descriptive names in variable metadata.
all_have_descriptions <- all(variables_in_data %in% vars_with_descriptions)
if (not(all_have_descriptions)) {
vars_without_desc <- variables_in_data %>%
magrittr::extract(!(variables_in_data %in% vars_with_descriptions))
stop_message <- str_c(
"Description is missing in metadata for ",
length(vars_without_desc),
" variable(s) in data (",
str_c(vars_without_desc, collapse = ", "),
")."
)
stop(stop_message)
}
summary_table <- description_table %>%
right_join(summary_table, by = "variable") %>%
select(-variable) %>%
rename(variable = description)
is_duplicate <- summary_table %>%
pull("variable") %>%
base::duplicated()
summary_table <- summary_table %>%
mutate(is_duplicate_value = is_duplicate) %>%
mutate(variable = if_else(
condition = is_duplicate_value,
true = " ",
false = variable
)) %>%
select(-is_duplicate_value)
return(summary_table)
}
format_measure_col <- function(summary_table) {
summary_table <- summary_table %>%
mutate(measure = str_replace(
measure,
pattern = "minimum maximum",
replacement = "(range)"
)) %>%
mutate(measure = str_replace(
measure,
pattern = "percentage",
replacement = "(%)")
) %>%
mutate(measure = str_to_sentence(measure)) %>%
group_by(variable) %>%
mutate(measure = if_else(
condition = base::duplicated(measure),
true = " ",
false = measure
)) %>%
ungroup()
return(summary_table)
}
format_subgroup_cols <- function(summary_table, subgroups) {
summary_table <- summary_table %>%
mutate(across(
all_of(subgroups),
function(x) { if_else(is.na(x), true = "0 (0%)", false = x) }
))
return(summary_table)
}
summary_table <- summary_table %>%
filter(
measure %in% c("median", "minimum", "maximum", "count", "percentage")
) %>%
filter(subgroup %in% subgroups) %>%
format_value_col() %>%
pivot_subgroups(
id_var_names = id_var_names,
subgroups = subgroups
) %>%
format_measure_col() %>%
format_variable_col(
variable_meta_file = variable_meta_file,
name_variable = {{ name_variable }},
description_variable = {{ description_variable }}
) %>%
format_subgroup_cols(subgroups = subgroups) %>%
rename(Variable = variable, Unit = unit, Measure = measure)
return(summary_table)
}
summary_table <- data %>%
summarize_subgroups(
id_var_names = id_var_names,
subgroup_variable = {{ subgroup_variable }},
variable_meta_file = variable_meta_file,
name_variable = {{ name_variable }},
unit_variable = {{ unit_variable }},
description_variable = {{ description_variable }}
) %>%
rename(subgroup = {{ subgroup_variable }}) %>%
format_summary(
id_var_names = id_var_names,
subgroups = subgroups,
variable_meta_file = variable_meta_file,
name_variable = {{ name_variable }},
description_variable = {{ description_variable }}
)
return(summary_table)
}