Summarize All Variables Automatically in R
A basic inspection you often want to perform on a dataset is to compute different summaries of all the variables and make sure they make sense. There certainly exist packages that help with this, but it’s not very difficult to write your own.
Often you pick a particular summary measure for your report but other people might have picked other kinds of measures. So a rich summarization of a whole dataset for all measures could be a good thing to share in supplemental files or the equivalent of that. Still, visualizations are definitely the best when it comes to getting a sense of the distributions.
#' Summarize every variable of a dataset in one long table.
#'
#' Collects the ID, missingness, discrete and numeric summaries produced by
#' the helper functions of this file, stacks them and sorts the result.
#'
#' @param data The dataset to summarize.
#' @param id_var_names Names of the ID columns; passed on to every helper.
#' @param variable_meta_file Metadata file, passed on to `summarize_numerics()`.
#' @param name_variable Metadata column holding the variable names (passed on
#'   to `summarize_numerics()`).
#' @param unit_variable Metadata column holding the units (passed on to
#'   `summarize_numerics()`).
#' @param description_variable Accepted, but not used by this function.
#' @return The stacked summaries, sorted by `variable`, `unit` and `measure`.
summarize_variables <- function(data,
                                id_var_names,
                                variable_meta_file,
                                name_variable,
                                unit_variable,
                                description_variable) {
  id_part <- summarize_ids(data, id_var_names = id_var_names)
  missing_part <- summarize_missingness(data, id_var_names = id_var_names)
  numeric_part <- summarize_numerics(
    data,
    id_var_names = id_var_names,
    variable_meta_file = variable_meta_file,
    name_variable = {{ name_variable }},
    unit_variable = {{ unit_variable }}
  )
  discrete_part <- summarize_discretes(data, id_var_names = id_var_names)

  # Stacking order matters for ties because the sort below is applied to the
  # stacked rows: IDs, missingness, discretes, then numerics.
  stacked <- bind_rows(id_part, missing_part, discrete_part, numeric_part)
  arrange(stacked, variable, unit, measure)
}
#' Summarize the ID variables of any dataset (part of summarize_variables).
#'
#' For every column of `data` listed in `id_var_names`, reports the result of
#' `n_unique()` (`count`) and that number relative to the number of rows
#' (`proportion`, `percentage`).
#'
#' @param data A data frame.
#' @param id_var_names Character vector with the names of the ID columns.
#'   Names that are not columns of `data` are ignored.
#' @return A tibble with the columns `variable`, `unit` (always "Identity"),
#'   `measure` and `value`; one row per ID variable and measure.
summarize_ids <- function(data, id_var_names) {
  # NOTE(review): n_unique() is not defined in this file; presumably it counts
  # distinct values -- confirm which package provides it.
  summary_functions <- list(
    count = function(x) n_unique(x),
    proportion = function(x) n_unique(x) / length(x),
    percentage = function(x) (n_unique(x) / length(x)) * 100
  )

  data %>%
    select(any_of(id_var_names)) %>%
    # ID columns frequently differ in type (e.g. integer and character) and
    # pivot_longer() cannot stack such columns into a single one, so they are
    # all turned into character first. Only distinctness matters here.
    mutate(across(everything(), as.character)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "variable",
      values_to = "value"
    ) %>%
    group_by(variable) %>%
    summarize(
      across(value, summary_functions, .names = "{.fn}")
    ) %>%
    pivot_longer(
      cols = -variable,
      names_to = "measure",
      values_to = "value"
    ) %>%
    mutate(unit = "Identity") %>%
    select(variable, unit, measure, value)
}
#' Summarize missing values of any dataset (part of summarize_variables).
#'
#' Every non-ID column is replaced by its `is.na()` flags; per column the
#' number of flagged values (`count`) and that number relative to the number
#' of rows (`proportion`, `percentage`) are reported.
#'
#' @param data A data frame.
#' @param id_var_names Names of the ID columns, which are left out.
#' @return A tibble with the columns `variable`, `unit` (always "Missing"),
#'   `measure` and `value`.
summarize_missingness <- function(data, id_var_names) {
  missing_measures <- list(
    count = sum,
    proportion = function(flags) sum(flags) / length(flags),
    percentage = function(flags) (sum(flags) / length(flags)) * 100
  )

  # One logical column per non-ID variable: TRUE where the value is missing.
  non_id_columns <- select(data, -any_of(id_var_names))
  na_flags <- mutate(non_id_columns, across(everything(), is.na))

  long_flags <- pivot_longer(
    na_flags,
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )

  per_variable <- summarize(
    group_by(long_flags, variable),
    across(value, missing_measures, .names = "{.fn}")
  )

  long_measures <- pivot_longer(
    per_variable,
    cols = -variable,
    names_to = "measure",
    values_to = "value"
  )

  labelled <- mutate(long_measures, unit = "Missing")
  result <- select(labelled, variable, unit, measure, value)
  return(result)
}
#' Summarize numeric variables of any dataset (part of summarize_variables).
#'
#' Computes location, spread and range measures for every numeric column of
#' `data` that is not an ID column and attaches the unit of each variable as
#' read from a metadata file. Missing values are removed before summarizing.
#'
#' @param data A data frame.
#' @param id_var_names Names of the ID columns; they are neither summarized
#'   nor required to have a unit.
#' @param variable_meta_file Metadata file handed to `read_csv()`; all of its
#'   columns are read as character.
#' @param name_variable Column of the metadata file holding variable names.
#' @param unit_variable Column of the metadata file holding the units.
#' @return A tibble with the columns `variable`, `unit`, `measure` and
#'   `value`. It has zero rows when there is no non-ID numeric column; in
#'   that case the metadata file is not read.
#' @details Stops with an error naming the affected variables when a
#'   summarized variable has no unit in the metadata.
summarize_numerics <- function(data,
                               id_var_names,
                               variable_meta_file,
                               name_variable,
                               unit_variable) {
  # Restrict to the columns that are actually summarized *before* any check,
  # so that a numeric ID column neither triggers the summary nor needs a unit.
  numeric_data <- data %>%
    select(-any_of(id_var_names)) %>%
    select(where(is.numeric))

  if (ncol(numeric_data) == 0) {
    empty_result <- tibble::tibble(
      variable = character(),
      unit = character(),
      measure = character(),
      value = double(),
      .rows = 0
    )
    return(empty_result)
  }

  summary_functions <- list(
    median = function(x) median(x, na.rm = TRUE),
    mean = function(x) mean(x, na.rm = TRUE),
    sd = function(x) sd(x, na.rm = TRUE),
    variance = function(x) var(x, na.rm = TRUE),
    minimum = function(x) min(x, na.rm = TRUE),
    maximum = function(x) max(x, na.rm = TRUE),
    type8_25_centile = function(x) {
      quantile(x, probs = c(0.25), na.rm = TRUE, type = 8)
    },
    type8_75_centile = function(x) {
      quantile(x, probs = c(0.75), na.rm = TRUE, type = 8)
    },
    sum = function(x) sum(x, na.rm = TRUE),
    # NOTE(review): the IQR uses quantile type 7 while the centiles above use
    # type 8, so it need not equal their difference -- confirm this is wanted.
    interquantile_range = function(x) {
      stats::IQR(x, na.rm = TRUE, type = 7)
    },
    mad = function(x) {
      stats::mad(x, center = median(x, na.rm = TRUE), na.rm = TRUE)
    }
  )

  # Rows with a missing name or unit are dropped, so such variables count as
  # having no unit.
  # NOTE(review): a variable listed with several different units would
  # duplicate its rows in the join below -- confirm the metadata rules it out.
  units <- variable_meta_file %>%
    read_csv(col_types = cols(.default = "c")) %>%
    select(variable = {{ name_variable }}, unit = {{ unit_variable }}) %>%
    drop_na() %>%
    distinct()

  vars_without_units <- setdiff(colnames(numeric_data), units$variable)
  if (length(vars_without_units) > 0) {
    # The variable names are part of the error itself, so they also reach
    # callers that catch or log the condition.
    stop(
      "Units not found for ",
      length(vars_without_units),
      " variables in data. Units missing in metadata for: ",
      str_c(vars_without_units, collapse = ", "),
      call. = FALSE
    )
  }

  numeric_data %>%
    # Doubles avoid integer overflow (NA with a warning) in sum().
    mutate(across(everything(), as.double)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "variable",
      values_to = "value"
    ) %>%
    filter(!is.na(value)) %>%
    group_by(variable) %>%
    summarize(
      across(value, summary_functions, .names = "{.fn}")
    ) %>%
    pivot_longer(
      cols = -variable,
      names_to = "measure",
      values_to = "value"
    ) %>%
    left_join(units, by = "variable") %>%
    select(variable, unit, measure, value)
}
#' Summarize discrete variables of any dataset (part of summarize_variables).
#'
#' Every non-ID column that is not numeric is treated as discrete. For each
#' observed (non-missing) value of such a column the number of rows (`count`),
#' its share within the variable (`proportion`, `percentage`) and the
#' corresponding `odds` (`proportion / (1 - proportion)`) are reported.
#'
#' @param data A data frame.
#' @param id_var_names Non-empty character vector with the names of the ID
#'   columns, which are left out.
#' @return A tibble with the columns `variable`, `unit` (the observed value,
#'   as character), `measure` and `value`. It has zero rows when there is no
#'   discrete non-ID column.
summarize_discretes <- function(data, id_var_names) {
  assert_that(is.data.frame(data))
  assert_that(is.character(id_var_names))
  assert_that(length(id_var_names) > 0)

  is_discrete <- function(x) {
    !is.numeric(x)
  }

  discrete_data <- data %>%
    select(-any_of(id_var_names)) %>%
    select(where(is_discrete))

  if (ncol(discrete_data) == 0) {
    empty_result <- tibble::tibble(
      variable = character(),
      unit = character(),
      measure = character(),
      value = double(),
      .rows = 0
    )
    return(empty_result)
  }

  discrete_data %>%
    # Discrete columns can be logical, factor, character, Date, ...;
    # pivot_longer() cannot stack incompatible types into one column, and the
    # observed values end up in `unit`, which is character in the other
    # summaries of this file. Hence everything is turned into character.
    mutate(across(everything(), as.character)) %>%
    pivot_longer(
      cols = everything(),
      names_to = "variable",
      values_to = "value"
    ) %>%
    filter(!is.na(value)) %>%
    group_by(variable, value) %>%
    # Keep the grouping by `variable` so that the shares below are computed
    # within each variable.
    summarise(count = n(), .groups = "drop_last") %>%
    mutate(
      proportion = count / sum(count),
      percentage = proportion * 100,
      odds = proportion / (1 - proportion)
    ) %>%
    ungroup() %>%
    rename(unit = value) %>%
    pivot_longer(
      cols = count:odds,
      names_to = "measure",
      values_to = "value"
    ) %>%
    select(variable, unit, measure, value)
}