Skip to main content eteppo

Automatic Subgrouped Summaries (Table 1) in R

Published: 2023-08-04
Updated: 2023-08-04

The so-called Table 1 often contains summaries of some descriptive variables of the units of the datasets. Table 2 then shows the results of some model fit. (And the cleverly named “Table 2 fallacy” means that you wrongly interpret all parameters from your model causally.)

It is always best to use proper packages to do things for you but how could you write a function to make that table automatically? A good general solution could definitely speed things up a lot. Here’s an example.

# First have a function that computes every summary you can imagine
# on all of the variables in your dataset.
summarize_variables <- function(data, 
                                # It's good to keep ID variables on board
                                # even though you need to ignore them all the time.
                                id_var_names, 
                                # Metadata for variables includes 
                                # units of measurement and
                                # human-readable names to use in the table.
                                variable_meta_file,
                                name_variable,
                                unit_variable,
                                description_variable) {

  id_presence <- data %>%
    summarize_ids(id_var_names = id_var_names)

  missingness <- data %>%
    summarize_missingness(id_var_names = id_var_names)

  numerics <- data %>%
    summarize_numerics(
      id_var_names = id_var_names, 
      variable_meta_file = variable_meta_file,
      name_variable = {{ name_variable }},
      unit_variable = {{ unit_variable }}
    )

  discretes <- data %>%
    summarize_discretes(id_var_names = id_var_names)

  summaries <- bind_rows(id_presence, missingness, discretes, numerics) %>%
    arrange(variable, unit, measure)
  
  return(summaries)
  
}

# One function deals with different kinds of values.
summarize_ids <- function(data, id_var_names) {

  summary_functions <- list(
    count = function(x) { n_unique(x) },
    proportion = function(x) { n_unique(x) / length(x) },
    percentage = function(x) { (n_unique(x) / length(x)) * 100 }
  )

  summary <- data %>%
    select(any_of(id_var_names)) %>%
    pivot_longer(
      cols = everything(), 
      names_to = "variable", 
      values_to = "value"
    ) %>%
    group_by(variable) %>%
    summarize(
      across(value, summary_functions, .names = "{.fn}") 
    ) %>%
    pivot_longer(
      cols = -variable, 
      names_to = "measure", 
      values_to = "value"
    ) %>%
    mutate(unit = "Identity") %>%
    select(variable, unit, measure, value)
  
  return(summary)

}

summarize_missingness <- function(data, id_var_names) {
  
  summary_functions <- list(
    count = function(x) { sum(x) },
    proportion = function(x) { sum(x) / length(x) },
    percentage = function(x) { (sum(x) / length(x)) * 100 }
  )
  
  summary <- data %>%
    select(-any_of(id_var_names)) %>%
    mutate(across(
      everything(),
      is.na
    )) %>%
    pivot_longer(
      cols = everything(), 
      names_to = "variable", 
      values_to = "value"
    ) %>%
    group_by(variable) %>%
    summarize(
      across(value, summary_functions, .names = "{.fn}") 
    ) %>%
    pivot_longer(
      cols = -variable, 
      names_to = "measure", 
      values_to = "value"
    ) %>%
    mutate(unit = "Missing") %>%
    select(variable, unit, measure, value)
  
  return(summary)
  
}

summarize_numerics <- function(data, 
                               id_var_names, 
                               variable_meta_file,
                               name_variable,
                               unit_variable) {
  
  has_numerics <- data %>%
    map_lgl(is.numeric) %>%
    any()

  if (has_numerics == FALSE) {

    empty_result <- tibble::tibble(
      variable = character(),
      unit = character(),
      measure = character(),
      value = double(),
      .rows = 0
    )

    return(empty_result)

  } else {
    
    # One can add summary functions to the list if something is missing.
    summary_functions <- list(
      median = function(x) { median(x, na.rm = TRUE) },
      mean = function(x) { mean(x, na.rm = TRUE) },
      sd = function(x) { sd(x, na.rm = TRUE) },
      variance = function(x) { var(x, na.rm = TRUE) },
      minimum = function(x) { min(x, na.rm = TRUE) },
      maximum = function(x) { max(x, na.rm = TRUE) },
      type8_25_centile = function(x) { 
        quantile(x, probs = c(0.25), na.rm = TRUE, type = 8) 
      },
      type8_75_centile = function(x) { 
        quantile(x, probs = c(0.75), na.rm = TRUE, type = 8) 
      },
      sum = function(x) { sum(x, na.rm = TRUE) },
      interquantile_range = function(x) { stats::IQR(x, na.rm = TRUE, type = 7) },
      mad = function(x) { 
        stats::mad(x, center = median(x, na.rm = TRUE), na.rm = TRUE) 
      }
    )
    
    units <- variable_meta_file %>%
      read_csv(col_types = cols(.default = "c")) %>%
      select(variable = {{ name_variable }}, unit = {{ unit_variable }}) %>%
      drop_na() %>%
      distinct()

    variables_in_data <- data %>%
      select(where(is.numeric)) %>%
      colnames()

    variables_in_units <- units %>%
      pull(variable)
    
    all_have_units <- all(variables_in_data %in% variables_in_units)
    
    # Units should be clear and exist in variable metadata.
    if (not(all_have_units)) {
      
      vars_without_units <- variables_in_data %>%
        magrittr::extract(!(variables_in_data %in% variables_in_units))
        
      stop_message <- str_c(
        "Units not found for ", 
        length(vars_without_units),
        " variables in data."
      )
      
      cat(
        "Units missing in metadata for...\n",
        str_c(vars_without_units, collapse = ", "),
        "\n"
      )

      stop(stop_message)

    }

    summary <- data %>%
      select(-any_of(id_var_names)) %>%
      select_if(is.numeric) %>%
      pivot_longer(
        cols = everything(), 
        names_to = "variable", 
        values_to = "value"
      ) %>%
      filter(not(is.na(value))) %>%
      group_by(variable) %>%
      summarize(
        across(value, summary_functions, .names = "{.fn}") 
      ) %>%
      pivot_longer(
        cols = -variable,
        names_to = "measure", 
        values_to = "value"
      ) %>%
      left_join(units, by = "variable") %>%
      select(variable, unit, measure, value)
      
    return(summary)

  }
  
}  

summarize_discretes <- function(data, id_var_names) {

  assert_that("data.frame" %in% class(data))
  assert_that(is.character(id_var_names))
  assert_that(length(id_var_names) > 0)
  
  is_discrete <- function(x) {
    !is.numeric(x) 
  }

  has_discretes <- data %>%
    select(-any_of(id_var_names)) %>%
    map_lgl(is_discrete) %>%
    any()
  
  if (has_discretes == FALSE) {
    
    empty_result <- tibble::tibble(
      variable = character(),
      unit = character(),
      measure = character(),
      value = double(),
      .rows = 0
    )

    return(empty_result)

  } else {
  
    summary <- data %>%
      select(-any_of(id_var_names)) %>%
      select(where(is_discrete)) %>%
      pivot_longer(
        cols = everything(), 
        names_to = "variable", 
        values_to = "value"
      ) %>%
      filter(!is.na(value)) %>%
      group_by(variable, value) %>%
      summarise(count = n(), .groups = "drop_last") %>%
      mutate(
        proportion = count / sum(count),
        percentage = proportion * 100,
        odds = proportion / (1 - proportion)
      ) %>%
      ungroup() %>%
      rename(unit = value) %>%
      pivot_longer(
        cols = count:odds, 
        names_to = "measure", 
        values_to = "value"
      ) %>%
      select(variable, unit, measure, value)
    
    return(summary)

  }
    
}

# Then one can apply the summarizer by subgroup with this function.
summarize_subgroups <- function(data, 
                                id_var_names, 
                                subgroup_variable,
                                variable_meta_file,
                                name_variable,
                                unit_variable,
                                description_variable) {
  
  assert_that("data.frame" %in% class(data))
  assert_that(is.character(id_var_names))
  assert_that(length(id_var_names) > 0)
  assert_that(is.character(variable_meta_file))
  assert_that(length(variable_meta_file) == 1)

  subgroup_summaries <- data %>%
    nest(data = -{{ subgroup_variable }}) %>%
    mutate(summary = map(
      data,
      summarize_variables, 
      id_var_names = id_var_names, 
      variable_meta_file = variable_meta_file,
      name_variable = {{ name_variable }},
      unit_variable = {{ unit_variable }},
      description_variable = {{ description_variable }}
    )) %>%
    select({{ subgroup_variable }}, summary) %>%
    unnest(summary)

  return(subgroup_summaries)
  
}

# Then we need to the summaries and present them
# in the Table 1 style. This requires lots of annoying string-formatting.
tabulate_summary <- function(data, 
                             id_var_names, 
                             subgroup_variable,
                             subgroups,
                             variable_meta_file,
                             name_variable,
                             unit_variable,
                             description_variable) {

  assert_that("data.frame" %in% class(data))
  assert_that(is.character(id_var_names))
  assert_that(length(id_var_names) > 0)
  assert_that(is.character(subgroups))
  assert_that(length(subgroups) > 0)
  assert_that(is.character(variable_meta_file))
  assert_that(length(variable_meta_file) == 1)

  format_summary <- function(summary_table,
                             id_var_names,
                             subgroups,
                             variable_meta_file,
                             name_variable,
                             description_variable) {

    format_value_col <- function(summary_table) {
      
      assert_that(ncol(summary_table) == 5)

      # I just pick these measures as they are generally enough.
      measure_ordering <- c("median", "minimum", "maximum", "count", "percentage")

      summary_table <- summary_table %>%
        filter(measure %in% measure_ordering) %>%
        # Make pieces for "median (min–max)" and "count (percentage%)".
        mutate(value = clean_number(value)) %>%
        mutate(value = str_trim(value, side = "both")) %>%      
        mutate(value = case_when(
          measure == "maximum" ~ str_c(value, ")", sep = ""), 
          measure == "minimum" ~ str_c("(", value, " –", sep = ""),
          measure == "percentage" ~ str_c("(", value, "%)", sep = ""),
          TRUE ~ value
        )) %>%
        # Arrange for concatenation.
        mutate(measure = factor(measure, levels = measure_ordering)) %>%
        arrange(subgroup, variable, unit, measure) %>%
        mutate(measure = as.character(measure)) %>%
        # Glue strings.
        group_by(subgroup, variable, unit) %>%
        summarize(
          measure = str_c(measure, collapse = " "),
          value = str_c(value, collapse = " "),
          .groups = "drop"
        ) %>%
        mutate(value = str_replace(value, " – ", "–")) %>%
        select(subgroup, variable, unit, measure, value)

      return(summary_table)
      
    }

    pivot_subgroups <- function(summary_table, id_var_names, subgroups) {

      assert_that(ncol(summary_table) == 5)

      summary_table <- summary_table %>%
        pivot_wider(names_from = "subgroup", values_from = "value") %>%
        select(variable, unit, measure, all_of(subgroups)) %>%
        mutate(is_id = variable %in% id_var_names) %>%        
        arrange(is_id, variable, desc(unit), measure) %>%
        select(-is_id)

      return(summary_table)

    }

    format_variable_col <- function(summary_table, 
                                    variable_meta_file, 
                                    name_variable, 
                                    description_variable) {
      
      assert_that(ncol(summary_table) == (3 + length(subgroups)))
      
      variables_in_data <- summary_table %>%
        pull("variable") %>%
        unique()

      description_table <- variable_meta_file %>%
        read_csv(col_types = cols(.default = "c")) %>%
        select(
          variable = {{ name_variable }}, 
          description = {{ description_variable }}
        ) %>%
        filter(variable %in% variables_in_data) %>%
        drop_na() %>%
        distinct()

      vars_with_descriptions <- description_table %>%
          pull(variable)

      # All variables should have descriptive names in variable metadata.
      all_have_descriptions <- all(variables_in_data %in% vars_with_descriptions)
      if (not(all_have_descriptions)) {
        
        vars_without_desc <- variables_in_data %>%
          magrittr::extract(!(variables_in_data %in% vars_with_descriptions))

        stop_message <- str_c(
          "Description is missing in metadata for ",
          length(vars_without_desc),
          " variable(s) in data (",
          str_c(vars_without_desc, collapse = ", "),
          ")."
        )

        stop(stop_message)
      
      }

      summary_table <- description_table %>%
        right_join(summary_table, by = "variable") %>%
        select(-variable) %>%
        rename(variable = description)
      
      is_duplicate <- summary_table %>% 
          pull("variable") %>% 
          base::duplicated()

      summary_table <- summary_table %>%
        mutate(is_duplicate_value = is_duplicate) %>%
        mutate(variable = if_else(
          condition = is_duplicate_value, 
          true = " ", 
          false = variable
        )) %>%
        select(-is_duplicate_value)

      return(summary_table)
      
    }

    format_measure_col <- function(summary_table) {
      
      summary_table <- summary_table %>%
        mutate(measure = str_replace(
          measure, 
          pattern = "minimum maximum", 
          replacement = "(range)"
        )) %>%
        mutate(measure = str_replace(
          measure, 
          pattern = "percentage", 
          replacement = "(%)")
        ) %>%
        mutate(measure = str_to_sentence(measure)) %>%
        group_by(variable) %>%
        mutate(measure = if_else(
          condition = base::duplicated(measure),
          true = " ", 
          false = measure
        )) %>%
        ungroup()

      return(summary_table)
      
    }

    format_subgroup_cols <- function(summary_table, subgroups) {
      
      summary_table <- summary_table %>%
        mutate(across(
          all_of(subgroups),
          function(x) { if_else(is.na(x), true = "0 (0%)", false = x) }
        ))

      return(summary_table)

    }

    summary_table <- summary_table %>%
      filter(
        measure %in% c("median", "minimum", "maximum", "count", "percentage")
      ) %>%
      filter(subgroup %in% subgroups) %>%
      format_value_col() %>%
      pivot_subgroups(
        id_var_names = id_var_names, 
        subgroups = subgroups
      ) %>%
      format_measure_col() %>%
      format_variable_col(
        variable_meta_file = variable_meta_file, 
        name_variable = {{ name_variable }}, 
        description_variable = {{ description_variable }}
      ) %>%
      format_subgroup_cols(subgroups = subgroups) %>%
      rename(Variable = variable, Unit = unit, Measure = measure)

    return(summary_table)

  }
 
  summary_table <- data %>%
    summarize_subgroups(
      id_var_names = id_var_names, 
      subgroup_variable = {{ subgroup_variable }},
      variable_meta_file = variable_meta_file,
      name_variable = {{ name_variable }},
      unit_variable = {{ unit_variable }},
      description_variable = {{ description_variable }}
    ) %>%
    rename(subgroup = {{ subgroup_variable }}) %>%
    format_summary(
      id_var_names = id_var_names,
      subgroups = subgroups,
      variable_meta_file = variable_meta_file,
      name_variable = {{ name_variable }},
      description_variable = {{ description_variable }}
    )

  return(summary_table)

}