Skip to main content eteppo

Benjamini-Hochberg Adjusted Mann Whitney U Tests in R

Published: 2023-08-04
Updated: 2023-08-04

Mann Whitney U test is a very general, little-assuming comparison of two continuous distributions. Benjamini-Hochberg is a simple way to interpret a large amount of p-values as a single batch so that in that batch you would make a certain proportion of false null hypothesis rejections in the usual weird frequentist way of thinking. Combining these, you can make a large amount of hypothesis tests with relative ease in your mind. In the bigger picture though, consider this as one of your last options.

When we want to things in the tidyverse way, this is what you might get.

# Helper for dealing with missing values.
# Sometimes you want them to be NA, sometimes for printing etc. "NA"...
implicit_missing <- function(x, empty_value) {
  assert_that(is.character(x))
  magrittr::inset(x, is.na(x), empty_value)
}

# Mann Whitney U test for one comparison.
# We will apply this repeatedly later.
learn_mann_whitney_u <- function(data, formula) {
  
  # Check the input.
  assert_that(is.data.frame(data))
  assert_that("formula" %in% class(formula), not(is.character(formula)))

  # Predictor should be a binary variable.
  is_qualifying <- function(data, formula) {
    binary_var_name <- formula %>%
      all.vars() %>%
      magrittr::extract(2)
    n_obs_levels <- data %>%
      magrittr::extract2(binary_var_name) %>%
      remove_na() %>%
      unique() %>%
      length()
    is_binary <- n_obs_levels == 2
    return(is_binary)
  }
  
  # Instead of an error, return empty results for non-qualifying inputs.
  # It will be obvious in the results.
  if (!is_qualifying(data, formula)) {
    
    empty_result <- tibble::tibble(
      median_difference = NA_real_,
      statistic = NA_real_,
      p.value = NA_real_,
      conf.low = NA_real_,
      conf.high = NA_real_,
      method = NA_character_,
      alternative = NA_character_,
      exp2_median_diff = NA_real_
    )

    # Note: Invalid inputs cause only warnings.
    warning("Incorrect input: returning empty result.")

    return(empty_result)
  
  } else {

    # This is the actual test. Be careful with the settings and what they mean.
    result <- stats::wilcox.test(
        formula = formula,
        data = data,
        paired = FALSE, 
        exact = FALSE, 
        mu = 0,
        conf.int = TRUE,
        conf.level = 0.95,
        alternative = "two.sided",
        na.action = "na.omit"
      ) %>%
      broom::tidy() %>%
      rename(median_difference = estimate) %>%
      mutate(exp2_median_diff = 2^median_difference)

    return(result)

  }
}

# Benjamini-Hochberg adjustment for tidy parameters.
learn_benjamini_hochberg <- function(parameters, p_value) {
  p_values <- pull(parameters, {{ p_value }})

  if (all(is.na(p_values))) {
    parameters <- mutate(parameters, bh_fdr = NA_real_)
    warning("Missing values were returned because all input p-values were missing.")
    return(parameters)

  } else {
    bh_fdrs <- stats::p.adjust(p_values, method = "BH")
    parameters <- mutate(parameters, bh_fdr = bh_fdrs)
    return(parameters)
  }
}

# Now apply to a vector of outcomes.
learn_mwu_bh <- function(data, predictor, outcomes, outcome_set) {

  # Always check the inputs. Future-you will be thankful.
  assert_that("data.frame" %in% class(data))
  assert_that(nrow(data) > 1)
  assert_that(is.character(predictor)) 
  assert_that(length(predictor) == 1)
  
  # Predictor must be binary.
  assert_that(
    is_binary(data[[predictor]]), 
    msg = str_c(
        "Predictor '", 
        predictor, 
        "' is not binary with unique values {",
        str_c(unique(implicit_missing(data[[predictor]], "NA")), collapse = ", "),
        "}."
    )
  )

  assert_that(is.character(outcome_set))
  assert_that(length(outcome_set) == 1)
  assert_that(is.character(outcomes))
  assert_that(length(outcomes) > 1)

  data %>%
    pivot_longer(
      cols = any_of(outcomes),
      names_to = "outcome_name", 
      values_to = "outcome_value"
    ) %>%
    nest(data = -outcome_name) %>%
    mutate(formula = str_c(
      "outcome_value ~ ", 
      predictor, 
      collapse = ""
    )) %>%
    rowwise() %>%
    mutate(parameters = list(
      learn_mann_whitney_u(data, formula = as.formula(formula))
    )) %>%
    ungroup() %>%
    select(-data, -formula) %>%
    unnest(parameters) %>%
    rename("{outcome_set}" := outcome_name) %>%
    learn_benjamini_hochberg(p_value = p.value)

}