Taking Random Subsets of Dataframes in R
Whether R is very inefficient (use Julia) or your algorithms are, it’s sometimes useful anyway to run everything on random subsets of the whole dataset. The tidyverse
didn’t seem to have a straightforward function for this, so I wrote one.
#' Take a random subset of a data frame's columns and rows
#'
#' It's often more reliable to hang onto the ID variables even if they have to
#' be handled separately: identifiers must never be sampled away or mixed up.
#' Whenever columns are sampled, the ID columns are therefore kept and placed
#' first, followed by the randomly chosen non-ID columns.
#'
#' @param data A data frame or tibble.
#' @param id_var_names Character vector with the names of the ID columns.
#'   These columns are never candidates for sampling.
#' @param max_variables Maximum number of non-ID columns to keep. Columns are
#'   only sampled when `data` has more non-ID columns than this.
#' @param max_observations Maximum number of rows to keep. Rows are only
#'   sampled when `data` has more rows than this.
#' @return `data` reduced to the ID columns plus at most `max_variables`
#'   non-ID columns, and at most `max_observations` rows. If neither limit is
#'   exceeded, `data` is returned unchanged.
subset_random <- function(data, id_var_names, max_variables, max_observations) {
  # Only non-ID columns are candidates; their relative order is preserved.
  candidate_names <- setdiff(names(data), id_var_names)
  n_candidates <- length(candidate_names)

  if (n_candidates > max_variables) {
    # Sample positions with sample.int() rather than sample(x = ...): sample()
    # treats a single number x as 1:x, which could pick ID column positions
    # when there is exactly one candidate.
    picked <- candidate_names[sample.int(n_candidates, size = max_variables)]
    # ID variables first, then the sampled variables.
    data <- dplyr::select(data, dplyr::all_of(c(id_var_names, picked)))
  }

  if (nrow(data) > max_observations) {
    data <- dplyr::slice_sample(data, n = max_observations)
  }

  data
}