Skip to main content eteppo

Tidy Uniform Manifold Approximation And Projection in R

Published: 2023-08-04
Updated: 2023-08-04
learn_umaps <- function(data, 
                        id_var_names, 
			n_neighbors = c(10, 15), 
			n_components = 2) {
    
    missing_count <- data %>%
        select(-any_of(id_var_names)) %>%
        as.matrix() %>%
        as.numeric() %>%
        is.na() %>%
        sum()
    
    assert_that(missing_count < 1)
    
    # umap hyperparameters (umap::umap.defaults)
    #            n_neighbors: n_neighbors
    #           n_components: n_components (defaults to 2)
    #                 metric: euclidean
    #               n_epochs: 200
    #                  input: data
    #                   init: spectral
    #               min_dist: 0.1
    #       set_op_mix_ratio: 1
    #     local_connectivity: 1
    #              bandwidth: 1
    #                  alpha: 1
    #                  gamma: 1
    #   negative_sample_rate: 5
    #                      a: NA
    #                      b: NA
    #                 spread: 1
    #           random_state: NA
    #        transform_state: NA
    #            knn_repeats: 1
    #                verbose: FALSE
    #        umap_learn_args: NA 
    
    fit_umap <- function(data, id_var_names, n_neighbors) {
        custom_settings <- umap::umap.defaults
        custom_settings$n_neighbors <- n_neighbors
        custom_settings$n_components <- n_components
        
        umap_object <- data %>%
            select(-any_of(id_var_names)) %>%
            umap::umap(config = custom_settings, method = "naive")
        
        return(umap_object)
    }
    
    # Extract results from the non-tidy objects.
    tidy_umap <- function(umap_object, data, id_var_names) {
        column_names <- str_c("umap", 1:ncol(umap_object$layout), sep = "")

        umap_features <- umap_object$layout %>%
            tibble::as_tibble(.name_repair = ~ column_names)
        
        umap_obs <- data %>%
            select(any_of(id_var_names)) %>%
            bind_cols(umap_features) %>%
            mutate(n_neighbors = umap_object$config$n_neighbors)
        
        return(umap_obs)
    }
    
    # Everything is mapped over multiple neighbor values.
    # You might want to compare small (5) and larger (30) neighborhoods
    # because the meaning of this hyperparameter is not very obvious.
    umap_output <- n_neighbors %>%
        map(fit_umap, data = data, id_var_names = id_var_names) %>% 
        map_dfr(tidy_umap, data = data, id_var_names = id_var_names)
    
    return(umap_output)
    
}