Tidy Topological Overlap Matrix (TOM) Networks in R

Published: 2023-08-04
Updated: 2023-08-04
Topological Overlap Matrix (TOM) Networks are a good twist on plain correlation matrices in that this TOM effectively measures indirect relationships between variables. If two variables (nodes) are related indirectly, this can better reveal clusters or modular structures in the network.
WGCNA package makes the calculations fast but returns non-tidy results which once again forces us to do some wrangling in order to use the results easily in our own ways.
# This function returns a list of tidy dataframes for
# nodes, edges, modules, and networks, as well as the
# raw hclust-objects and adjacency matrices.
learn_toms <- function(data, id_var_names, power, min_cluster_size) {
    
    assert_that("data.frame" %in% class(data))
    assert_that(is.integer(power))

    # This calculates a result for one power of the Pearson correlation.
    # Below we apply it repeatedly to a vector of powers.
    learn_tom <- function(data, power, id_var_names, min_cluster_size) {
        assert_that(length(power) == 1)
        assert_that(is.character(id_var_names))
        assert_that(length(id_var_names) > 0)
        
        id_data <- data %>%
          select(any_of(id_var_names))
        
        data <- data %>%
          select(-any_of(id_var_names))

        n_nodes <- data %>%
          colnames() %>%
          length()

        # Compute TOM matrix, scale-freedom measures, and other network concepts.
        # Stay alert about the options used.
        adjacency_matrix <- data %>%
          WGCNA::adjacency(
            power = power, 
            type = "unsigned",
            corFnc = "cor",
            corOptions = list(use = "pairwise.complete.obs", method = "pearson"),
          ) %>%
          WGCNA::TOMsimilarity(
            TOMType = "unsigned",
            TOMDenom = "min"
          ) %>%
          magrittr::set_rownames(colnames(data)) %>%
          magrittr::set_colnames(colnames(data))

        assert_that(sum(is.na(adjacency_matrix)) == 0)

        concepts <- WGCNA::fundamentalNetworkConcepts(
          adj = adjacency_matrix, 
          GS = NULL
        )
        
        # Scale-freedom can be used to select the best network
        # from multiple networks with different powers for the
        # Pearson correlation.
        scale_freedom <- WGCNA::scaleFreeFitIndex(
          k = chuck(concepts, "Connectivity"), 
          nBreaks = 10, 
          removeFirst = FALSE
        )
        
        # Cluster nodes using TOM dissimilarity, hierarchical average 
        # agglomeration, and dynamic tree cutting.
        hclust_object <- as.dist((1 - adjacency_matrix)) %>%
          fastcluster::hclust(method = "average")

        clusters <- hclust_object %>%
          dynamicTreeCut::cutreeDynamic(
            distM = (1 - adjacency_matrix),
            minClusterSize = min_cluster_size
          ) %>%
          WGCNA::labels2colors()

        module_data <- data %>%
          WGCNA::moduleEigengenes(clusters) %>%
          chuck("eigengenes") %>%
          mutate(power = power) %>%
          bind_cols(id_data) %>%
          select(any_of(id_var_names), power, everything())

        cluster_memberships <- module_data %>%
          select(-any_of(id_var_names), -power) %>%
          learn_pearsons(y = data) %>%
          mutate(membership = abs(correlation)) %>%
          select(from = to, cluster = from, membership) %>%
          pivot_wider(
            names_from = "cluster", 
            values_from = "membership"
          )

        assert_that(nrow(cluster_memberships) == n_nodes)

        network_data <- tibble::tibble(power = power) %>%
          mutate(
            scale_freedom_r2 = pull(scale_freedom, "Rsquared.SFT"),
            scale_freedom_ar2 = pull(scale_freedom, "truncatedExponentialAdjRsquared"),
            density = chuck(concepts, "Density"),
            centralization = chuck(concepts, "Centralization"),
            heterogeneity = chuck(concepts, "Heterogeneity")
          )

        assert_that(nrow(network_data) == 1)
        
        node_data <- tibble::tibble(from = colnames(data)) %>%
          mutate(
            power = power,
            cluster = clusters,
            dendrogram_order = hclust_object$order,
            connectivity = chuck(concepts, "Connectivity"),
            scaled_connectivity = chuck(concepts, "ScaledConnectivity"),
            cluster_coefficient = chuck(concepts, "ClusterCoef"),
            maximum_adjacency_ratio = chuck(concepts, "MAR")
          ) %>%
          left_join(cluster_memberships, by = "from") %>%
          arrange(dendrogram_order)

        assert_that(nrow(node_data) == n_nodes)
        
        # Remove loops and directional duplicates.
        is_loop_or_duplicate <- upper.tri(adjacency_matrix, diag = TRUE)
        
        edge_data <- adjacency_matrix %>%
          magrittr::inset(is_loop_or_duplicate, NA_real_) %>%
          magrittr::set_colnames(colnames(data)) %>%
          tibble::as_tibble() %>%
          mutate(from = colnames(data), power = power) %>%
          pivot_longer(
              cols = all_of(colnames(data)), 
              names_to = "to",
              values_to = "tom_similarity"
          ) %>%
          filter(!is.na(tom_similarity))

        result <- list(
          edge_data = edge_data, 
          node_data = node_data, 
          network_data = network_data,
          module_data = module_data,
          # We will also return some raw-ish results.
          # Wrangling again back to these formats is too much.
          hclust_object = hclust_object,
          adjacency_matrix = adjacency_matrix
        )

        return(result)
    }

    # This combines multiple result lists into one list.
    combine_parameters <- function(lists) {

      edge_data <- lists %>%
        map(chuck, "edge_data") %>%
        reduce(bind_rows)
      
      node_data <- lists %>%
        map(chuck, "node_data") %>%
        reduce(bind_rows)
      
      network_data <- lists %>%
        map(chuck, "network_data") %>%
        reduce(bind_rows)
      
      module_data <- lists %>%
          map(chuck, "module_data") %>%
          reduce(bind_rows)

      powers <- network_data %>%
          pull(power)

      hclust_objects <- lists %>%
          map(chuck, "hclust_object") %>%
          magrittr::set_names(as.character(powers))

      adjacencies <- lists %>%
          map(chuck, "adjacency_matrix") %>%
          magrittr::set_names(as.character(powers))

      result <- list(
        edge_data = edge_data, 
        node_data = node_data, 
        network_data = network_data,
        module_data = module_data,
        hclust_objects = hclust_objects,
        adjacencies = adjacencies
      )

      return(result)
    }

    z_scored_data <- data %>%
      mutate(across(-any_of(id_var_names), z_score))

    parameter_list <- power %>%
      map(
        learn_tom, 
        data = z_scored_data, 
        id_var_names = id_var_names,
        min_cluster_size = min_cluster_size
      ) %>%
      combine_parameters()

    return(parameter_list)

}