|
| 1 | +#' Create a 3-layer Sankey diagram (Metabolites → MSETs → GSETs) |
| 2 | +#' |
| 3 | +#' Builds a Sankey diagram with three layers based on a hypeR-GEM object: |
| 4 | +#' metabolites → metabolite sets (MSETs) → gene sets (GSETs). |
| 5 | +#' |
| 6 | +#' @param hypeR_GEM_obj A list with components \code{$mapped_metabolite_signatures} |
| 7 | +#' (a list of data frames) and \code{$gene_tables} (a list of data frames). |
| 8 | +#' @param msets_list A list of metabolite-set collections. Each element should be |
| 9 | +#' a named list where names are set names and values are character vectors of |
| 10 | +#' metabolite identifiers matching \code{key}. |
| 11 | +#' @param gsets_list A list of gene-set collections. Each element should be a |
| 12 | +#' named list where names are set names and values are character vectors of genes. |
| 13 | +#' @param key Column name in your metabolite tables used as the metabolite |
| 14 | +#' identifier (e.g., \code{"refmet_name"}, \code{"HMDB"}). Default: \code{"refmet_name"}. |
| 15 | +#' @param font_size Numeric font size for node labels. Default: 12. |
| 16 | +#' @param node_width Numeric node width. Default: 30. |
| 17 | +#' |
| 18 | +#' @return A \code{networkD3::sankeyNetwork} htmlwidget. |
| 19 | +#' @export |
| 20 | +#' |
| 21 | +#' @importFrom networkD3 sankeyNetwork |
| 22 | +#' @importFrom dplyr bind_rows select mutate filter group_by summarise rename right_join |
| 23 | +#' distinct pull all_of any_of |
| 24 | +#' @importFrom tidyr separate_rows |
| 25 | +#' @importFrom stringr str_trim str_c |
| 26 | +#' @importFrom rlang sym .data |
| 27 | +#' @importFrom stats na.omit |
| 28 | +sankey_plot <- function(hypeR_GEM_obj, |
| 29 | + msets_list, |
| 30 | + gsets_list, |
| 31 | + key = "refmet_name", |
| 32 | + font_size = 12, |
| 33 | + node_width = 30) { |
| 34 | + |
| 35 | + # ---------- basic checks ---------- |
| 36 | + if (!is.list(hypeR_GEM_obj) || |
| 37 | + is.null(hypeR_GEM_obj$mapped_metabolite_signatures) || |
| 38 | + is.null(hypeR_GEM_obj$gene_tables)) { |
| 39 | + stop("`hypeR_GEM_obj` must contain $mapped_metabolite_signatures and $gene_tables.") |
| 40 | + } |
| 41 | + if (!is.list(msets_list) || !length(msets_list)) { |
| 42 | + stop("`msets_list` must be a non-empty list of MSET collections.") |
| 43 | + } |
| 44 | + if (!is.list(gsets_list) || !length(gsets_list)) { |
| 45 | + stop("`gsets_list` must be a non-empty list of GSET collections.") |
| 46 | + } |
| 47 | + if (!is.character(key) || length(key) != 1L) { |
| 48 | + stop("`key` must be a single character column name.") |
| 49 | + } |
| 50 | + |
| 51 | + # remove empty inner elements |
| 52 | + msets_list <- msets_list[lengths(msets_list) > 0L] |
| 53 | + gsets_list <- gsets_list[lengths(gsets_list) > 0L] |
| 54 | + |
| 55 | + # ---------- build edges and tables ---------- |
| 56 | + met2MSET_edge_list <- .met2MSET_links(hypeR_GEM_obj, msets_list, key = key) |
| 57 | + |
| 58 | + met_table <- .create_met_table(hypeR_GEM_obj, msets_list, key = key) |
| 59 | + gene_table <- .create_gene_table(hypeR_GEM_obj, gsets_list) |
| 60 | + msets_table <- .create_msets_table(met_table, key = key) |
| 61 | + gsets_table <- .create_gsets_table(gene_table) |
| 62 | + |
| 63 | + msets2gsets_edge_list <- .msets2gsets_links( |
| 64 | + met_table, msets_table, gene_table, gsets_table, key = key |
| 65 | + ) |
| 66 | + |
| 67 | + links <- rbind(met2MSET_edge_list, msets2gsets_edge_list) |
| 68 | + if (!nrow(links)) stop("No edges found to plot (all overlaps were zero).") |
| 69 | + |
| 70 | + # ---------- node index ---------- |
| 71 | + nodes <- data.frame(name = union(unique(links$source), unique(links$target)), |
| 72 | + stringsAsFactors = FALSE) |
| 73 | + links$source <- match(links$source, nodes$name) - 1L |
| 74 | + links$target <- match(links$target, nodes$name) - 1L |
| 75 | + |
| 76 | + if (any(is.na(links$source)) || any(is.na(links$target)) || |
| 77 | + any(links$source < 0L) || any(links$target < 0L)) { |
| 78 | + stop("Internal error: node indexing failed.") |
| 79 | + } |
| 80 | + |
| 81 | + networkD3::sankeyNetwork( |
| 82 | + Links = links, Nodes = nodes, |
| 83 | + Source = "source", Target = "target", Value = "weight", |
| 84 | + NodeID = "name", fontSize = font_size, nodeWidth = node_width, |
| 85 | + sinksRight = FALSE |
| 86 | + ) |
| 87 | +} |
| 88 | + |
| 89 | +# ----- internals ----------------------------------------------------------- |
| 90 | + |
| 91 | +#' @keywords internal |
| 92 | +.met2MSET_links <- function(hypeR_GEM_obj, msets_list, key = "refmet_name") { |
| 93 | + met_df <- dplyr::bind_rows(hypeR_GEM_obj$mapped_metabolite_signatures, |
| 94 | + .id = "signature_type") |
| 95 | + |
| 96 | + if (!all(c("signature_type", key) %in% names(met_df))) { |
| 97 | + stop("`mapped_metabolite_signatures` must include columns 'signature_type' and '", key, "'.") |
| 98 | + } |
| 99 | + |
| 100 | + # split metabolites by signature_type |
| 101 | + met_groups <- split(met_df[[key]], met_df[["signature_type"]]) |
| 102 | + met_groups <- lapply(met_groups, function(v) unique(stats::na.omit(as.character(v)))) |
| 103 | + |
| 104 | + # flatten one level: list(named vectors) |
| 105 | + msets <- unlist(msets_list, recursive = FALSE, use.names = TRUE) |
| 106 | + msets <- lapply(msets, function(v) unique(stats::na.omit(as.character(v)))) |
| 107 | + if (is.null(names(msets))) names(msets) <- paste0("MSET_", seq_along(msets)) |
| 108 | + |
| 109 | + edge_list <- base::expand.grid( |
| 110 | + source = base::names(met_groups), |
| 111 | + target = base::names(msets), |
| 112 | + stringsAsFactors = FALSE |
| 113 | + ) |
| 114 | + edge_list$weight <- mapply(function(i, j) { |
| 115 | + base::length(base::intersect(met_groups[[i]], msets[[j]])) |
| 116 | + }, edge_list$source, edge_list$target) |
| 117 | + |
| 118 | + edge_list[edge_list$weight > 0L, , drop = FALSE] |
| 119 | +} |
| 120 | + |
| 121 | +#' @keywords internal |
| 122 | +.find_association <- function(id, sets) { |
| 123 | + sets <- lapply(sets, function(x) unlist(x, use.names = FALSE)) |
| 124 | + is_member <- vapply(sets, function(x) id %in% x, logical(1)) |
| 125 | + names(sets)[is_member] |
| 126 | +} |
| 127 | + |
| 128 | +#' @keywords internal |
| 129 | +.find_associated_sets <- function(id, sets_list) { |
| 130 | + out <- unlist(lapply(sets_list, .find_association, id = id), use.names = TRUE) |
| 131 | + paste(out, collapse = ";") |
| 132 | +} |
| 133 | + |
| 134 | +#' @keywords internal |
| 135 | +.create_met_table <- function(hypeR_GEM_obj, msets_list, key = "refmet_name", sep = ";") { |
| 136 | + k <- rlang::sym(key) |
| 137 | + |
| 138 | + # metabolites + associated MSETs (by signature type) |
| 139 | + met_df <- dplyr::bind_rows(hypeR_GEM_obj$mapped_metabolite_signatures, .id = "type") |> |
| 140 | + dplyr::select(!!k, dplyr::all_of("type")) |> |
| 141 | + dplyr::mutate( |
| 142 | + associated_msets = vapply( |
| 143 | + X = .data[[key]], |
| 144 | + FUN = .find_associated_sets, |
| 145 | + FUN.VALUE = character(1), |
| 146 | + sets_list = msets_list |
| 147 | + ) |
| 148 | + ) |
| 149 | + |
| 150 | + gene_tab <- dplyr::bind_rows(hypeR_GEM_obj$gene_tables, .id = "type") |
| 151 | + if (!all(c("symbol", "associated_metabolites") %in% names(gene_tab))) { |
| 152 | + stop("`gene_tables` must contain columns 'symbol' and 'associated_metabolites'.") |
| 153 | + } |
| 154 | + |
| 155 | + met_table <- gene_tab |> |
| 156 | + dplyr::select(dplyr::all_of(c("symbol", "associated_metabolites"))) |> |
| 157 | + tidyr::separate_rows(dplyr::all_of("associated_metabolites"), sep = sep) |> |
| 158 | + dplyr::mutate(associated_metabolites = stringr::str_trim(.data$associated_metabolites)) |> |
| 159 | + dplyr::filter(.data$associated_metabolites != "", !is.na(.data$associated_metabolites)) |> |
| 160 | + dplyr::group_by(metabolite = .data$associated_metabolites) |> |
| 161 | + dplyr::summarise( |
| 162 | + associated_genes = paste(unique(.data$symbol), collapse = sep), |
| 163 | + .groups = "drop" |
| 164 | + ) |> |
| 165 | + dplyr::rename(!!k := rlang::sym("metabolite")) |> |
| 166 | + dplyr::right_join(met_df, by = key) |
| 167 | + |
| 168 | + met_table |
| 169 | +} |
| 170 | + |
| 171 | +#' @keywords internal |
| 172 | +.create_msets_table <- function(met_table, key = "refmet_name") { |
| 173 | + k <- rlang::sym(key) |
| 174 | + |
| 175 | + met_table |> |
| 176 | + dplyr::filter(.data$associated_msets != "") |> |
| 177 | + dplyr::select(!!k, dplyr::all_of("associated_msets")) |> |
| 178 | + tidyr::separate_rows(dplyr::all_of("associated_msets"), sep = ";") |> |
| 179 | + dplyr::filter(nzchar(.data$associated_msets)) |> |
| 180 | + dplyr::group_by(.data$associated_msets) |> |
| 181 | + dplyr::summarise( |
| 182 | + associated_metabolite = stringr::str_c(.data[[key]], collapse = ";"), |
| 183 | + .groups = "drop" |
| 184 | + ) |> |
| 185 | + dplyr::rename(msets = rlang::sym("associated_msets")) |
| 186 | +} |
| 187 | + |
| 188 | +#' @keywords internal |
| 189 | +.create_gene_table <- function(hypeR_GEM_obj, gsets_list) { |
| 190 | + gene_df <- dplyr::bind_rows(hypeR_GEM_obj$gene_tables, .id = "gene_type") |> |
| 191 | + dplyr::select(dplyr::any_of(c("name", "symbol"))) |
| 192 | + |
| 193 | + if (!("symbol" %in% names(gene_df))) { |
| 194 | + stop("`gene_tables` must contain a 'symbol' column.") |
| 195 | + } |
| 196 | + |
| 197 | + gene_df$associated_gsets <- vapply( |
| 198 | + X = gene_df$symbol, |
| 199 | + FUN = .find_associated_sets, |
| 200 | + FUN.VALUE = character(1), |
| 201 | + sets_list = gsets_list |
| 202 | + ) |
| 203 | + gene_df |
| 204 | +} |
| 205 | + |
| 206 | +#' @keywords internal |
| 207 | +.create_gsets_table <- function(gene_table) { |
| 208 | + if (!all(c("symbol", "associated_gsets") %in% names(gene_table))) { |
| 209 | + stop("`gene_table` must contain 'symbol' and 'associated_gsets'.") |
| 210 | + } |
| 211 | + |
| 212 | + gene_table |> |
| 213 | + dplyr::select(dplyr::all_of(c("symbol", "associated_gsets"))) |> |
| 214 | + dplyr::filter(.data$associated_gsets != "") |> |
| 215 | + tidyr::separate_rows(dplyr::all_of("associated_gsets"), sep = ";") |> |
| 216 | + dplyr::group_by(.data$associated_gsets) |> |
| 217 | + dplyr::summarise( |
| 218 | + associated_gene = stringr::str_c(.data$symbol, collapse = ";"), |
| 219 | + .groups = "drop" |
| 220 | + ) |> |
| 221 | + dplyr::rename(gsets = rlang::sym("associated_gsets")) |
| 222 | +} |
| 223 | + |
| 224 | +#' @keywords internal |
| 225 | +.msets2gsets_links <- function(met_table, msets_table, gene_table, gsets_table, key = "refmet_name") { |
| 226 | + edge_list <- data.frame(source = character(), target = character(), |
| 227 | + weight = integer(), stringsAsFactors = FALSE) |
| 228 | + |
| 229 | + if (!all(c("msets", "associated_metabolite") %in% names(msets_table))) return(edge_list) |
| 230 | + if (!all(c(key, "associated_genes") %in% names(met_table))) return(edge_list) |
| 231 | + if (!all(c("symbol", "associated_gsets") %in% names(gene_table))) return(edge_list) |
| 232 | + if (!all(c("gsets", "associated_gene") %in% names(gsets_table))) return(edge_list) |
| 233 | + |
| 234 | + # Precompute: gset -> gene vector |
| 235 | + gset_genes <- lapply(seq_len(nrow(gsets_table)), function(i) { |
| 236 | + gs <- unique(unlist(strsplit(stats::na.omit(gsets_table$associated_gene[i]), |
| 237 | + ";", fixed = TRUE), use.names = FALSE)) |
| 238 | + gs[nzchar(gs)] |
| 239 | + }) |
| 240 | + names(gset_genes) <- as.character(gsets_table$gsets) |
| 241 | + |
| 242 | + for (i in seq_len(nrow(msets_table))) { |
| 243 | + mset_i <- as.character(msets_table$msets[i]) |
| 244 | + assoc_met_str <- as.character(msets_table$associated_metabolite[i]) |
| 245 | + if (!nzchar(mset_i) || !nzchar(assoc_met_str)) next |
| 246 | + |
| 247 | + mets <- unlist(strsplit(assoc_met_str, ";", fixed = TRUE), use.names = FALSE) |
| 248 | + mets <- mets[nzchar(mets)] |
| 249 | + if (!length(mets)) next |
| 250 | + |
| 251 | + # genes from these metabolites |
| 252 | + mt_sub <- met_table[met_table[[key]] %in% mets, , drop = FALSE] |
| 253 | + ag <- unlist(strsplit(stats::na.omit(mt_sub$associated_genes), ";", fixed = TRUE), |
| 254 | + use.names = FALSE) |
| 255 | + m_genes <- unique(ag[nzchar(ag)]) |
| 256 | + if (!length(m_genes)) next |
| 257 | + |
| 258 | + # which gsets these genes belong to (by gene_table) |
| 259 | + gt_sub <- gene_table[gene_table$symbol %in% m_genes, , drop = FALSE] |
| 260 | + gs_flat <- unlist(strsplit(stats::na.omit(gt_sub$associated_gsets), ";", fixed = TRUE), |
| 261 | + use.names = FALSE) |
| 262 | + gsets <- unique(gs_flat[nzchar(gs_flat)]) |
| 263 | + if (!length(gsets)) next |
| 264 | + |
| 265 | + w <- vapply(gsets, function(g) { |
| 266 | + gs_genes <- gset_genes[[g]] |
| 267 | + if (is.null(gs_genes)) 0L else length(intersect(m_genes, gs_genes)) |
| 268 | + }, integer(1)) |
| 269 | + |
| 270 | + keep <- w > 0L |
| 271 | + if (any(keep)) { |
| 272 | + edge_list <- rbind( |
| 273 | + edge_list, |
| 274 | + data.frame(source = rep(mset_i, sum(keep)), |
| 275 | + target = gsets[keep], |
| 276 | + weight = as.integer(w[keep]), |
| 277 | + stringsAsFactors = FALSE) |
| 278 | + ) |
| 279 | + } |
| 280 | + } |
| 281 | + |
| 282 | + dplyr::distinct(edge_list) |
| 283 | +} |
| 284 | + |
| 285 | +# Silence R CMD check for data-masked column names used in verbs |
| 286 | +utils::globalVariables(c( |
| 287 | + "type", "symbol", "associated_metabolites", "metabolite", |
| 288 | + "associated_msets", "associated_gsets", "name" |
| 289 | +)) |
0 commit comments