Skip to content

Commit bf11b6a

Browse files
add sankey_plot
1 parent eab8c13 commit bf11b6a

File tree

6 files changed

+346
-1
lines changed

6 files changed

+346
-1
lines changed

.DS_Store

0 Bytes
Binary file not shown.

DESCRIPTION

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@ Imports:
3838
reactable,
3939
igraph,
4040
visNetwork,
41-
curl
41+
curl,
42+
networkD3,
43+
tidyr
4244
Suggests:
4345
tidyverse,
4446
devtools,

NAMESPACE

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ export(hypGEM_filter)
88
export(hypGEM_to_excel)
99
export(rctbls)
1010
export(refmet_convert)
11+
export(sankey_plot)
1112
export(signature2gene)
1213
export(visNet)
1314
import(methods)
@@ -19,16 +20,22 @@ importFrom(curl,curl_fetch_memory)
1920
importFrom(curl,handle_setform)
2021
importFrom(curl,handle_setopt)
2122
importFrom(curl,new_handle)
23+
importFrom(dplyr,all_of)
24+
importFrom(dplyr,any_of)
2225
importFrom(dplyr,arrange)
26+
importFrom(dplyr,bind_rows)
2327
importFrom(dplyr,case_when)
2428
importFrom(dplyr,distinct)
2529
importFrom(dplyr,filter)
30+
importFrom(dplyr,group_by)
2631
importFrom(dplyr,left_join)
2732
importFrom(dplyr,mutate)
2833
importFrom(dplyr,pull)
2934
importFrom(dplyr,rename)
3035
importFrom(dplyr,rename_with)
36+
importFrom(dplyr,right_join)
3137
importFrom(dplyr,select)
38+
importFrom(dplyr,summarise)
3239
importFrom(ggforce,trans_reverser)
3340
importFrom(ggplot2,aes)
3441
importFrom(ggplot2,coord_flip)
@@ -57,6 +64,7 @@ importFrom(magrittr,"%>%")
5764
importFrom(magrittr,is_greater_than)
5865
importFrom(magrittr,set_colnames)
5966
importFrom(magrittr,set_rownames)
67+
importFrom(networkD3,sankeyNetwork)
6068
importFrom(openxlsx,addWorksheet)
6169
importFrom(openxlsx,createWorkbook)
6270
importFrom(openxlsx,saveWorkbook)
@@ -66,13 +74,18 @@ importFrom(reactable,colDef)
6674
importFrom(reactable,reactable)
6775
importFrom(reshape2,melt)
6876
importFrom(rlang,.data)
77+
importFrom(rlang,sym)
6978
importFrom(scales,log10_trans)
79+
importFrom(stats,na.omit)
7080
importFrom(stats,p.adjust)
7181
importFrom(stats,phyper)
82+
importFrom(stringr,str_c)
7283
importFrom(stringr,str_count)
7384
importFrom(stringr,str_replace)
85+
importFrom(stringr,str_trim)
7486
importFrom(tibble,column_to_rownames)
7587
importFrom(tibble,rownames_to_column)
88+
importFrom(tidyr,separate_rows)
7689
importFrom(visNetwork,visEdges)
7790
importFrom(visNetwork,visGroups)
7891
importFrom(visNetwork,visIgraphLayout)

R/sankey_plot.R

Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
#' Create a 3-layer Sankey diagram (Metabolites → MSETs → GSETs)
2+
#'
3+
#' Builds a Sankey diagram with three layers based on a hypeR-GEM object:
4+
#' metabolites → metabolite sets (MSETs) → gene sets (GSETs).
5+
#'
6+
#' @param hypeR_GEM_obj A list with components \code{$mapped_metabolite_signatures}
7+
#' (a list of data frames) and \code{$gene_tables} (a list of data frames).
8+
#' @param msets_list A list of metabolite-set collections. Each element should be
9+
#' a named list where names are set names and values are character vectors of
10+
#' metabolite identifiers matching \code{key}.
11+
#' @param gsets_list A list of gene-set collections. Each element should be a
12+
#' named list where names are set names and values are character vectors of genes.
13+
#' @param key Column name in your metabolite tables used as the metabolite
14+
#' identifier (e.g., \code{"refmet_name"}, \code{"HMDB"}). Default: \code{"refmet_name"}.
15+
#' @param font_size Numeric font size for node labels. Default: 12.
16+
#' @param node_width Numeric node width. Default: 30.
17+
#'
18+
#' @return A \code{networkD3::sankeyNetwork} htmlwidget.
19+
#' @export
20+
#'
21+
#' @importFrom networkD3 sankeyNetwork
22+
#' @importFrom dplyr bind_rows select mutate filter group_by summarise rename right_join
23+
#' distinct pull all_of any_of
24+
#' @importFrom tidyr separate_rows
25+
#' @importFrom stringr str_trim str_c
26+
#' @importFrom rlang sym .data
27+
#' @importFrom stats na.omit
28+
sankey_plot <- function(hypeR_GEM_obj,
29+
msets_list,
30+
gsets_list,
31+
key = "refmet_name",
32+
font_size = 12,
33+
node_width = 30) {
34+
35+
# ---------- basic checks ----------
36+
if (!is.list(hypeR_GEM_obj) ||
37+
is.null(hypeR_GEM_obj$mapped_metabolite_signatures) ||
38+
is.null(hypeR_GEM_obj$gene_tables)) {
39+
stop("`hypeR_GEM_obj` must contain $mapped_metabolite_signatures and $gene_tables.")
40+
}
41+
if (!is.list(msets_list) || !length(msets_list)) {
42+
stop("`msets_list` must be a non-empty list of MSET collections.")
43+
}
44+
if (!is.list(gsets_list) || !length(gsets_list)) {
45+
stop("`gsets_list` must be a non-empty list of GSET collections.")
46+
}
47+
if (!is.character(key) || length(key) != 1L) {
48+
stop("`key` must be a single character column name.")
49+
}
50+
51+
# remove empty inner elements
52+
msets_list <- msets_list[lengths(msets_list) > 0L]
53+
gsets_list <- gsets_list[lengths(gsets_list) > 0L]
54+
55+
# ---------- build edges and tables ----------
56+
met2MSET_edge_list <- .met2MSET_links(hypeR_GEM_obj, msets_list, key = key)
57+
58+
met_table <- .create_met_table(hypeR_GEM_obj, msets_list, key = key)
59+
gene_table <- .create_gene_table(hypeR_GEM_obj, gsets_list)
60+
msets_table <- .create_msets_table(met_table, key = key)
61+
gsets_table <- .create_gsets_table(gene_table)
62+
63+
msets2gsets_edge_list <- .msets2gsets_links(
64+
met_table, msets_table, gene_table, gsets_table, key = key
65+
)
66+
67+
links <- rbind(met2MSET_edge_list, msets2gsets_edge_list)
68+
if (!nrow(links)) stop("No edges found to plot (all overlaps were zero).")
69+
70+
# ---------- node index ----------
71+
nodes <- data.frame(name = union(unique(links$source), unique(links$target)),
72+
stringsAsFactors = FALSE)
73+
links$source <- match(links$source, nodes$name) - 1L
74+
links$target <- match(links$target, nodes$name) - 1L
75+
76+
if (any(is.na(links$source)) || any(is.na(links$target)) ||
77+
any(links$source < 0L) || any(links$target < 0L)) {
78+
stop("Internal error: node indexing failed.")
79+
}
80+
81+
networkD3::sankeyNetwork(
82+
Links = links, Nodes = nodes,
83+
Source = "source", Target = "target", Value = "weight",
84+
NodeID = "name", fontSize = font_size, nodeWidth = node_width,
85+
sinksRight = FALSE
86+
)
87+
}
88+
89+
# ----- internals -----------------------------------------------------------
90+
91+
#' @keywords internal
92+
.met2MSET_links <- function(hypeR_GEM_obj, msets_list, key = "refmet_name") {
93+
met_df <- dplyr::bind_rows(hypeR_GEM_obj$mapped_metabolite_signatures,
94+
.id = "signature_type")
95+
96+
if (!all(c("signature_type", key) %in% names(met_df))) {
97+
stop("`mapped_metabolite_signatures` must include columns 'signature_type' and '", key, "'.")
98+
}
99+
100+
# split metabolites by signature_type
101+
met_groups <- split(met_df[[key]], met_df[["signature_type"]])
102+
met_groups <- lapply(met_groups, function(v) unique(stats::na.omit(as.character(v))))
103+
104+
# flatten one level: list(named vectors)
105+
msets <- unlist(msets_list, recursive = FALSE, use.names = TRUE)
106+
msets <- lapply(msets, function(v) unique(stats::na.omit(as.character(v))))
107+
if (is.null(names(msets))) names(msets) <- paste0("MSET_", seq_along(msets))
108+
109+
edge_list <- base::expand.grid(
110+
source = base::names(met_groups),
111+
target = base::names(msets),
112+
stringsAsFactors = FALSE
113+
)
114+
edge_list$weight <- mapply(function(i, j) {
115+
base::length(base::intersect(met_groups[[i]], msets[[j]]))
116+
}, edge_list$source, edge_list$target)
117+
118+
edge_list[edge_list$weight > 0L, , drop = FALSE]
119+
}
120+
121+
#' @keywords internal
122+
.find_association <- function(id, sets) {
123+
sets <- lapply(sets, function(x) unlist(x, use.names = FALSE))
124+
is_member <- vapply(sets, function(x) id %in% x, logical(1))
125+
names(sets)[is_member]
126+
}
127+
128+
#' @keywords internal
129+
.find_associated_sets <- function(id, sets_list) {
130+
out <- unlist(lapply(sets_list, .find_association, id = id), use.names = TRUE)
131+
paste(out, collapse = ";")
132+
}
133+
134+
#' @keywords internal
135+
.create_met_table <- function(hypeR_GEM_obj, msets_list, key = "refmet_name", sep = ";") {
136+
k <- rlang::sym(key)
137+
138+
# metabolites + associated MSETs (by signature type)
139+
met_df <- dplyr::bind_rows(hypeR_GEM_obj$mapped_metabolite_signatures, .id = "type") |>
140+
dplyr::select(!!k, dplyr::all_of("type")) |>
141+
dplyr::mutate(
142+
associated_msets = vapply(
143+
X = .data[[key]],
144+
FUN = .find_associated_sets,
145+
FUN.VALUE = character(1),
146+
sets_list = msets_list
147+
)
148+
)
149+
150+
gene_tab <- dplyr::bind_rows(hypeR_GEM_obj$gene_tables, .id = "type")
151+
if (!all(c("symbol", "associated_metabolites") %in% names(gene_tab))) {
152+
stop("`gene_tables` must contain columns 'symbol' and 'associated_metabolites'.")
153+
}
154+
155+
met_table <- gene_tab |>
156+
dplyr::select(dplyr::all_of(c("symbol", "associated_metabolites"))) |>
157+
tidyr::separate_rows(dplyr::all_of("associated_metabolites"), sep = sep) |>
158+
dplyr::mutate(associated_metabolites = stringr::str_trim(.data$associated_metabolites)) |>
159+
dplyr::filter(.data$associated_metabolites != "", !is.na(.data$associated_metabolites)) |>
160+
dplyr::group_by(metabolite = .data$associated_metabolites) |>
161+
dplyr::summarise(
162+
associated_genes = paste(unique(.data$symbol), collapse = sep),
163+
.groups = "drop"
164+
) |>
165+
dplyr::rename(!!k := rlang::sym("metabolite")) |>
166+
dplyr::right_join(met_df, by = key)
167+
168+
met_table
169+
}
170+
171+
#' @keywords internal
172+
.create_msets_table <- function(met_table, key = "refmet_name") {
173+
k <- rlang::sym(key)
174+
175+
met_table |>
176+
dplyr::filter(.data$associated_msets != "") |>
177+
dplyr::select(!!k, dplyr::all_of("associated_msets")) |>
178+
tidyr::separate_rows(dplyr::all_of("associated_msets"), sep = ";") |>
179+
dplyr::filter(nzchar(.data$associated_msets)) |>
180+
dplyr::group_by(.data$associated_msets) |>
181+
dplyr::summarise(
182+
associated_metabolite = stringr::str_c(.data[[key]], collapse = ";"),
183+
.groups = "drop"
184+
) |>
185+
dplyr::rename(msets = rlang::sym("associated_msets"))
186+
}
187+
188+
#' @keywords internal
189+
.create_gene_table <- function(hypeR_GEM_obj, gsets_list) {
190+
gene_df <- dplyr::bind_rows(hypeR_GEM_obj$gene_tables, .id = "gene_type") |>
191+
dplyr::select(dplyr::any_of(c("name", "symbol")))
192+
193+
if (!("symbol" %in% names(gene_df))) {
194+
stop("`gene_tables` must contain a 'symbol' column.")
195+
}
196+
197+
gene_df$associated_gsets <- vapply(
198+
X = gene_df$symbol,
199+
FUN = .find_associated_sets,
200+
FUN.VALUE = character(1),
201+
sets_list = gsets_list
202+
)
203+
gene_df
204+
}
205+
206+
#' @keywords internal
207+
.create_gsets_table <- function(gene_table) {
208+
if (!all(c("symbol", "associated_gsets") %in% names(gene_table))) {
209+
stop("`gene_table` must contain 'symbol' and 'associated_gsets'.")
210+
}
211+
212+
gene_table |>
213+
dplyr::select(dplyr::all_of(c("symbol", "associated_gsets"))) |>
214+
dplyr::filter(.data$associated_gsets != "") |>
215+
tidyr::separate_rows(dplyr::all_of("associated_gsets"), sep = ";") |>
216+
dplyr::group_by(.data$associated_gsets) |>
217+
dplyr::summarise(
218+
associated_gene = stringr::str_c(.data$symbol, collapse = ";"),
219+
.groups = "drop"
220+
) |>
221+
dplyr::rename(gsets = rlang::sym("associated_gsets"))
222+
}
223+
224+
#' @keywords internal
225+
.msets2gsets_links <- function(met_table, msets_table, gene_table, gsets_table, key = "refmet_name") {
226+
edge_list <- data.frame(source = character(), target = character(),
227+
weight = integer(), stringsAsFactors = FALSE)
228+
229+
if (!all(c("msets", "associated_metabolite") %in% names(msets_table))) return(edge_list)
230+
if (!all(c(key, "associated_genes") %in% names(met_table))) return(edge_list)
231+
if (!all(c("symbol", "associated_gsets") %in% names(gene_table))) return(edge_list)
232+
if (!all(c("gsets", "associated_gene") %in% names(gsets_table))) return(edge_list)
233+
234+
# Precompute: gset -> gene vector
235+
gset_genes <- lapply(seq_len(nrow(gsets_table)), function(i) {
236+
gs <- unique(unlist(strsplit(stats::na.omit(gsets_table$associated_gene[i]),
237+
";", fixed = TRUE), use.names = FALSE))
238+
gs[nzchar(gs)]
239+
})
240+
names(gset_genes) <- as.character(gsets_table$gsets)
241+
242+
for (i in seq_len(nrow(msets_table))) {
243+
mset_i <- as.character(msets_table$msets[i])
244+
assoc_met_str <- as.character(msets_table$associated_metabolite[i])
245+
if (!nzchar(mset_i) || !nzchar(assoc_met_str)) next
246+
247+
mets <- unlist(strsplit(assoc_met_str, ";", fixed = TRUE), use.names = FALSE)
248+
mets <- mets[nzchar(mets)]
249+
if (!length(mets)) next
250+
251+
# genes from these metabolites
252+
mt_sub <- met_table[met_table[[key]] %in% mets, , drop = FALSE]
253+
ag <- unlist(strsplit(stats::na.omit(mt_sub$associated_genes), ";", fixed = TRUE),
254+
use.names = FALSE)
255+
m_genes <- unique(ag[nzchar(ag)])
256+
if (!length(m_genes)) next
257+
258+
# which gsets these genes belong to (by gene_table)
259+
gt_sub <- gene_table[gene_table$symbol %in% m_genes, , drop = FALSE]
260+
gs_flat <- unlist(strsplit(stats::na.omit(gt_sub$associated_gsets), ";", fixed = TRUE),
261+
use.names = FALSE)
262+
gsets <- unique(gs_flat[nzchar(gs_flat)])
263+
if (!length(gsets)) next
264+
265+
w <- vapply(gsets, function(g) {
266+
gs_genes <- gset_genes[[g]]
267+
if (is.null(gs_genes)) 0L else length(intersect(m_genes, gs_genes))
268+
}, integer(1))
269+
270+
keep <- w > 0L
271+
if (any(keep)) {
272+
edge_list <- rbind(
273+
edge_list,
274+
data.frame(source = rep(mset_i, sum(keep)),
275+
target = gsets[keep],
276+
weight = as.integer(w[keep]),
277+
stringsAsFactors = FALSE)
278+
)
279+
}
280+
}
281+
282+
dplyr::distinct(edge_list)
283+
}
284+
285+
# Silence R CMD check for data-masked column names used in verbs
286+
utils::globalVariables(c(
287+
"type", "symbol", "associated_metabolites", "metabolite",
288+
"associated_msets", "associated_gsets", "name"
289+
))

_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ reference:
3535
- '`rctbls`'
3636
- '`visNet`'
3737
- '`ggempty`'
38+
- '`sankey_plot`'
3839

3940
- title: Data Objects
4041
desc: ~

0 commit comments

Comments
 (0)