From 1ddb5f05ba86b0d082ba0d6d4eae09a5371bef20 Mon Sep 17 00:00:00 2001 From: bednarsky <72978444+bednarsky@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:05:58 +0200 Subject: [PATCH] Add summary plot for specific terms. Remove summary heatmaps. Update docs #34 #35 #32 --- README.md | 9 +- workflow/Snakefile | 18 +- .../report/summary_plot_specificTerms.rst | 1 + ...ary_plot.rst => summary_plot_topTerms.rst} | 0 workflow/rules/aggregate.smk | 18 +- workflow/scripts/aggregate.py | 16 +- workflow/scripts/overview_plot.R | 344 +++++++++--------- 7 files changed, 207 insertions(+), 199 deletions(-) create mode 100644 workflow/report/summary_plot_specificTerms.rst rename workflow/report/{summary_plot.rst => summary_plot_topTerms.rst} (100%) diff --git a/README.md b/README.md index 60122eb..3b99e9d 100644 --- a/README.md +++ b/README.md @@ -120,8 +120,9 @@ The five tools LOLA, GREAT, pycisTarget, RcisTarget and GSEApy (over-representat - effect-size is presented by the x-axis position - overlap is presented by the dot size - group summary/overview - - the union of the top `{top_terms_n}` most significant terms per query, method, and database within a group is determined. - - their effect-size (effect) and statistical significance (adjp) are visualized as hierarchically clustered heatmaps, with statistical significance denoted by `\*` (PDF). + - two plots: + - top terms: the union of the top `{top_terms_n}` most significant terms per query, method, and database within a group is determined. + - specific terms: the union of statistically significant terms with the lowest average significance across all other groups is determined. This plot is empty if no statistically significant terms are found. - a hierarchically clustered bubble plot encoding both effect-size (color) and significance (size) is provided, with statistical significance denoted by `\*` (PNG). - all summary visualizations are configured to cap the values (`{adjp_cap}`/`{or_cap}`/`{nes_cap}`) to avoid shifts in the coloring scheme caused by outliers. - **results** (`{result_path}/enrichment_analysis`) @@ -131,9 +132,7 @@ The five tools LOLA, GREAT, pycisTarget, RcisTarget and GSEApy (over-representat - enrichment dot plot (PNG): `{query}\_{database}.{png}` - `{group}/{method}/{database}/` containing - aggregated result table (CSV): `{group}\_{database}\_all.csv` - - filtered aggregated result table (CSV): `{group}\_{database}\_sig.csv` - - hierarchically clustered heatmaps visualizing statistical significance and effect-sizes of the top `{top_terms_n}` terms (PDF): `{group}\_{database}\_{adjp|effect}\_heatmap.pdf` - - hierarchically clustered bubble plot visualizing statistical significance and effect-sizes simultaneously (PNG): `{group}\_{database}\_summary.{png}` + - hierarchically clustered bubble plot visualizing statistical significance and effect-sizes simultaneously (PNG): `{group}\_{database}\_summary_{topTerms|specificTerms}.{png}`. In case of only one query gene/region set, this plot is empty. Note: - Despite usage of the correct parameter, **rGREAT** was not using the provided cores during testing. Nevertheless, it is still provided as parameter. diff --git a/workflow/Snakefile b/workflow/Snakefile index e902ea8..5a970e0 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -92,12 +92,18 @@ rule all: expand(os.path.join(result_path, '{gene_set}', 'preranked_GSEApy','{db}','{gene_set}_{db}.csv'), gene_set=rnk_dict.keys(), db=database_dict.keys()), expand(os.path.join(result_path, '{gene_set}', 'preranked_GSEApy','{db}','{gene_set}_{db}.png'), gene_set=rnk_dict.keys(), db=database_dict.keys()), # summaries - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='ORA_GSEApy', db=database_dict.keys()), - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=rnk["group"].unique(), tool='preranked_GSEApy', db=database_dict.keys()), - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()), - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()), - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()), - expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='ORA_GSEApy', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='ORA_GSEApy', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=rnk["group"].unique(), tool='preranked_GSEApy', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=rnk["group"].unique(), tool='preranked_GSEApy', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=regions["group"].unique(), tool='GREAT', db=database_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=regions["group"].unique(), tool='LOLA', db=lola_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=regions["group"].unique(), tool='pycisTarget', db=pycistarget_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()), + expand(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'),group=list(set(genes["group"].tolist()+regions["group"].tolist())), tool='RcisTarget', db=rcistarget_db_dict.keys()), # config envs = expand(os.path.join(result_path,'envs','{env}.yaml'),env=['region_enrichment_analysis','gene_enrichment_analysis','visualization','pycisTarget','RcisTarget']), configs = os.path.join(result_path,'configs','{}_config.yaml'.format(config["project_name"])), diff --git a/workflow/report/summary_plot_specificTerms.rst b/workflow/report/summary_plot_specificTerms.rst new file mode 100644 index 0000000..2bca841 --- /dev/null +++ b/workflow/report/summary_plot_specificTerms.rst @@ -0,0 +1 @@ +Summary of the most specific enrichment analysis results of group {{snakemake.wildcards["group"]}} in database {{snakemake.wildcards["db"]}} using {{snakemake.wildcards["tool"]}}. \ No newline at end of file diff --git a/workflow/report/summary_plot.rst b/workflow/report/summary_plot_topTerms.rst similarity index 100% rename from workflow/report/summary_plot.rst rename to workflow/report/summary_plot_topTerms.rst diff --git a/workflow/rules/aggregate.smk b/workflow/rules/aggregate.smk index 338f0e8..11d84b3 100644 --- a/workflow/rules/aggregate.smk +++ b/workflow/rules/aggregate.smk @@ -5,7 +5,6 @@ rule aggregate: enrichment_results = get_group_paths, output: results_all = os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_all.csv'), - results_sig = os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_sig.csv'), threads: config.get("threads", 1) resources: mem_mb=config.get("mem", "16000"), @@ -21,8 +20,19 @@ rule visualize: input: results_all = os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_all.csv'), output: - summary_plot = report(os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary.png'), - caption="../report/summary_plot.rst", + summary_plot_topTerms = report( + os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_topTerms.png'), + caption="../report/summary_plot_topTerms.rst", + category="{}_{}".format(config["project_name"], module_name), + subcategory="{group}", + labels={ + "name": "{tool}", + "type": "summary plot", + "misc": "{db}", + }), + summary_plot_specificTerms = report( + os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_summary_specificTerms.png'), + caption="../report/summary_plot_specificTerms.rst", category="{}_{}".format(config["project_name"], module_name), subcategory="{group}", labels={ @@ -30,8 +40,6 @@ rule visualize: "type": "summary plot", "misc": "{db}", }), - adjp_hm = os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_adjp_heatmap.pdf'), - effect_hm = os.path.join(result_path,'{group}','{tool}','{db}','{group}_{db}_effect_heatmap.pdf'), params: utils_path = workflow.source_path("../scripts/utils.R") threads: config.get("threads", 1) diff --git a/workflow/scripts/aggregate.py b/workflow/scripts/aggregate.py index f995965..c9ab4f5 100644 --- a/workflow/scripts/aggregate.py +++ b/workflow/scripts/aggregate.py @@ -12,7 +12,6 @@ # output results_all_path = snakemake.output['results_all'] -results_sig_path = snakemake.output['results_sig'] # parameters group = snakemake.wildcards["group"] @@ -41,23 +40,10 @@ # move on if results are empty if len(results_list)==0: open(results_all_path, mode='a').close() - open(results_sig_path, mode='a').close() sys.exit(0) # concatenate all results into one results dataframe result_df = pd.concat(results_list, axis=0) # save all enirchment results -result_df.to_csv(results_all_path) - -# find union of statistically significant terms -if tool=="pycisTarget" or tool=="RcisTarget": - sig_terms = result_df.loc[result_df[adjp_col] >= adjp_th, term_col].unique() -else: - sig_terms = result_df.loc[result_df[adjp_col] <= adjp_th, term_col].unique() - -# filter by significant terms -result_sig_df = result_df.loc[result_df[term_col].isin(sig_terms), :] - -# save filtered enirchment results by significance -result_sig_df.to_csv(results_sig_path) +result_df.to_csv(results_all_path) \ No newline at end of file diff --git a/workflow/scripts/overview_plot.R b/workflow/scripts/overview_plot.R index b7ee44a..39ff79d 100644 --- a/workflow/scripts/overview_plot.R +++ b/workflow/scripts/overview_plot.R @@ -17,9 +17,8 @@ source(snakemake@params[["utils_path"]]) results_all_path <- snakemake@input[["results_all"]] # output -plot_path <- snakemake@output[["summary_plot"]] -adjp_hm_path <- snakemake@output[["adjp_hm"]] -effect_hm_path <- snakemake@output[["effect_hm"]] +plot_path_topTerms <- snakemake@output[["summary_plot_topTerms"]] +plot_path_specificTerms <- snakemake@output[["summary_plot_specificTerms"]] # parameters tool <- snakemake@wildcards[["tool"]] @@ -38,9 +37,8 @@ cluster_flag <- as.logical(as.numeric(snakemake@config[["cluster_summary"]])) # stop early if results are empty if(file.size(results_all_path) == 0L){ - file.create(plot_path) - file.create(adjp_hm_path) - file.create(effect_hm_path) + file.create(plot_path_topTerms) + file.create(plot_path_specificTerms) quit(save = "no", status = 0) } @@ -49,187 +47,197 @@ results_all <- data.frame(fread(file.path(results_all_path), header=TRUE)) # stop early if results consist of only one query if(length(unique(results_all$name))==1){ - file.create(plot_path) - file.create(adjp_hm_path) - file.create(effect_hm_path) + file.create(plot_path_topTerms) + file.create(plot_path_specificTerms) quit(save = "no", status = 0) } # remove empty terms results_all <- results_all[results_all[[term_col]]!="",] -# determine top_n most significant terms (not necessarily statistically significant!) -top_terms <- c() -for (query in unique(results_all$name)){ - tmp_result <- results_all[results_all$name==query,] - top_n_tmp <- min(top_n, nrow(tmp_result)) - +make_summary_plot <- function(type = "topTerms"){ + # determine terms to plot + top_terms <- c() + if (type == "topTerms"){ + # pick top_n most significant terms per query (not necessarily statistically significant!) + for (query in unique(results_all$name)){ + tmp_result <- results_all[results_all$name==query,] + top_n_tmp <- min(top_n, nrow(tmp_result)) + if(tool=="pycisTarget" | tool=="RcisTarget"){ + tmp_terms <- tmp_result[order(-tmp_result[[adjp_col]]), term_col][1:top_n_tmp] + }else{ + tmp_terms <- tmp_result[order(tmp_result[[adjp_col]]), term_col][1:top_n_tmp] + } + top_terms <- unique(c(top_terms,tmp_terms)) + } + } else if (type == "specificTerms"){ + # for each query, take significant terms that have the lowest average -log10(adj-p) across all other queries + all_queries <- unique(results_all$name) + for (query in all_queries){ + tmp_result <- results_all[results_all$name==query,] + if(tool=="pycisTarget" | tool=="RcisTarget"){ + sig_mask <- tmp_result[[adjp_col]] >= adjp_th + # invert, to get the ranking right + tmp_result[[adjp_col]] <- -tmp_result[[adjp_col]] + } else { + # significant in current query + sig_mask <- tmp_result[[adjp_col]] <= adjp_th + # log10 transform p-values to avoid influence of small p-values + tmp_result[[adjp_col]] <- -log10(tmp_result[[adjp_col]]) + } + sig_terms <- tmp_result[sig_mask, term_col] + if (length(sig_terms) == 0){ + next + } + other_queries <- setdiff(all_queries, query) + # compute average across other queries + avg_other <- sapply(sig_terms, function(term){ + vals <- sapply(other_queries, function(q){ + v <- results_all[results_all$name==q & results_all[[term_col]]==term, adjp_col] + if (length(v) == 0) return(NA) else return(v[1]) + }) + if (tool=="pycisTarget" | tool=="RcisTarget"){ + vals[is.na(vals)] <- 0 + mean(vals) + } else { + vals[is.na(vals)] <- 1 + mean(-log10(vals)) + } + }) + ordered_terms <- names(sort(avg_other, decreasing = FALSE)) + top_n_tmp <- min(top_n, length(ordered_terms)) + tmp_terms <- ordered_terms[1:top_n_tmp] + top_terms <- unique(c(top_terms, tmp_terms)) + } + if (length(top_terms) == 0){ + file.create(plot_path_specificTerms) + return() + } + } else { + stop("Unknown type: ", type) + } + + # make adjusted p-vale and effect-size (odds-ratio or normalized enrichment scores) dataframes + adjp_df <- dcast(results_all, as.formula(paste(term_col, "~ name")), value.var = adjp_col) + rownames(adjp_df) <- adjp_df[[term_col]] + adjp_df[[term_col]] <- NULL + adjp_df <- adjp_df[,unique(results_all$name)] + + effect_df <- dcast(results_all, as.formula(paste(term_col, "~ name")), value.var = effect_col) + rownames(effect_df) <- effect_df[[term_col]] + effect_df[[term_col]] <- NULL + effect_df <- effect_df[,unique(results_all$name)] + + # filter by selected terms + adjp_df <- adjp_df[top_terms,] + effect_df <- effect_df[top_terms,] + + # fill NA for effect_df with 1 or 0 (i.e., neutral enrichment) and for adjp_df with 1 (i.e., no significance) + effect_df[is.na(effect_df)] <- if (tool=="preranked_GSEApy" | tool=="pycisTarget" | tool=="RcisTarget") 0 else 1 + adjp_df[is.na(adjp_df)] <- if (tool=="pycisTarget" | tool=="RcisTarget") 0 else 1 + + # make stat. sign. annotation for effect-size plot later + adjp_annot <- adjp_df if(tool=="pycisTarget" | tool=="RcisTarget"){ - tmp_terms <- tmp_result[order(-tmp_result[[adjp_col]]), term_col][1:top_n_tmp] + adjp_annot[adjp_df >= adjp_th] <- "*" + adjp_annot[adjp_df < adjp_th] <- "" }else{ - tmp_terms <- tmp_result[order(tmp_result[[adjp_col]]), term_col][1:top_n_tmp] + adjp_annot[adjp_df <= adjp_th] <- "*" + adjp_annot[adjp_df > adjp_th] <- "" } - - top_terms <- unique(c(top_terms,tmp_terms)) -} -# make adjusted p-vale and effect-size (odds-ratio or normalized enrichment scores) dataframes -adjp_df <- dcast(results_all, as.formula(paste(term_col, "~ name")), value.var = adjp_col) -rownames(adjp_df) <- adjp_df[[term_col]] -adjp_df[[term_col]] <- NULL -adjp_df <- adjp_df[,unique(results_all$name)] - -effect_df <- dcast(results_all, as.formula(paste(term_col, "~ name")), value.var = effect_col) -rownames(effect_df) <- effect_df[[term_col]] -effect_df[[term_col]] <- NULL -effect_df <- effect_df[,unique(results_all$name)] - -# filter by top terms -adjp_df <- adjp_df[top_terms,] -effect_df <- effect_df[top_terms,] - -# fill NA for effect_df with 1 or 0 (i.e., neutral enrichment) and for adjp_df with 1 (i.e., no significance) -effect_df[is.na(effect_df)] <- if (tool=="preranked_GSEApy" | tool=="pycisTarget" | tool=="RcisTarget") 0 else 1 -adjp_df[is.na(adjp_df)] <- if (tool=="pycisTarget" | tool=="RcisTarget") 0 else 1 - -# make stat. sign. annotation for effect-size plot later -adjp_annot <- adjp_df -if(tool=="pycisTarget" | tool=="RcisTarget"){ - adjp_annot[adjp_df >= adjp_th] <- "*" - adjp_annot[adjp_df < adjp_th] <- "" -}else{ - adjp_annot[adjp_df <= adjp_th] <- "*" - adjp_annot[adjp_df > adjp_th] <- "" -} + # log2 transform odds ratios + if (tool!="preranked_GSEApy" & tool!="pycisTarget" & tool!="RcisTarget"){ + effect_df <- log2(effect_df) + } -# log2 transform odds ratios -if (tool!="preranked_GSEApy" & tool!="pycisTarget" & tool!="RcisTarget"){ - effect_df <- log2(effect_df) -} + # transform and cap adjp and effect size + if(tool!="pycisTarget" & tool!="RcisTarget"){ + # cap effect_df for plotting depending on tool abs(log2(or)) < or_cap OR abs(NES) < nes_cap + effect_df[effect_df > effect_cap] <- effect_cap + effect_df[effect_df < -effect_cap] <- -effect_cap -# transform and cap adjp and effect size -if(tool!="pycisTarget" & tool!="RcisTarget"){ - # cap effect_df for plotting depending on tool abs(log2(or)) < or_cap OR abs(NES) < nes_cap - effect_df[effect_df > effect_cap] <- effect_cap - effect_df[effect_df < -effect_cap] <- -effect_cap + # log10 transform adjp & cap -log10(adjpvalue) < adjp_cap + adjp_df <- -log10(adjp_df) + adjp_df[adjp_df > adjp_cap] <- adjp_cap + } - # log10 transform adjp & cap -log10(adjpvalue) < adjp_cap - adjp_df <- -log10(adjp_df) - adjp_df[adjp_df > adjp_cap] <- adjp_cap -} - -# plot hierarchically clustered heatmap for adjp and effect -width_hm <- 0.2 * dim(adjp_df)[2] + 5 -height_hm <- 0.2 * dim(adjp_df)[1] + 3 - -pheatmap(adjp_df, - display_numbers=adjp_annot, - main= if (tool=="pycisTarget" | tool=="RcisTarget") adjp_col else"-log10(adj. p-values)", - treeheight_row = 10, - treeheight_col = 10, - fontsize = 6, - fontsize_number = 10, - cluster_rows = ifelse(nrow(adjp_df)>1, TRUE, FALSE), - cluster_cols = ifelse(ncol(adjp_df)>1, cluster_flag, FALSE), - silent=TRUE, - width=width_hm, - height=height_hm, - angle_col=45, - cellwidth=10, - cellheight=10, - filename=adjp_hm_path, - breaks= if (max(adjp_df)==0) seq(0, 1, length.out=200) else seq(0, max(adjp_df), length.out=200), - color=colorRampPalette(c("white", "red"))(200) - ) - -pheatmap(effect_df, - display_numbers=adjp_annot, - main = if (tool=="preranked_GSEApy" | tool=="pycisTarget" | tool=="RcisTarget") effect_col else paste0("log2(",effect_col,")"), - treeheight_row = 10, - treeheight_col = 10, - fontsize = 6, - fontsize_number = 10, - cluster_rows = ifelse(nrow(effect_df)>1, TRUE, FALSE), - cluster_cols = ifelse(ncol(effect_df)>1, cluster_flag, FALSE), - silent=TRUE, - width=width_hm, - height=height_hm, - angle_col=45, - cellwidth=10, - cellheight=10, - filename=effect_hm_path, - breaks=seq(-max(abs(effect_df)), max(abs(effect_df)), length.out=200), - color=colorRampPalette(c("blue", "white", "red"))(200) - ) - -# perform hierarchical clustering on the effect-sizes (NES or log2 odds ratios) of the terms and reorder dataframe -if (nrow(effect_df)>1){ - hc_rows <- hclust(dist(effect_df)) - hc_row_names <- rownames(effect_df)[hc_rows$order] - effect_df <- effect_df[hc_rows$order,] -}else{ - hc_row_names <- rownames(effect_df) -} + # perform hierarchical clustering on the effect-sizes (NES or log2 odds ratios) of the terms and reorder dataframe + if (nrow(effect_df)>1){ + hc_rows <- hclust(dist(effect_df)) + hc_row_names <- rownames(effect_df)[hc_rows$order] + effect_df <- effect_df[hc_rows$order,] + }else{ + hc_row_names <- rownames(effect_df) + } -if (ncol(effect_df)>1 & cluster_flag){ - hc_cols <- hclust(dist(t(effect_df))) - hc_col_names <- colnames(effect_df)[hc_cols$order] - effect_df <- effect_df[, hc_cols$order] -} else{ - hc_col_names <- colnames(effect_df) -} + if (ncol(effect_df)>1 & cluster_flag){ + hc_cols <- hclust(dist(t(effect_df))) + hc_col_names <- colnames(effect_df)[hc_cols$order] + effect_df <- effect_df[, hc_cols$order] + } else{ + hc_col_names <- colnames(effect_df) + } -# add a column for the terms -effect_df$terms <- rownames(effect_df) -# melt data frame for plotting -plot_df <- melt(data=effect_df, - id.vars="terms", - measure.vars=colnames(adjp_df), - variable.name = "feature_set", - value.name = "effect") - -# add adjusted p-values to plot dataframe -plot_df$adjp <- apply(plot_df, 1, function(x) adjp_df[x['terms'], x['feature_set']]) - -# set effect-size and adjusted p-value conditional to NA (odds ratios == 0 and NES == 0) for plotting -plot_df$effect[plot_df$effect==0] <- NA -plot_df$adjp[plot_df$adjp==0] <- NA - -# ensure that the order of terms and feature sets is kept -plot_df$terms <- factor(plot_df$terms,levels=hc_row_names) -if (cluster_flag){ - plot_df$feature_set <- factor(plot_df$feature_set, levels=hc_col_names) -} + # add a column for the terms + effect_df$terms <- rownames(effect_df) + # melt data frame for plotting + plot_df <- melt(data=effect_df, + id.vars="terms", + measure.vars=colnames(adjp_df), + variable.name = "feature_set", + value.name = "effect") + + # add adjusted p-values to plot dataframe + plot_df$adjp <- apply(plot_df, 1, function(x) adjp_df[x['terms'], x['feature_set']]) + + # set effect-size and adjusted p-value conditional to NA (odds ratios == 0 and NES == 0) for plotting + plot_df$effect[plot_df$effect==0] <- NA + plot_df$adjp[plot_df$adjp==0] <- NA + + # ensure that the order of terms and feature sets is kept + plot_df$terms <- factor(plot_df$terms,levels=hc_row_names) + if (cluster_flag){ + plot_df$feature_set <- factor(plot_df$feature_set, levels=hc_col_names) + } -# stat. significance star df -if(tool=="pycisTarget" | tool=="RcisTarget"){ - adjp_star_df <- plot_df[(!is.na(plot_df$adjp)) & (plot_df$adjp >= adjp_th),] -}else{ - adjp_star_df <- plot_df[(!is.na(plot_df$adjp)) & (plot_df$adjp >= -log10(adjp_th)),] -} + # stat. significance star df + if(tool=="pycisTarget" | tool=="RcisTarget"){ + adjp_star_df <- plot_df[(!is.na(plot_df$adjp)) & (plot_df$adjp >= adjp_th),] + }else{ + adjp_star_df <- plot_df[(!is.na(plot_df$adjp)) & (plot_df$adjp >= -log10(adjp_th)),] + } -# plot -enr_plot <- ggplot(plot_df, aes(x=feature_set, y=terms, fill=effect, size=adjp))+ - geom_point(shape=21, stroke=0.25) + - geom_point(data = adjp_star_df, aes(x=feature_set, y=terms), shape=8, size=0.5, color = "black", alpha = 0.5) + # stars for statistical significance - scale_fill_gradient2(midpoint=0, low="royalblue4", mid="white", high="firebrick2", space ="Lab", name = if (tool=="preranked_GSEApy" | tool=="pycisTarget" | tool=="RcisTarget") effect_col else paste0("log2(",effect_col,")")) + - scale_y_discrete(label=addline_format) + - scale_size_continuous(range = c(1,5), name = if (tool=="pycisTarget" | tool=="RcisTarget") adjp_col else paste("-log10(",adjp_col,")")) + - ggtitle(paste(tool, database, group, sep='\n')) + - clean_theme() + - theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust=1), - axis.title.x=element_blank(), - axis.title.y=element_blank()) + guides(size = guide_legend(reverse=TRUE)) - -width <- 0.15 * dim(adjp_df)[2] + 3 -height <- 0.2 * dim(adjp_df)[1] + 2 -# options(repr.plot.width=width, repr.plot.height=height) -# enr_plot - -# save plot -ggsave_new(filename = paste0(group,'_',database,'_summary'), + # plot + enr_plot <- ggplot(plot_df, aes(x=feature_set, y=terms, fill=effect, size=adjp))+ + geom_point(shape=21, stroke=0.25) + + geom_point(data = adjp_star_df, aes(x=feature_set, y=terms), shape=8, size=0.5, color = "black", alpha = 0.5) + + scale_fill_gradient2(midpoint=0, low="royalblue4", mid="white", high="firebrick2", space ="Lab", name = if (tool=="preranked_GSEApy" | tool=="pycisTarget" | tool=="RcisTarget") effect_col else paste0("log2(",effect_col,")")) + + scale_y_discrete(label=addline_format) + + scale_size_continuous(range = c(1,5), name = if (tool=="pycisTarget" | tool=="RcisTarget") adjp_col else paste("-log10(",adjp_col,")")) + + ggtitle(paste(tool, database, group, sep='\n')) + + clean_theme() + + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust=1), + axis.title.x=element_blank(), + axis.title.y=element_blank()) + guides(size = guide_legend(reverse=TRUE)) + + width <- 0.15 * dim(adjp_df)[2] + 3 + height <- 0.2 * dim(adjp_df)[1] + 2 + + # save plot + if (type == "topTerms"){ + plot_path <- plot_path_topTerms + } else { + plot_path <- plot_path_specificTerms + } + ggsave_new(filename = paste0(group,'_',database,'_summary_', type), results_path = dirname(plot_path), plot = enr_plot, width = width, height = height ) +} + +# create both plots +make_summary_plot(type = "topTerms") +make_summary_plot(type = "specificTerms")