diff --git a/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R b/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R index e172ab35ae5c..3e982ee3e946 100644 --- a/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R +++ b/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R @@ -39,14 +39,14 @@ #' @return named list of options and values similar to optparse parse_args <- function(x) { - args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] - args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) - - # Ensure the option vectors are length 2 (key/ value) to catch empty ones - args_vals <- lapply(args_vals, function(z) { length(z) <- 2; z}) - - parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) - parsed_args[! is.na(parsed_args)] + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z) { length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] } #' Flexibly read CSV or TSV files @@ -58,24 +58,24 @@ parse_args <- function(x) { #' @return output Data frame read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F) { - - ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) - - if (ext == "tsv" || ext == "txt") { - separator <- "\\t" - } else if (ext == "csv") { - separator <- "," - } else { - stop(paste("Unknown separator for", ext)) - } - - read.delim( - file, - sep = separator, - header = header, - row.names = row.names, - check.names = check.names - ) + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) } #' Round numeric dataframe columns to fixed decimal places by applying @@ -87,27 +87,27 @@ read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.nam #' #' @return output Data frame round_dataframe_columns <- function(df, columns = NULL, digits = -1) { - if (digits == -1) { - return(df) # if -1, return df without rounding - } - - df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up - if (is.null(columns)) { - columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))] # extract only numeric columns for rounding - } - - df[,columns] <- round( - data.frame(df[, columns], check.names = FALSE), - digits = digits - ) - - # Convert columns back to numeric - - for (c in columns) { - df[[c]][grep("^ *NA\$", df[[c]])] <- NA - df[[c]] <- as.numeric(df[[c]]) - } - df + if (digits == -1) { + return(df) # if -1, return df without rounding + } + + df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up + if (is.null(columns)) { + columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))] # extract only numeric columns for rounding + } + + df[,columns] <- round( + data.frame(df[, columns], check.names = FALSE), + digits = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df } ################################################ @@ -122,25 +122,24 @@ round_dataframe_columns <- function(df, columns = NULL, digits = -1) { # Set defaults and classes opt <- list( - de_file = '$de_file', - de_id_column = 'gene_id', - organism = NULL, - sources = NULL, - output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), - significant = T, - measure_underrepresentation = F, - correction_method = 'gSCS', - evcodes = F, - pval_threshold = 0.05, - gmt_file = '$gmt_file', - token = NULL, - background_file = '$background_file', - background_column = NULL, - domain_scope = 'annotated', - min_diff = 1, - round_digits = -1, - palette_name = 'Blues', - archive = 'gprofiler' + de_file = '$de_file', + de_id_column = 'gene_id', + organism = NULL, + sources = NULL, + output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + significant = T, + measure_underrepresentation = F, + correction_method = 'gSCS', + evcodes = T, + pval_threshold = 0.05, + gmt_file = '$gmt_file', + token = NULL, + background_file = '$background_file', + background_column = NULL, + domain_scope = 'annotated', + min_diff = 1, + round_digits = -1, + palette_name = 'Blues' ) opt_types <- lapply(opt, class) @@ -149,38 +148,38 @@ opt_types <- lapply(opt, class) args_opt <- parse_args('$task.ext.args') for ( ao in names(args_opt)) { - if (! ao %in% names(opt)) { - stop(paste("Invalid option:", ao)) - } else { - - # Preserve classes from defaults where possible - if (! is.null(opt[[ao]])) { - args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) - } - opt[[ao]] <- args_opt[[ao]] + if (! ao %in% names(opt)) { + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])) { + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) } + opt[[ao]] <- args_opt[[ao]] + } } # Check if required parameters have been provided required_opts <- c('output_prefix') missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] if (length(missing) > 0) { - stop(paste("Missing required options:", paste(missing, collapse=', '))) + stop(paste("Missing required options:", paste(missing, collapse=', '))) } if (is.null(opt\$organism) && opt\$gmt_file == "" && is.null(opt\$token)) { - stop('Please provide organism, gmt_file or token.') + stop('Please provide organism, gmt_file or token.') } # Check file inputs are valid for (file_input in c('de_file')) { - if (is.null(opt[[file_input]])) { - stop(paste("Please provide", file_input), call. = FALSE) - } - - if (! file.exists(opt[[file_input]])) { - stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) - } + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + + if (! file.exists(opt[[file_input]])) { + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } } ################################################ @@ -191,7 +190,6 @@ for (file_input in c('de_file')) { library(gprofiler2) library(ggplot2) -library(yaml) ################################################ ################################################ @@ -200,9 +198,9 @@ library(yaml) ################################################ de.genes <- - read_delim_flexible( - file = opt\$de_file - ) + read_delim_flexible( + file = opt\$de_file + ) output_prefix <- paste0(opt\$output_prefix, ".gprofiler2") @@ -210,223 +208,280 @@ output_prefix <- paste0(opt\$output_prefix, ".gprofiler2") file.create(paste(output_prefix, 'all_enriched_pathways', 'tsv', sep = '.')) if (nrow(de.genes) > 0) { - - query <- de.genes[[opt\$de_id_column]] - - ################################################ - ################################################ - # Run gprofiler processes and generate outputs # - ################################################ - ################################################ - - set.seed(1) # This will ensure that reruns have the same plot colors - - sources <- opt\$sources - if (!is.null(sources)) { - sources <- strsplit(opt\$sources, split = ",")[[1]] - } - - if (!is.null(opt\$token)) { - - # First check if a token was provided - token <- opt\$token - - } else if (!is.null(opt\$organism)) { - - # Next, check if organism was provided. Get the GMT file from gprofiler and save both the full file as well as the filtered one to metadata - base_url <- paste0("https://biit.cs.ut.ee/", opt\$archive) - gmt_url <- paste0(base_url, "//static/gprofiler_full_", opt\$organism, ".ENSG.gmt") - set_base_url(base_url) - tryCatch( - { - gmt_path <- paste0("gprofiler_full_", opt\$organism, ".ENSG.gmt") - if (!is.null(sources)) { - gmt_path <- paste0("gprofiler_full_", opt\$organism, ".", paste(sources, collapse="_"), ".ENSG_filtered.gmt") - } - download <- download.file(gmt_url, gmt_path) - if (download != 0) { - print("Failed to fetch the GMT file from gprofiler with this URL:") - print(gmt_url) - print("For reproducibility reasons, try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'.") - } else { - if (!is.null(sources)) { - gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(gmt_path)) - print(paste0("GMT file successfully downloaded and filtered. Please note that for some sources, the GMT file may not contain any entries as these cannot be retrieved from gprofiler; in this case, the GMT file may be completely empty.")) - writeLines(gmt, gmt_path) - } - } - }, - error=function(gost_error) { - print("Failed to fetch the GMT file from gprofiler with this URL:") - print(gmt_url) - print("Got error:") - print(gost_error) - print("For reproducibility reasons, please try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'. Then provide it to the pipeline with the parameter `--gmt_file`") - } - ) - token <- opt\$organism - - } else { - - # Last option: Use custom GMT file - gmt_path <- opt\$gmt_file - - # If sources are set, extract only requested entries (gprofiler will NOT filter automatically!) + + query <- de.genes[[opt\$de_id_column]] + + # Optional map: Ensembl ID -> gene symbol (used only for display) + symbol_map <- NULL + # Try to build from columns in the DE table (offline, preferred) + id_candidates <- c("gene_id","feature_id","feature","Ensembl_ID","ensembl_gene_id") + name_candidates <- c("gene_name","feature_name","symbol","external_gene_name","mgi_symbol","hgnc_symbol") + id_col <- id_candidates[id_candidates %in% colnames(de.genes)][1] + name_col <- name_candidates[name_candidates %in% colnames(de.genes)][1] + if (!is.na(id_col) && !is.na(name_col)) { + symbol_map <- setNames(as.character(de.genes[[name_col]]), as.character(de.genes[[id_col]])) + } + + ################################################ + ################################################ + # Run gprofiler processes and generate outputs # + ################################################ + ################################################ + + set.seed(1) # This will ensure that reruns have the same plot colors + + sources <- opt\$sources + if (!is.null(sources)) { + sources <- strsplit(opt\$sources, split = ",")[[1]] + } + if (!is.null(sources)) { + sources <- strsplit(opt\$sources, split = ",")[[1]] + } + + if (!is.null(opt\$token)) { + + # First check if a token was provided + token <- opt\$token + + } else if (!is.null(opt\$organism)) { + + # Next, check if organism was provided. Get the GMT file from gprofiler and save both the full file as well as the filtered one to metadata + gmt_url <- paste0("https://biit.cs.ut.ee/gprofiler//static/gprofiler_full_", opt\$organism, ".ENSG.gmt") + tryCatch( + { + gmt_path <- paste0("gprofiler_full_", opt\$organism, ".ENSG.gmt") if (!is.null(sources)) { - gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(opt\$gmt)) - gmt_path <- paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_filtered.gmt") + gmt_path <- paste0("gprofiler_full_", opt\$organism, ".", paste(sources, collapse="_"), ".ENSG_filtered.gmt") + } + download <- download.file(gmt_url, gmt_path) + if (download != 0) { + print("Failed to fetch the GMT file from gprofiler with this URL:") + print(gmt_url) + print("For reproducibility reasons, try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'.") + } else { + if (!is.null(sources)) { + gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(gmt_path)) + print(paste0("GMT file successfully downloaded and filtered. Please note that for some sources, the GMT file may not contain any entries as these cannot be retrieved from gprofiler; in this case, the GMT file may be completely empty.")) writeLines(gmt, gmt_path) + } } - token <- upload_GMT_file(gmt_path) - - # Add gost ID to output GMT name so that it can be reused in future runs - file.rename(gmt_path, paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_gostID_", token, "_filtered.gmt")) - + }, + error=function(gost_error) { + print("Failed to fetch the GMT file from gprofiler with this URL:") + print(gmt_url) + print("Got error:") + print(gost_error) + print("For reproducibility reasons, please try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'. Then provide it to the pipeline with the parameter `--gmt_file`") + } + ) + token <- opt\$organism + + } else { + + # Last option: Use custom GMT file + gmt_path <- opt\$gmt_file + + # If sources are set, extract only requested entries (gprofiler will NOT filter automatically!) + if (!is.null(sources)) { + gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(opt\$gmt_file)) + gmt_path <- paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_filtered.gmt") + writeLines(gmt, gmt_path) } - - - # If custom background_file was provided, read it - if (opt\$background_file != "") { - intensities_table <- read_delim_flexible( - file = opt\$background_file - ) - # If only 1 col, it is a list, not a matrix - if (ncol(intensities_table) == 1) { - background <- intensities_table[,1] # Extract first column from df - background <- append(background, colnames(intensities_table)[1]) # First entry was put into header, add it to vector + token <- upload_GMT_file(gmt_path) + + # Add gost ID to output GMT name so that it can be reused in future runs + file.rename(gmt_path, paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_gostID_", token, "_filtered.gmt")) + + } + + + # If custom background_file was provided, read it + if (opt\$background_file != "") { + intensities_table <- read_delim_flexible( + file = opt\$background_file + ) + # If only 1 col, it is a list, not a matrix + if (ncol(intensities_table) == 1) { + background <- intensities_table[,1] # Extract first column from df + background <- append(background, colnames(intensities_table)[1]) # First entry was put into header, add it to vector + } else { + # Otherwise it's a matrix + # Set rownames to background_column if param was set + if (!is.null(opt\$background_column)) { + if (opt\$background_column %in% colnames(intensities_table)) { + rownames(intensities_table) <- intensities_table[[opt\$background_column]] + intensities_table[[opt\$background_column]] <- NULL } else { - # Otherwise it's a matrix - # Set rownames to background_column if param was set - if (!is.null(opt\$background_column)) { - if (opt\$background_column %in% colnames(intensities_table)) { - rownames(intensities_table) <- intensities_table[[opt\$background_column]] - intensities_table[[opt\$background_column]] <- NULL - } else { - stop(paste0("Invalid background_column argument: ", opt\$background_column, - ". Valid columns are: ", paste(colnames(intensities_table), collapse=", "), ".")) - } - } else { - - # Otherwise set rownames to first column - rownames(intensities_table) <- intensities_table[,1] - intensities_table <- intensities_table[,-1] - } - - # Rownames are set, now remove non-numeric columns - nums <- unlist(lapply(intensities_table, is.numeric), use.names = FALSE) - intensities_table <- intensities_table[, nums] - # Keep only rownames which have abundance - background <- rownames(subset(intensities_table, rowSums(intensities_table, na.rm = TRUE)>0)) + stop(paste0("Invalid background_column argument: ", opt\$background_column, + ". Valid columns are: ", paste(colnames(intensities_table), collapse=", "), ".")) } - } else { - background <- NULL + } else { + + # Otherwise set rownames to first column + rownames(intensities_table) <- intensities_table[,1] + intensities_table <- intensities_table[,-1] + } + + # Rownames are set, now remove non-numeric columns + nums <- unlist(lapply(intensities_table, is.numeric), use.names = FALSE) + intensities_table <- intensities_table[, nums] + # Keep only rownames which have abundance + background <- rownames(subset(intensities_table, rowSums(intensities_table, na.rm = TRUE)>0)) } - - # Name the query as it will otherwise be called 'query_1' which will also determine the gostplot title - q <- list(query) - names(q) <- c(output_prefix) - gost_results <- gost( - query=q, - organism=token, - significant=opt\$significant, - measure_underrepresentation=opt\$measure_underrepresentation, - correction_method=opt\$correction_method, - sources=sources, - evcodes=opt\$evcodes, - user_threshold=opt\$pval_threshold, - custom_bg=background, - domain_scope=opt\$domain_scope + } else { + background <- NULL + } + + # Name the query as it will otherwise be called 'query_1' which will also determine the gostplot title + q <- list(query) + names(q) <- c(output_prefix) + gost_results <- gost( + query=q, + organism=token, + significant=opt\$significant, + measure_underrepresentation=opt\$measure_underrepresentation, + correction_method=opt\$correction_method, + sources=sources, + evcodes=opt\$evcodes, + user_threshold=opt\$pval_threshold, + custom_bg=background, + domain_scope=opt\$domain_scope + ) + + if (!is.null(gost_results)) { + # Create interactive plot and save to HTML + interactive_plot <- gostplot(gost_results, capped=T, interactive=T) + + # Save interactive plot as HTML + htmlwidgets::saveWidget( + widget = interactive_plot, + file = paste(output_prefix, 'gostplot', 'html', sep = '.') ) - - if (!is.null(gost_results)) { - # Create interactive plot and save to HTML - interactive_plot <- gostplot(gost_results, capped=T, interactive=T) - - # Save interactive plot as HTML - htmlwidgets::saveWidget( - widget = interactive_plot, - file = paste(output_prefix, 'gostplot', 'html', sep = '.') - ) - - # Create a static plot and save to PNG - static_plot <- gostplot(gost_results, capped=T, interactive=F) - ggsave(plot = static_plot, filename = paste(output_prefix, 'gostplot', 'png', sep = '.'), width = 10, height = 7) - - # Subset gost results to those pathways with a min. number of differential features - gost_results\$result <- gost_results\$result[which(gost_results\$result\$intersection_size>=opt\$min_diff),] - - # annotate query size (number of differential features in contrast) - gost_results\$result\$original_query_size <- rep(length(as.character(de.genes\$Ensembl_ID)), nrow(gost_results\$result)) - - # R object for other processes to use - - saveRDS(gost_results, file = paste(output_prefix, 'gost_results.rds', sep = '.')) - - # Write full enrichment table (except parents column as that one throws an error) - - gost_results\$results <- data.frame( - round_dataframe_columns(gost_results\$result[,-which(names(gost_results\$result) == "parents")], digits=opt\$round_digits), - check.names = FALSE - ) - - write.table( - gost_results\$results, - file = paste(output_prefix, 'all_enriched_pathways', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE + + # Create a static plot and save to PNG + static_plot <- gostplot(gost_results, capped=T, interactive=F) + ggsave(plot = static_plot, filename = paste(output_prefix, 'gostplot', 'png', sep = '.'), width = 10, height = 7) + + # Subset gost results to those pathways with a min. number of differential features + gost_results\$result <- gost_results\$result[which(gost_results\$result\$intersection_size>=opt\$min_diff),] + + # If we don't have symbols from the DE table, build a map via g:Profiler now (requires organism) + conv_map <- NULL + if (is.null(symbol_map) && !is.null(opt\$organism)) { + ids_all <- unique(trimws(unlist(strsplit(paste(gost_results\$result\$intersection, collapse=","), ",")))) + if (length(ids_all) > 0) { + conv <- tryCatch( + # let g:Profiler pick the right namespace; we'll use the 'name' column as the symbol + gprofiler2::gconvert(ids_all, organism = opt\$organism), + error = function(e) NULL ) - - # Iterate over the enrichment results by source and save separate tables - for (df in split(gost_results\$result, gost_results\$result\$source)){ - - db_source <- df\$source[1] - df_subset <- data.frame( - Pathway_name = df\$term_name, - Pathway_code = df\$term_id, - DE_genes = df\$intersection_size, - Pathway_size = df\$term_size, - Fraction_DE = df\$recall, - Padj = df\$p_value, - DE_genes_names = df\$intersection - ) - df_subset <- data.frame( - round_dataframe_columns(df_subset, digits=opt\$round_digits), - check.names = FALSE - ) - write.table( - df_subset, - file = paste(output_prefix, db_source, 'sub_enriched_pathways', 'tsv', sep = '.'), - col.names = TRUE, - row.names = FALSE, - sep = '\t', - quote = FALSE - ) - - # For plot, shorten pathway names as they can get quite long (full name can be looked up in the table) - df_subset\$Pathway_name <- sapply(df_subset\$Pathway_name, substr, start=1, stop=50) - - # Extract 3 colors from the chosen palette (2 are sufficient, but brewer.pal has a minimum of 3); first and last will be used for plot - colors <- RColorBrewer::brewer.pal(3, opt\$palette_name) - - # Enriched pathways horizontal barplots of padj values - p <- ggplot(df_subset, aes(x=reorder(Pathway_name, Fraction_DE), y=Fraction_DE)) + - geom_bar(aes(fill=Padj), stat="identity", width = 0.7) + - geom_text(aes(label=paste0(df_subset\$DE_genes, "/", df_subset\$Pathway_size)), vjust=0.4, hjust=-0.2, size=3) + - theme(plot.title.position = "plot") + - coord_flip() + - scale_y_continuous(limits = c(0.00, 1.24), breaks = seq(0, 1.24, by = 0.25)) + - ggtitle(paste("Enriched", db_source, "pathways")) + - xlab("") + ylab("Enriched fraction (DE features / Pathway size)") + - scale_fill_continuous(high = colors[1], low = colors[3]) - - # Save plot with set width to ensure there is enough space for the labels; adapt height to nrow but limit it to 100 as there will be an error for too high values - ggsave(p, filename = paste(output_prefix, db_source, 'sub_enriched_pathways', 'png', sep = '.'), device = "png", width=10, height=min(100, 1.5+nrow(df_subset)*0.15), limitsize=F) + if (!is.null(conv) && nrow(conv) > 0) { + conv_map <- setNames(as.character(conv\$name), as.character(conv\$input)) } + } } + + # annotate query size (number of differential features in contrast) + gost_results\$result\$original_query_size <- rep(length(unique(as.character(query))), nrow(gost_results\$result)) + + # R object for other processes to use + saveRDS(gost_results, file = paste(output_prefix, 'gost_results.rds', sep = '.')) + + # Write full enrichment table (except parents column as that one throws an error) + gost_results\$results <- data.frame( + round_dataframe_columns(gost_results\$result[,-which(names(gost_results\$result) == "parents")], digits=opt\$round_digits), + check.names = FALSE + ) + + write.table( + gost_results\$results, + file = paste(output_prefix, 'all_enriched_pathways', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + + # Iterate over the enrichment results by source and save separate tables + for (df in split(gost_results\$result, gost_results\$result\$source)){ + + db_source <- df\$source[1] + + # 1) Keep original Ensembl IDs from g:Profiler intersection + intersection_ids <- df\$intersection + + # 2) Build symbols vector using DE-derived map first, then g:Profiler fallback + intersection_symbols <- vapply(strsplit(df\$intersection, ","), function(vec){ + vec <- trimws(vec) + + # start empty + sym <- rep(NA_character_, length(vec)) + + # fill from DE symbol_map (if present) + if (!is.null(symbol_map)) { + tmp <- symbol_map[vec] + sym[!is.na(tmp) & tmp != ""] <- tmp[!is.na(tmp) & tmp != ""] + } + + # fill remaining from gconvert map (if present) + if (!is.null(conv_map)) { + tmp2 <- conv_map[vec] + need <- is.na(sym) | sym == "" + sym[need & !is.na(tmp2) & tmp2 != ""] <- tmp2[need & !is.na(tmp2) & tmp2 != ""] + } + + # fallback to original IDs + sym[is.na(sym) | sym == ""] <- vec[is.na(sym) | sym == ""] + paste(sym, collapse = ",") + }, FUN.VALUE = character(1)) + + df_subset <- data.frame( + Pathway_name = df\$term_name, + Pathway_code = df\$term_id, + DE_genes = df\$intersection_size, + Pathway_size = df\$term_size, + Fraction_DE = df\$recall, + Padj = df\$p_value, + DE_genes_ids = intersection_ids, + DE_genes_names = intersection_symbols + ) + + df_subset <- data.frame( + round_dataframe_columns(df_subset, digits=opt\$round_digits), + check.names = FALSE + ) + write.table( + df_subset, + file = paste(output_prefix, db_source, 'sub_enriched_pathways', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + + # For plot, shorten pathway names as they can get quite long (full name can be looked up in the table) + df_subset\$Pathway_name <- sapply(df_subset\$Pathway_name, substr, start=1, stop=50) + + # Extract 3 colors from the chosen palette (2 are sufficient, but brewer.pal has a minimum of 3); first and last will be used for plot + colors <- RColorBrewer::brewer.pal(3, opt\$palette_name) + + # Enriched pathways horizontal barplots of padj values + p <- ggplot(df_subset, aes(x=reorder(Pathway_name, Fraction_DE), y=Fraction_DE)) + + geom_bar(aes(fill=Padj), stat="identity", width = 0.7) + + geom_text(aes(label=paste0(df_subset\$DE_genes, "/", df_subset\$Pathway_size)), vjust=0.4, hjust=-0.2, size=3) + + theme(plot.title.position = "plot") + + coord_flip() + + scale_y_continuous(limits = c(0.00, 1.24), breaks = seq(0, 1.24, by = 0.25)) + + ggtitle(paste("Enriched", db_source, "pathways")) + + xlab("") + ylab("Enriched fraction (DE features / Pathway size)") + + scale_fill_continuous(high = colors[1], low = colors[3]) + + # Save plot with set width to ensure there is enough space for the labels; adapt height to nrow but limit it to 100 as there will be an error for too high values + ggsave(p, filename = paste(output_prefix, db_source, 'sub_enriched_pathways', 'png', sep = '.'), device = "png", width=10, height=min(100, 1.5+nrow(df_subset)*0.15), limitsize=F) + } + } } else { - print("No differential features found, pathway enrichment analysis with gprofiler2 will be skipped.") + print("No differential features found, pathway enrichment analysis with gprofiler2 will be skipped.") } ################################################ @@ -448,25 +503,16 @@ sink() r.version <- strsplit(version[['version.string']], ' ')[[1]][3] gprofiler2.version <- as.character(packageVersion('gprofiler2')) ggplot2.version <- as.character(packageVersion('ggplot2')) -if (is.null(opt\$token) & !is.null(opt\$organism)) { - gprofiler_data.version <- as.yaml(get_version_info(opt\$organism)) -} else { - gprofiler_data.version <- as.yaml(get_version_info()) -} -# The YAML comes out a bit messy for the gprofiler data, not nested correctly -# when we use writeLines and gprofiler_data, so I write it via yaml -write_yaml( - list( - '"$task.process"'=list( - "r-ggplot2"=ggplot2.version, - "r-gprofiler2"=gprofiler2.version, - "gprofiler-data"=gprofiler_data.version - ) - ), - file='versions.yml' -) +writeLines( + c( + '"\${task.process}":', + paste(' r-base:', r.version), + paste(' r-ggplot2:', ggplot2.version), + paste(' r-gprofiler2:', gprofiler2.version) + ), + 'versions.yml') ################################################ ################################################ ################################################ -################################################ +################################################ \ No newline at end of file