J-81 PR changes 3/3 (no CI changes)

torres-alexis · torres-alexis · commit d857577f6cb1 · 2024-01-31T19:26:04.000-08:00
nasa#29 - Based on: 1092e80 2dd8fce 11a22f0 7ba75d0 4e317f2 5d35b61 7d2cc24 53363e4 1b6e325 b5013e9 574eb79 fc89c5e 1500019 6719218 35c9823 af0b716 cbf6055 6a0d105 ee188eb a8c65d3 - Changed dge module to DGE_BY_DESEQ2, added test modules - Removed deprecated conda support files - Updated ensembl file used for test dataset VV
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md b/RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Fixed
+
+- Workflow usage files will all follow output directory set by workflow user
+  
+### Changed
+
+- TrimGalore! will now use autodetect for adaptor type
+- V&V migrated from dp_tools version 1.1.8 to 1.3.2 including:
+  - Migration of V&V protocol code to this codebase instead of dp_tools
+  - Fix for sample wise checks reusing same sample
+
 ## [1.0.3](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.3/RNAseq/Workflow_Documentation/NF_RCP-F) - 2023-01-25
 
 ### Added
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd
@@ -7,49 +7,70 @@ params:
     input_gene_results_dir: ""
     # One and only one of the following must be specified
     runsheet_path: NULL 
-    isa_path: NULL
 
     primary_keytype: "" # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
     normalization: "default" # ENUM like, supports "ERCC-groupB" and "default"
     normalized_counts_output_prefix: ""
     dge_output_prefix: ""
     DEBUG_MODE_LIMIT_GENES: FALSE
     DEBUG_MODE_ADD_DUMMY_COUNTS: FALSE
-    work_dir: "." # should be set to launch directory 
-    verbose: FALSE
+    work_dir: "." # NON_DPPD: should be set to launch directory 
+    SUMMARY_FILE_PATH: "summary.txt"
 ---
 
 ## Substeps {.tabset}
 
 ### 1. Setup
-
+<!---  START:NON_DPPD --->
 ```{r, setup, include=FALSE}
 knitr::opts_knit$set(root.dir = params$work_dir)
 library(knitr)
 ```
 
-```{r libary-loading, message = params$verbose, warning = params$verbose}
+```{r libary-loading}
 # allow more flexibility in download time
 # useful for slower connections where the default of 60 seconds might be exceeded
 options(timeout=600)
 
-# Import libraries (tximport, DESeq2, tidyverse, Risa)
+# Import libraries (tximport, DESeq2, tidyverse)
 library(tximport)
 library(DESeq2)
 library(stringr)
 
 params
+SUMMARY_FILE_PATH <- params$SUMMARY_FILE_PATH
 yaml::write_yaml(params, "last_params.yml")
-```
-```{r validate_params}
-# assert either runsheet_path OR isa_path supplied in params
-if (!xor(!is.null(params$runsheet_path), !is.null(params$isa_path))) {
-    stop("Must supply EITHER runsheet_path or isa_path in params")
-}
+#  END:NON_DPPD
+
+# START:ONLY_DPPD
+# params <- c(
+#     runsheet_path = "/path/to/runsheet", # Used for downloading
+#     input_gene_results_dir = "/path/to/genes_results_files", # Location of the gene results files
+#     primary_keytype = "", # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
+#     normalization = "", # ENUM like, supports "ERCC-groupB" and "default"
+#     normalized_counts_output_prefix = "", # Output prefix for normalized counts files
+#     dge_output_prefix = "" # Output prefix for DGE files
+# )
+# END:ONLY_DPPD
 ```
 
 ### 2. Load Study Metadata
-```{r runsheet-to-compare_df, include=(!is.null(params$runsheet_path)), eval=(!is.null(params$runsheet_path))}
+```{r runsheet-to-compare_df}
+#' Calculate the square of a number
+#'
+#' This function takes a numeric input and returns its square.
+#'
+#' @param x Numeric value to be squared.
+#'
+#' @return The square of the input value.
+#'
+#' @examples
+#' square(2)
+#' # Output: 4
+#'
+#' square(-3)
+#' # Output: 9
+#'
 compare_csv_from_runsheet <- function(runsheet_path) {
     df = read.csv(runsheet_path)
     # get only Factor Value columns
@@ -64,25 +85,6 @@ compare_csv <- compare_csv_from_runsheet(params$runsheet_path)
 #DT::datatable(compare_csv, caption = "Data Frame of parsed runsheet filtered to required columns")
 ```
 
-```{r isa-to-compare_df, include=(!is.null(params$isa_path)), eval=(!is.null(params$isa_path))}
-# TODO: Remove this route, ISA zip support will be dropped as of DPPD-7101-F
-library(Risa)
-
-compare_csv_from_isa_archive <- function(isa_path) {
-    td = tempdir()
-    unzip(isa_path, exdir = td)
-    isa <- Risa::readISAtab(path = td)
-    n = as.numeric(which(isa@assay.technology.types == "RNA Sequencing (RNA-Seq)"))
-    isa_tabs <- isa@assay.tabs[[n]]@assay.file
-    factors <- as.data.frame(isa@factors[[1]], stringsAsFactors = FALSE)
-    colnames(factors) <- paste("factor",1:dim(factors)[2], sep = "_")
-    return(data.frame(sample_id = isa_tabs$`Sample Name`, factors))
-}
-# Loading metadata from isa archive
-compare_csv <- compare_csv_from_isa_archive(params$isa_path)
-#DT::datatable(compare_csv, caption = "Data Frame of parsed isa archive filtered to required metadata")
-```
-
 ```{r compare_df-to-study_df}
 study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
 colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
@@ -130,8 +132,7 @@ files <- list.files(
 
 ## Reorder the *genes.results files to match the ordering of the ISA samples
 
-# Replace spaces in sample names from ISA with "_", consistent with runsheet generation
-samples = str_replace_all(rownames(study), " ", "_")
+samples = rownames(study)
 reordering <- sapply(samples, function(x)grep(paste0("Rsem_gene_counts/", x,".genes.results$"), files, value=FALSE))
 files <- files[reordering]
 names(files) <- samples
@@ -335,12 +336,12 @@ output_table_1$LRT.p.value <- res_1_lrt@listData$padj
 ```{r wald-test-iteration}
 ## Iterate through Wald Tests to generate pairwise comparisons of all groups
 for (i in 1:dim(contrasts)[2]){
-  res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
-  res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
-  colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
-  output_table_1<-cbind(output_table_1,res_1)
-  rm(res_1)
+    res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
+    res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
+    colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
+    output_table_1<-cbind(output_table_1,res_1)
 }
+
 ```
 
 ```{r}
@@ -385,6 +386,16 @@ write.csv(
     sampleTable,
     file =  paste0(params$dge_output_prefix, "SampleTable.csv")
 )
+
+# Create summary file based on output_table_1
+output <- capture.output(summary(output_table_1))
+
+# Open file connection
+conn <- file(paste0(params$dge_output_prefix, "summary.txt"), "w")
+
+# Write the captured output to the file
+writeLines(output, conn)
+
 # DT::datatable(head(output_table_1, n = 30),
 #   caption = "First 30 rows of differential gene expression table",
 #   extensions = "FixedColumns",
@@ -408,4 +419,4 @@ cat(capture.output(sessionInfo()),
     file = version_output_fn,
     append = TRUE,
     sep = "\n")
-```
+```
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R
@@ -4,10 +4,6 @@ library("here")
 library("cli")
 
 parser <- OptionParser()
-parser <- add_option(parser, c("-v", "--verbose"),
-    action = "store_true",
-    default = FALSE, help = "Print extra output [default]"
-)
 parser <- add_option(parser, c("--skip_perform_dge"),
     action = "store_true", default = FALSE,
     help = "Skips running the DGE, this can be used when the output from the DGE already exist",
@@ -43,9 +39,6 @@ parser <- add_option(parser, c("--DEBUG_MODE_ADD_DUMMY_COUNTS"),
     default = FALSE, action = "store_true",
     help = "Replaces all gene counts with random values from 0 to 5000",
 )
-parser <- add_option(parser, c("--isa_path"),
-    help = "ISA Archive path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
-)
 parser <- add_option(parser, c("--runsheet_path"),
     help = "runsheet csv path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
 )
@@ -69,14 +62,11 @@ if (!args$skip_perform_dge) {
     cli_alert_warning("Running Perform_DGE.Rmd")
     rmarkdown::render(here("dge_annotation_R_scripts", "Perform_DGE.Rmd"),
         output_dir = args$work_dir,
-        quiet = !args$verbose,
         params = list(
             work_dir = args$work_dir,
-            verbose = args$verbose,
             input_gene_results_dir = args$input_gene_results_dir,
             primary_keytype = args$primary_keytype,
             runsheet_path = args$runsheet_path,
-            isa_path = args$isa_path,
             normalization = args$normalization,
             dge_output_prefix = args$dge_output_prefix,
             normalized_counts_output_prefix = args$normalized_counts_output_prefix,
@@ -93,7 +83,6 @@ if (!args$skip_gene_annotation) {
     cli_alert_warning("Running Add_Gene_Annotations.Rmd")
     rmarkdown::render(here("dge_annotation_R_scripts", "Add_Gene_Annotations.Rmd"),
         output_dir = args$work_dir,
-        quiet = !args$verbose,
         params = list(
             input_table_path = paste0(args$dge_output_prefix, "differential_expression_no_annotations.csv"),
             work_dir = args$work_dir,
@@ -111,7 +100,6 @@ if (!args$skip_gene_annotation) {
     cli_alert_warning("Running Extend_DGE_Table.Rmd")
     rmarkdown::render(here("dge_annotation_R_scripts", "Extend_DGE_Table.Rmd"),
         output_dir = args$work_dir,
-        quiet = !args$verbose,
         params = list(
             input_table_path = paste0(args$dge_output_prefix, "differential_expression.csv"),
             work_dir = args$work_dir,
@@ -128,7 +116,6 @@ if (!args$skip_gene_annotation) {
     cli_alert_warning("Running Generate_PCA_Table.Rmd")
     rmarkdown::render(here("dge_annotation_R_scripts", "Generate_PCA_Table.Rmd"),
         output_dir = args$work_dir,
-        quiet = !args$verbose,
         params = list(
             input_table_path = paste0(args$normalized_counts_output_prefix, "Normalized_Counts.csv"),
             work_dir = args$work_dir,
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf
@@ -22,7 +22,7 @@ include { BUILD_STAR;
           CONCAT_ERCC;
           QUANTIFY_STAR_GENES;
           QUANTIFY_RSEM_GENES } from './modules/genome.nf'
-include { DGE_BY_DESEQ2 } from './modules/dge.nf'
+include { DGE_BY_DESEQ2 } from './modules/DGE_BY_DESEQ2'
 include { VV_RAW_READS;
           VV_TRIMMED_READS;
           VV_STAR_ALIGNMENTS;
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf
@@ -28,6 +28,9 @@ process DGE_BY_DESEQ2 {
           path("dge_output_ercc/visualization_output_table_ERCCnorm.csv"),
           path("dge_output_ercc/visualization_PCA_table_ERCCnorm.csv"), optional: true, emit: dge_ercc
 
+    path("dge_output/summary.txt"), emit: summary
+    path("dge_output_ercc/ERCCnorm_summary.txt"), optional: true, emit: summary_ercc
+
     path("versions.txt"), emit: version
 
   script:
@@ -45,8 +48,7 @@ process DGE_BY_DESEQ2 {
         --dge_output_prefix "dge_output/" \\
         --annotation_file_path ${annotation_file} \\
         --extended_table_output_prefix "dge_output/"\\
-        --extended_table_output_suffix ".csv" \\
-        --verbose
+        --extended_table_output_suffix ".csv"
 
     if ${ meta.has_ercc ? 'true' : 'false'}
     then
@@ -60,8 +62,8 @@ process DGE_BY_DESEQ2 {
             --dge_output_prefix "dge_output_ercc/ERCCnorm_" \\
             --annotation_file_path ${annotation_file} \\
             --extended_table_output_prefix "dge_output_ercc/"\\
-            --extended_table_output_suffix "_ERCCnorm.csv" \\
-            --verbose
+            --extended_table_output_suffix "_ERCCnorm.csv"
     fi
+    # bump
     """
-}
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test
@@ -0,0 +1,71 @@
+nextflow_process {
+
+    name "Test Process DGE_BY_DESEQ2"
+    script "modules/DGE_BY_DESEQ2/main.nf"
+    process "DGE_BY_DESEQ2"
+    
+    test("GLDS-194") {
+        tag 'DGE_BY_DESEQ2'
+
+        when {
+            params {
+                // define parameters here. Example:
+                use_dummy_gene_counts = true
+            }
+            process {
+                """
+                // define inputs of the process here. Example:
+                input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv")
+                input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results")
+                input[2] = [ primary_keytype:'ENSEMBL', has_ercc:true ]
+                input[3] = file("https://figshare.com/ndownloader/files/36597114")
+                input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(
+                process.out.summary,
+                process.out.norm_counts,
+                process.out.summary_ercc,
+                process.out.norm_counts_ercc,
+                process.out.version
+            ).match()
+        }
+
+    }
+
+    test("GLDS-321:55_.ISSUE") {
+        tag 'DGE_BY_DESEQ2'
+
+        when {
+            params {
+                // define parameters here. Example:
+                use_dummy_gene_counts = true
+            }
+            process {
+                """
+                // define inputs of the process here. Example:
+                input[0] = file("test-datasets/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv")
+                input[1] = file("test-datasets/testdata/GLDS-321/03-RSEM_Counts/*.genes.results")
+                input[2] = [ primary_keytype:'TAIR', has_ercc:false ]
+                input[3] = file("https://figshare.com/ndownloader/files/36597132")
+                input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
+                """
+            }
+        }
+
+        then {
+            assert process.success
+            assert snapshot(
+                process.out.summary,
+                process.out.norm_counts,
+                process.out.version,
+            ).match()
+        }
+
+    }
+
+}
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/DGE_BY_DESEQ2/DGE_BY_DESEQ2.nf.test.snap
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test
diff --git a/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap b/RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/tests/modules/dge.nf.test.snap