Skip to content

Commit d857577

Browse files
committed
J-81 PR changes 3/3 (no CI changes)
nasa#29 - Based on: 1092e80 2dd8fce 11a22f0 7ba75d0 4e317f2 5d35b61 7d2cc24 53363e4 1b6e325 b5013e9 574eb79 fc89c5e 1500019 6719218 35c9823 af0b716 cbf6055 6a0d105 ee188eb a8c65d3 - Changed dge module to DGE_BY_DESEQ2, added test modules - Removed deprecated conda support files - Updated ensembl file used for test dataset VV
1 parent 61be4ad commit d857577

File tree

9 files changed

+185
-254
lines changed

9 files changed

+185
-254
lines changed

RNAseq/Workflow_Documentation/NF_RCP-F/CHANGELOG.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,19 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [Unreleased]
9+
10+
### Fixed
11+
12+
- Workflow usage files will all follow output directory set by workflow user
13+
14+
### Changed
15+
16+
- TrimGalore! will now use autodetect for adaptor type
17+
- V&V migrated from dp_tools version 1.1.8 to 1.3.2 including:
18+
- Migration of V&V protocol code to this codebase instead of dp_tools
19+
- Fix for sample wise checks reusing same sample
20+
821
## [1.0.3](https://github.com/nasa/GeneLab_Data_Processing/tree/NF_RCP-F_1.0.3/RNAseq/Workflow_Documentation/NF_RCP-F) - 2023-01-25
922

1023
### Added

RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/Perform_DGE.Rmd

Lines changed: 51 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -7,49 +7,70 @@ params:
77
input_gene_results_dir: ""
88
# One and only one of the following must be specified
99
runsheet_path: NULL
10-
isa_path: NULL
1110

1211
primary_keytype: "" # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
1312
normalization: "default" # ENUM like, supports "ERCC-groupB" and "default"
1413
normalized_counts_output_prefix: ""
1514
dge_output_prefix: ""
1615
DEBUG_MODE_LIMIT_GENES: FALSE
1716
DEBUG_MODE_ADD_DUMMY_COUNTS: FALSE
18-
work_dir: "." # should be set to launch directory
19-
verbose: FALSE
17+
work_dir: "." # NON_DPPD: should be set to launch directory
18+
SUMMARY_FILE_PATH: "summary.txt"
2019
---
2120

2221
## Substeps {.tabset}
2322

2423
### 1. Setup
25-
24+
<!--- START:NON_DPPD --->
2625
```{r, setup, include=FALSE}
2726
knitr::opts_knit$set(root.dir = params$work_dir)
2827
library(knitr)
2928
```
3029

31-
```{r libary-loading, message = params$verbose, warning = params$verbose}
30+
```{r libary-loading}
3231
# allow more flexibility in download time
3332
# useful for slower connections where the default of 60 seconds might be exceeded
3433
options(timeout=600)
3534
36-
# Import libraries (tximport, DESeq2, tidyverse, Risa)
35+
# Import libraries (tximport, DESeq2, tidyverse)
3736
library(tximport)
3837
library(DESeq2)
3938
library(stringr)
4039
4140
params
41+
SUMMARY_FILE_PATH <- params$SUMMARY_FILE_PATH
4242
yaml::write_yaml(params, "last_params.yml")
43-
```
44-
```{r validate_params}
45-
# assert either runsheet_path OR isa_path supplied in params
46-
if (!xor(!is.null(params$runsheet_path), !is.null(params$isa_path))) {
47-
stop("Must supply EITHER runsheet_path or isa_path in params")
48-
}
43+
# END:NON_DPPD
44+
45+
# START:ONLY_DPPD
46+
# params <- c(
47+
# runsheet_path = "/path/to/runsheet", # Used for downloading
48+
# input_gene_results_dir = "/path/to/genes_results_files", # Location of the gene results files
49+
# primary_keytype = "", # Denotes the name of the indentifier column (e.g. ENSEMBL, TAIR)
50+
# normalization = "", # ENUM like, supports "ERCC-groupB" and "default"
51+
# normalized_counts_output_prefix = "", # Output prefix for normalized counts files
52+
# dge_output_prefix = "" # Output prefix for DGE files
53+
# )
54+
# END:ONLY_DPPD
4955
```
5056

5157
### 2. Load Study Metadata
52-
```{r runsheet-to-compare_df, include=(!is.null(params$runsheet_path)), eval=(!is.null(params$runsheet_path))}
58+
```{r runsheet-to-compare_df}
59+
#' Calculate the square of a number
60+
#'
61+
#' This function takes a numeric input and returns its square.
62+
#'
63+
#' @param x Numeric value to be squared.
64+
#'
65+
#' @return The square of the input value.
66+
#'
67+
#' @examples
68+
#' square(2)
69+
#' # Output: 4
70+
#'
71+
#' square(-3)
72+
#' # Output: 9
73+
#'
5374
compare_csv_from_runsheet <- function(runsheet_path) {
5475
df = read.csv(runsheet_path)
5576
# get only Factor Value columns
@@ -64,25 +85,6 @@ compare_csv <- compare_csv_from_runsheet(params$runsheet_path)
6485
#DT::datatable(compare_csv, caption = "Data Frame of parsed runsheet filtered to required columns")
6586
```
6687

67-
```{r isa-to-compare_df, include=(!is.null(params$isa_path)), eval=(!is.null(params$isa_path))}
68-
# TODO: Remove this route, ISA zip support will be dropped as of DPPD-7101-F
69-
library(Risa)
70-
71-
compare_csv_from_isa_archive <- function(isa_path) {
72-
td = tempdir()
73-
unzip(isa_path, exdir = td)
74-
isa <- Risa::readISAtab(path = td)
75-
n = as.numeric(which([email protected] == "RNA Sequencing (RNA-Seq)"))
76-
isa_tabs <- [email protected][[n]]@assay.file
77-
factors <- as.data.frame(isa@factors[[1]], stringsAsFactors = FALSE)
78-
colnames(factors) <- paste("factor",1:dim(factors)[2], sep = "_")
79-
return(data.frame(sample_id = isa_tabs$`Sample Name`, factors))
80-
}
81-
# Loading metadata from isa archive
82-
compare_csv <- compare_csv_from_isa_archive(params$isa_path)
83-
#DT::datatable(compare_csv, caption = "Data Frame of parsed isa archive filtered to required metadata")
84-
```
85-
8688
```{r compare_df-to-study_df}
8789
study <- as.data.frame(compare_csv[,2:dim(compare_csv)[2]])
8890
colnames(study) <- colnames(compare_csv)[2:dim(compare_csv)[2]]
@@ -130,8 +132,7 @@ files <- list.files(
130132
131133
## Reorder the *genes.results files to match the ordering of the ISA samples
132134
133-
# Replace spaces in sample names from ISA with "_", consistent with runsheet generation
134-
samples = str_replace_all(rownames(study), " ", "_")
135+
samples = rownames(study)
135136
reordering <- sapply(samples, function(x)grep(paste0("Rsem_gene_counts/", x,".genes.results$"), files, value=FALSE))
136137
files <- files[reordering]
137138
names(files) <- samples
@@ -335,12 +336,12 @@ output_table_1$LRT.p.value <- res_1_lrt@listData$padj
335336
```{r wald-test-iteration}
336337
## Iterate through Wald Tests to generate pairwise comparisons of all groups
337338
for (i in 1:dim(contrasts)[2]){
338-
res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
339-
res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
340-
colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
341-
output_table_1<-cbind(output_table_1,res_1)
342-
rm(res_1)
339+
res_1 <- results(dds_1, contrast=c("condition",contrasts[1,i],contrasts[2,i]))
340+
res_1 <- as.data.frame(res_1@listData)[,c(2,4,5,6)]
341+
colnames(res_1)<-c(paste0("Log2fc_",colnames(contrasts)[i]),paste0("Stat_",colnames(contrasts)[i]),paste0("P.value_",colnames(contrasts)[i]),paste0("Adj.p.value_",colnames(contrasts)[i]))
342+
output_table_1<-cbind(output_table_1,res_1)
343343
}
344+
344345
```
345346

346347
```{r}
@@ -385,6 +386,16 @@ write.csv(
385386
sampleTable,
386387
file = paste0(params$dge_output_prefix, "SampleTable.csv")
387388
)
389+
390+
# Create summary file based on output_table_1
391+
output <- capture.output(summary(output_table_1))
392+
393+
# Open file connection
394+
conn <- file(paste0(params$dge_output_prefix, "summary.txt"), "w")
395+
396+
# Write the captured output to the file
397+
writeLines(output, conn)
398+
388399
# DT::datatable(head(output_table_1, n = 30),
389400
# caption = "First 30 rows of differential gene expression table",
390401
# extensions = "FixedColumns",
@@ -408,4 +419,4 @@ cat(capture.output(sessionInfo()),
408419
file = version_output_fn,
409420
append = TRUE,
410421
sep = "\n")
411-
```
422+
```

RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/bin/dge_annotation_R_scripts/dge_annotation_workflow.R

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@ library("here")
44
library("cli")
55

66
parser <- OptionParser()
7-
parser <- add_option(parser, c("-v", "--verbose"),
8-
action = "store_true",
9-
default = FALSE, help = "Print extra output [default]"
10-
)
117
parser <- add_option(parser, c("--skip_perform_dge"),
128
action = "store_true", default = FALSE,
139
help = "Skips running the DGE, this can be used when the output from the DGE already exist",
@@ -43,9 +39,6 @@ parser <- add_option(parser, c("--DEBUG_MODE_ADD_DUMMY_COUNTS"),
4339
default = FALSE, action = "store_true",
4440
help = "Replaces all gene counts with random values from 0 to 5000",
4541
)
46-
parser <- add_option(parser, c("--isa_path"),
47-
help = "ISA Archive path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
48-
)
4942
parser <- add_option(parser, c("--runsheet_path"),
5043
help = "runsheet csv path, one of two allowed metadata inputs, exactly one metadata input must be supplied",
5144
)
@@ -69,14 +62,11 @@ if (!args$skip_perform_dge) {
6962
cli_alert_warning("Running Perform_DGE.Rmd")
7063
rmarkdown::render(here("dge_annotation_R_scripts", "Perform_DGE.Rmd"),
7164
output_dir = args$work_dir,
72-
quiet = !args$verbose,
7365
params = list(
7466
work_dir = args$work_dir,
75-
verbose = args$verbose,
7667
input_gene_results_dir = args$input_gene_results_dir,
7768
primary_keytype = args$primary_keytype,
7869
runsheet_path = args$runsheet_path,
79-
isa_path = args$isa_path,
8070
normalization = args$normalization,
8171
dge_output_prefix = args$dge_output_prefix,
8272
normalized_counts_output_prefix = args$normalized_counts_output_prefix,
@@ -93,7 +83,6 @@ if (!args$skip_gene_annotation) {
9383
cli_alert_warning("Running Add_Gene_Annotations.Rmd")
9484
rmarkdown::render(here("dge_annotation_R_scripts", "Add_Gene_Annotations.Rmd"),
9585
output_dir = args$work_dir,
96-
quiet = !args$verbose,
9786
params = list(
9887
input_table_path = paste0(args$dge_output_prefix, "differential_expression_no_annotations.csv"),
9988
work_dir = args$work_dir,
@@ -111,7 +100,6 @@ if (!args$skip_gene_annotation) {
111100
cli_alert_warning("Running Extend_DGE_Table.Rmd")
112101
rmarkdown::render(here("dge_annotation_R_scripts", "Extend_DGE_Table.Rmd"),
113102
output_dir = args$work_dir,
114-
quiet = !args$verbose,
115103
params = list(
116104
input_table_path = paste0(args$dge_output_prefix, "differential_expression.csv"),
117105
work_dir = args$work_dir,
@@ -128,7 +116,6 @@ if (!args$skip_gene_annotation) {
128116
cli_alert_warning("Running Generate_PCA_Table.Rmd")
129117
rmarkdown::render(here("dge_annotation_R_scripts", "Generate_PCA_Table.Rmd"),
130118
output_dir = args$work_dir,
131-
quiet = !args$verbose,
132119
params = list(
133120
input_table_path = paste0(args$normalized_counts_output_prefix, "Normalized_Counts.csv"),
134121
work_dir = args$work_dir,

RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/main.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ include { BUILD_STAR;
2222
CONCAT_ERCC;
2323
QUANTIFY_STAR_GENES;
2424
QUANTIFY_RSEM_GENES } from './modules/genome.nf'
25-
include { DGE_BY_DESEQ2 } from './modules/dge.nf'
25+
include { DGE_BY_DESEQ2 } from './modules/DGE_BY_DESEQ2'
2626
include { VV_RAW_READS;
2727
VV_TRIMMED_READS;
2828
VV_STAR_ALIGNMENTS;

RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/dge.nf renamed to RNAseq/Workflow_Documentation/NF_RCP-F/workflow_code/modules/DGE_BY_DESEQ2/main.nf

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ process DGE_BY_DESEQ2 {
2828
path("dge_output_ercc/visualization_output_table_ERCCnorm.csv"),
2929
path("dge_output_ercc/visualization_PCA_table_ERCCnorm.csv"), optional: true, emit: dge_ercc
3030

31+
path("dge_output/summary.txt"), emit: summary
32+
path("dge_output_ercc/ERCCnorm_summary.txt"), optional: true, emit: summary_ercc
33+
3134
path("versions.txt"), emit: version
3235

3336
script:
@@ -45,8 +48,7 @@ process DGE_BY_DESEQ2 {
4548
--dge_output_prefix "dge_output/" \\
4649
--annotation_file_path ${annotation_file} \\
4750
--extended_table_output_prefix "dge_output/"\\
48-
--extended_table_output_suffix ".csv" \\
49-
--verbose
51+
--extended_table_output_suffix ".csv"
5052
5153
if ${ meta.has_ercc ? 'true' : 'false'}
5254
then
@@ -60,8 +62,8 @@ process DGE_BY_DESEQ2 {
6062
--dge_output_prefix "dge_output_ercc/ERCCnorm_" \\
6163
--annotation_file_path ${annotation_file} \\
6264
--extended_table_output_prefix "dge_output_ercc/"\\
63-
--extended_table_output_suffix "_ERCCnorm.csv" \\
64-
--verbose
65+
--extended_table_output_suffix "_ERCCnorm.csv"
6566
fi
67+
# bump
6668
"""
67-
}
69+
}
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
nextflow_process {
2+
3+
name "Test Process DGE_BY_DESEQ2"
4+
script "modules/DGE_BY_DESEQ2/main.nf"
5+
process "DGE_BY_DESEQ2"
6+
7+
test("GLDS-194") {
8+
tag 'DGE_BY_DESEQ2'
9+
10+
when {
11+
params {
12+
// define parameters here. Example:
13+
use_dummy_gene_counts = true
14+
}
15+
process {
16+
"""
17+
// define inputs of the process here. Example:
18+
input[0] = file("test-datasets/testdata/GLDS-194/Metadata/GLDS-194_bulkRNASeq_v1_runsheet.csv")
19+
input[1] = file("test-datasets/testdata/GLDS-194/03-RSEM_Counts/*.genes.results")
20+
input[2] = [ primary_keytype:'ENSEMBL', has_ercc:true ]
21+
input[3] = file("https://figshare.com/ndownloader/files/36597114")
22+
input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
23+
"""
24+
}
25+
}
26+
27+
then {
28+
assert process.success
29+
assert snapshot(
30+
process.out.summary,
31+
process.out.norm_counts,
32+
process.out.summary_ercc,
33+
process.out.norm_counts_ercc,
34+
process.out.version
35+
).match()
36+
}
37+
38+
}
39+
40+
test("GLDS-321:55_.ISSUE") {
41+
tag 'DGE_BY_DESEQ2'
42+
43+
when {
44+
params {
45+
// define parameters here. Example:
46+
use_dummy_gene_counts = true
47+
}
48+
process {
49+
"""
50+
// define inputs of the process here. Example:
51+
input[0] = file("test-datasets/testdata/GLDS-321/Metadata/GLDS-321_bulkRNASeq_v1_runsheet.csv")
52+
input[1] = file("test-datasets/testdata/GLDS-321/03-RSEM_Counts/*.genes.results")
53+
input[2] = [ primary_keytype:'TAIR', has_ercc:false ]
54+
input[3] = file("https://figshare.com/ndownloader/files/36597132")
55+
input[4] = file("${ baseDir }/bin/dge_annotation_R_scripts.zip")
56+
"""
57+
}
58+
}
59+
60+
then {
61+
assert process.success
62+
assert snapshot(
63+
process.out.summary,
64+
process.out.norm_counts,
65+
process.out.version,
66+
).match()
67+
}
68+
69+
}
70+
71+
}

0 commit comments

Comments
 (0)