Fix #825

drpatelh · drpatelh · commit 96f69883a01d · 2022-05-23T18:15:36.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,16 +3,25 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unpublished Version / DEV]
+## [[3.8](https://github.com/nf-core/rnaseq/releases/tag/3.8)] - 2022-05-25
+
+### :warning: Major enhancements
+
+Fixed a well-hidden bug in the UMI processing mode of the pipeline when using `--with_umi --aligner star_salmon`, as reported by [Lars Roed Ingerslev](https://github.com/lars-work-sund). Paired-end BAM files were not appropriately name-sorted after `umi_tools dedup`, which ultimately resulted in incorrect reading and quantification with Salmon. If you have used previous versions of the pipeline to analyse paired-end UMI data, it will need to be reprocessed using this version of the pipeline. See [#828](https://github.com/nf-core/rnaseq/issues/828) for more context.
 
 ### Enhancements & fixes
 
 - [[#824](https://github.com/nf-core/rnaseq/issues/824)] - Add explicit docs for usage of featureCounts in the pipeline
+- [[#825](https://github.com/nf-core/rnaseq/issues/825)] - Pipeline fails due to trimming-related removal of all reads from a sample
 - [[#828](https://github.com/nf-core/rnaseq/issues/828)] - Filter BAM output of UMI-tools dedup before passing to Salmon quant
 - Updated pipeline template to [nf-core/tools 2.4.1](https://github.com/nf-core/tools/releases/tag/2.4.1)
 
 ### Parameters
 
+| Old parameter | New parameter         |
+| ------------- | --------------------- |
+|               | `--min_trimmed_reads` |
+
 ## [[3.7](https://github.com/nf-core/rnaseq/releases/tag/3.7)] - 2022-05-03
 
 ### :warning: Major enhancements
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
@@ -28,6 +28,7 @@ run_modules:
 
 # Order of modules
 top_modules:
+  - "fail_trimmed_samples"
   - "fail_mapped_samples"
   - "fail_strand_check"
   - "star_rsem_deseq2_pca"
@@ -140,6 +141,15 @@ sp:
 
 # See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml
 custom_data:
+  # Custom-content table of samples that did not pass the '--min_trimmed_reads'
+  # threshold. The key 'fail_trimmed_samples' is also listed under 'top_modules'
+  # in this file.
+  fail_trimmed_samples:
+    section_name: "WARNING: Fail Trimming Check"
+    description: "List of samples that failed the minimum trimmed reads threshold specified via the '--min_trimmed_reads' parameter, and hence were ignored for the downstream processing steps."
+    plot_type: "table"
+    pconfig:
+      id: "fail_trimmed_samples_table"
+      table_title: "Samples failed trimming threshold"
+      namespace: "Samples failed trimming threshold"
+      format: "{:.0f}"
   fail_mapped_samples:
     section_name: "WARNING: Fail Alignment Check"
     description: "List of samples that failed the STAR minimum mapped reads threshold specified via the '--min_mapped_reads' parameter, and hence were ignored for the downstream processing steps."
diff --git a/conf/modules.config b/conf/modules.config
@@ -204,6 +204,13 @@ if (!params.skip_trimming) {
                 ]
             ]
         }
+
+        // Publishing is switched off for this process via 'enabled: false';
+        // the 'path' entry is declared alongside it but is not used while disabled.
+        withName: 'MULTIQC_TSV_FAIL_TRIMMED' {
+            publishDir = [
+                path: { "${params.outdir}/multiqc" },
+                enabled: false
+            ]
+        }
     }
 }
 
diff --git a/lib/WorkflowRnaseq.groovy b/lib/WorkflowRnaseq.groovy
@@ -152,6 +152,23 @@ class WorkflowRnaseq {
         }
     }
 
+    //
+    // Function that parses TrimGalore log output file to get total number of reads after trimming
+    //
+    // Scans every line of 'log_file' for the "sequences processed in total" count and for the
+    // single-end or paired-end "shorter than the length cutoff" count, then returns
+    // total - filtered truncated to an Integer. A count whose line never appears stays at 0;
+    // if a line appears more than once, the last occurrence wins.
+    //
+    public static Integer getTrimGaloreReadsAfterFiltering(log_file) {
+        // Counts are parsed as Double rather than Float: a Float is only exact up to 2^24
+        // (~16.7M), so larger read counts would be silently rounded before the subtraction.
+        def total_reads = 0.0d
+        def filtered_reads = 0.0d
+        log_file.eachLine { line ->
+            def total_reads_matcher = line =~ /([\d\.]+)\ssequences processed in total/
+            def se_filtered_reads_matcher = line =~ /shorter than the length cutoff of\s[\d\.]+\sbp:\s([\d\.]+)/
+            def pe_filtered_reads_matcher = line =~ /shorter than the length cutoff\s\([\d\.]+\sbp\):\s([\d\.]+)/
+            if (total_reads_matcher) total_reads = total_reads_matcher[0][1].toDouble()
+            if (se_filtered_reads_matcher) filtered_reads = se_filtered_reads_matcher[0][1].toDouble()
+            if (pe_filtered_reads_matcher) filtered_reads = pe_filtered_reads_matcher[0][1].toDouble()
+        }
+        // Explicit conversion to the declared Integer return type (truncates any fraction)
+        return (total_reads - filtered_reads).intValue()
+    }
+
     //
     // Function that parses and returns the alignment rate from the STAR log output
     //
diff --git a/modules/local/multiqc.nf b/modules/local/multiqc.nf
@@ -11,6 +11,7 @@ process MULTIQC {
     path multiqc_custom_config
     path software_versions
     path workflow_summary
+    path fail_trimming_summary
     path fail_mapping_summary
     path fail_strand_check
     path ('fastqc/*')
diff --git a/nextflow.config b/nextflow.config
@@ -33,6 +33,7 @@ params {
     save_umi_intermeds         = false
 
     // Trimming
+    min_trimmed_reads          = 10000
     clip_r1                    = null
     clip_r2                    = null
     three_prime_clip_r1        = null
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -10,7 +10,9 @@
             "type": "object",
             "fa_icon": "fas fa-terminal",
             "description": "Define where the pipeline should find input data and save output data.",
-            "required": ["outdir"],
+            "required": [
+                "outdir"
+            ],
             "properties": {
                 "input": {
                     "type": "string",
@@ -329,6 +331,12 @@
                     "help_text": "This enables the option Cutadapt `--nextseq-trim=3'CUTOFF` option via Trim Galore, which will set a quality cutoff (that is normally given with -q instead), but qualities of G bases are ignored. This trimming is in common for the NextSeq- and NovaSeq-platforms, where basecalls without any signal are called as high-quality G bases.",
                     "fa_icon": "fas fa-cut"
                 },
+                "min_trimmed_reads": {
+                    "type": "integer",
+                    "default": 10000,
+                    "fa_icon": "fas fa-hand-paper",
+                    "description": "Minimum number of trimmed reads below which samples are removed from further processing. Some downstream steps in the pipeline will fail if this threshold is too low."
+                },
                 "skip_trimming": {
                     "type": "boolean",
                     "description": "Skip the adapter trimming step.",
@@ -354,13 +362,19 @@
                     "default": "star_salmon",
                     "description": "Specifies the alignment algorithm to use - available options are 'star_salmon', 'star_rsem' and 'hisat2'.",
                     "fa_icon": "fas fa-map-signs",
-                    "enum": ["star_salmon", "star_rsem", "hisat2"]
+                    "enum": [
+                        "star_salmon",
+                        "star_rsem",
+                        "hisat2"
+                    ]
                 },
                 "pseudo_aligner": {
                     "type": "string",
                     "description": "Specifies the pseudo aligner to use - available options are 'salmon'. Runs in addition to '--aligner'.",
                     "fa_icon": "fas fa-hamburger",
-                    "enum": ["salmon"]
+                    "enum": [
+                        "salmon"
+                    ]
                 },
                 "bam_csi_index": {
                     "type": "boolean",
@@ -596,7 +610,14 @@
                     "description": "Method used to save pipeline results to output directory.",
                     "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.",
                     "fa_icon": "fas fa-copy",
-                    "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"],
+                    "enum": [
+                        "symlink",
+                        "rellink",
+                        "link",
+                        "copy",
+                        "copyNoFollow",
+                        "move"
+                    ],
                     "hidden": true
                 },
                 "email_on_fail": {
diff --git a/workflows/rnaseq.nf b/workflows/rnaseq.nf
@@ -106,6 +106,7 @@ include { DUPRADAR                           } from '../modules/local/dupradar'
 include { MULTIQC                            } from '../modules/local/multiqc'
 include { MULTIQC_CUSTOM_BIOTYPE             } from '../modules/local/multiqc_custom_biotype'
 include { MULTIQC_TSV_FROM_LIST as MULTIQC_TSV_FAIL_MAPPED  } from '../modules/local/multiqc_tsv_from_list'
+include { MULTIQC_TSV_FROM_LIST as MULTIQC_TSV_FAIL_TRIMMED } from '../modules/local/multiqc_tsv_from_list'
 include { MULTIQC_TSV_FROM_LIST as MULTIQC_TSV_STRAND_CHECK } from '../modules/local/multiqc_tsv_from_list'
 
 //
@@ -194,8 +195,9 @@ workflow RNASEQ {
     .reads
     .map {
         meta, fastq ->
-            meta.id = meta.id.split('_')[0..-2].join('_')
-            [ meta, fastq ] 
+            def meta_clone = meta.clone()
+            meta_clone.id = meta_clone.id.split('_')[0..-2].join('_')
+            [ meta_clone, fastq ] 
     }
     .groupTuple(by: [0])
     .branch {
@@ -233,9 +235,47 @@ workflow RNASEQ {
     ch_versions = ch_versions.mix(FASTQC_UMITOOLS_TRIMGALORE.out.versions)
 
     //
-    // MODULE: Remove genome contaminant reads
+    // Filter channels to get samples that passed minimum trimmed read count
     //
+    ch_fail_trimming_multiqc = Channel.empty()
     ch_filtered_reads = FASTQC_UMITOOLS_TRIMGALORE.out.reads
+    if (!params.skip_trimming) {
+        // Pair each sample's trimmed reads with its TrimGalore log and work out how many
+        // reads are left after trimming
+        ch_filtered_reads
+            .join(FASTQC_UMITOOLS_TRIMGALORE.out.trim_log)
+            .map {
+                meta, reads, trim_log ->
+                    // When the sample is not single-end, 'trim_log' is indexed with [-1] so only
+                    // its last element is parsed (presumably the report carrying the pair-level
+                    // count -- confirm against the TrimGalore module output)
+                    def log_file  = meta.single_end ? trim_log : trim_log[-1]
+                    // Declared with 'def' so the count is local to each closure call; without it
+                    // the variable is a script-level global shared by items processed concurrently
+                    def num_reads = WorkflowRnaseq.getTrimGaloreReadsAfterFiltering(log_file)
+                    [ meta, reads, num_reads ]
+            }
+            .set { ch_num_trimmed_reads  }
+
+        // Only samples strictly above the threshold yield a [ meta, reads ] tuple for downstream steps
+        ch_num_trimmed_reads
+            .map { meta, reads, num_reads -> if (num_reads > params.min_trimmed_reads) [ meta, reads ] }
+            .set { ch_filtered_reads }
+
+        // Samples at or below the threshold yield a tab-separated "id<TAB>count" row instead
+        ch_num_trimmed_reads
+            .map {
+                meta, reads, num_reads ->
+                if (num_reads <= params.min_trimmed_reads) {
+                    return [ "$meta.id\t$num_reads" ]
+                }
+            }
+            .set { ch_num_trimmed_reads }
+
+        // Collect the failing rows into a single table that is handed to MultiQC
+        MULTIQC_TSV_FAIL_TRIMMED (
+            ch_num_trimmed_reads.collect(),
+            ["Sample", "Reads after trimming"],
+            'fail_trimmed_samples'
+        )
+        .set { ch_fail_trimming_multiqc }
+    }
+
+    //
+    // MODULE: Remove genome contaminant reads
+    //
     if (!params.skip_bbsplit) {
         BBMAP_BBSPLIT (
             ch_filtered_reads,
@@ -726,6 +766,7 @@ workflow RNASEQ {
             ch_multiqc_custom_config.collect().ifEmpty([]),
             CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect(),
             ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'),
+            ch_fail_trimming_multiqc.ifEmpty([]),
             ch_fail_mapping_multiqc.ifEmpty([]),
             ch_fail_strand_multiqc.ifEmpty([]),
             FASTQC_UMITOOLS_TRIMGALORE.out.fastqc_zip.collect{it[1]}.ifEmpty([]),

Original file line number	Diff line number	Diff line change
`@@ -204,6 +204,13 @@ if (!params.skip_trimming) {`
`204`	`204`	`]`
`205`	`205`	`]`
`206`	`206`	`}`
	`207`	`+`
	`208`	`+ withName: 'MULTIQC_TSV_FAIL_TRIMMED' {`
	`209`	`+ publishDir = [`
	`210`	`+ path: { "${params.outdir}/multiqc" },`
	`211`	`+ enabled: false`
	`212`	`+ ]`
	`213`	`+ }`
`207`	`214`	`}`
`208`	`215`	`}`
`209`	`216`