nf-core
diff --git a/‎.prettierignore‎
Lines changed: 1 addition & 1 deletion b/‎.prettierignore‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎CHANGELOG.md‎
Lines changed: 14 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 14 additions & 1 deletion
diff --git a/‎LICENSE‎
Lines changed: 1 addition & 1 deletion b/‎LICENSE‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎assets/multiqc_config.yml‎
Lines changed: 10 additions & 0 deletions b/‎assets/multiqc_config.yml‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎bin/prepare-for-rsem.py‎
Lines changed: 270 additions & 0 deletions b/‎bin/prepare-for-rsem.py‎
Lines changed: 270 additions & 0 deletions
diff --git a/‎conf/modules.config‎
Lines changed: 19 additions & 1 deletion b/‎conf/modules.config‎
Lines changed: 19 additions & 1 deletion
@@ -6,4 +6,4 @@ results/
 .DS_Store
 testing/
 testing*
-*.pyc
+*.pyc
@@ -3,14 +3,27 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [Unpublished Version / DEV]
+## [[3.8](https://github.com/nf-core/rnaseq/releases/tag/3.8)] - 2022-05-25
+
+### :warning: Major enhancements
+
+Fixed a well-hidden bug in the UMI processing mode of the pipeline when using `--with_umi --aligner star_salmon`, as reported by [Lars Roed Ingerslev](https://github.com/lars-work-sund). Paired-end BAM files were not appropriately name-sorted after `umi_tools dedup`, which ultimately resulted in incorrect reading and quantification with Salmon. If you have used previous versions of the pipeline to analyse paired-end UMI data, it will need to be reprocessed using this version of the pipeline. See [#828](https://github.com/nf-core/rnaseq/issues/828) for more context.
 
 ### Enhancements & fixes
 
+- [[#824](https://github.com/nf-core/rnaseq/issues/824)] - Add explicit docs for usage of featureCounts in the pipeline
+- [[#825](https://github.com/nf-core/rnaseq/issues/825)] - Pipeline fails due to trimming related removal of all reads from a sample
+- [[#827](https://github.com/nf-core/rnaseq/issues/827)] - Control generation of --output-stats when running umi-tools dedup
+- [[#828](https://github.com/nf-core/rnaseq/issues/828)] - Filter BAM output of UMI-tools dedup before passing to Salmon quant
 - Updated pipeline template to [nf-core/tools 2.4.1](https://github.com/nf-core/tools/releases/tag/2.4.1)
 
 ### Parameters
 
+| Old parameter | New parameter            |
+| ------------- | ------------------------ |
+|               | `--min_trimmed_reads`    |
+|               | `--umitools_dedup_stats` |
+
 ## [[3.7](https://github.com/nf-core/rnaseq/releases/tag/3.7)] - 2022-05-03
 
 ### :warning: Major enhancements
 
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) Phil Ewels, Rickard Hammarén
+Copyright (c) Harshil Patel, Phil Ewels, Rickard Hammarén
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -28,6 +28,7 @@ run_modules:
 
 # Order of modules
 top_modules:
+  - "fail_trimmed_samples"
   - "fail_mapped_samples"
   - "fail_strand_check"
   - "star_rsem_deseq2_pca"
@@ -140,6 +141,15 @@ sp:
 
 # See https://github.com/ewels/MultiQC_TestData/blob/master/data/custom_content/with_config/table_headerconfig/multiqc_config.yaml
 custom_data:
+  fail_trimmed_samples:
+    section_name: "WARNING: Fail Trimming Check"
+    description: "List of samples that failed the minimum trimmed reads threshold specified via the '--min_trimmed_reads' parameter, and hence were ignored for the downstream processing steps."
+    plot_type: "table"
+    pconfig:
+      id: "fail_trimmed_samples_table"
+      table_title: "Samples failed trimming threshold"
+      namespace: "Samples failed trimming threshold"
+      format: "{:.0f}"
   fail_mapped_samples:
     section_name: "WARNING: Fail Alignment Check"
     description: "List of samples that failed the STAR minimum mapped reads threshold specified via the '--min_mapped_reads' parameter, and hence were ignored for the downstream processing steps."
 
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+
+'''
+==============================================================================
+Credits
+==============================================================================
+
+This script is a clone of the "prepare-for-rsem.py" script written by
+Ian Sudbury, Tom Smith and other contributors to the UMI-tools package:
+https://github.com/CGATOxford/UMI-tools
+
+It has been included here to address problems encountered with 
+Salmon quant and RSEM as discussed in the issue below:
+https://github.com/CGATOxford/UMI-tools/issues/465
+
+When the "umi_tools prepare-for-rsem" command becomes available in an official
+UMI-tools release this script will be replaced and deprecated.
+
+Commit:
+https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py
+
+==============================================================================
+prepare_for_rsem - make the output from dedup or group compatible with RSEM
+===============================================================================
+The SAM format specification states that the mnext and mpos fields should point
+to the primary alignment of a read's mate. However, not all aligners adhere to
+this standard. In addition, the RSEM software requires that the mate of a read1
+appears directly after it in its input BAM. This requires that there is exactly
+one read1 alignment for every read2 and vice versa.
+
+In general (except in a few edge cases) UMI-tools outputs only the read2 that
+corresponds to the read specified in the mnext and mpos positions of a selected
+read1, and only outputs this read once, even if multiple read1s point to it.
+This makes UMI-tools outputs incompatible with RSEM. This script takes the output
+from dedup or group and ensures that each read1 has exactly one read2 (and vice
+versa), that read2 always appears directly after read1, and that pairs point to
+each other (note this is technically not valid SAM format). It also copies any
+specified tags from read1 to read2 if they are present (by default, UG and BX,
+the unique group and correct UMI tags added by _group_).
+
+Input must be name sorted.
+
+
+https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE
+
+'''
+
+from umi_tools import Utilities as U
+from collections import defaultdict, Counter
+import pysam
+import sys
+
+# Usage text handed to the option parser. The script is invoked directly as
+# prepare-for-rsem.py, so the usage line names the script itself.
+usage = '''
+prepare_for_rsem - make output from dedup or group compatible with RSEM
+
+Usage: prepare-for-rsem.py [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
+
+       note: If --stdout is omitted, output is written to standard out. To
+             generate a valid BAM file on standard out, please
+             redirect log with --log=LOGFILE or --log2stderr '''
+
+def chunk_bam(bamfile):
+    '''Group consecutive reads that share a query name.
+
+    Takes an iterable of reads (anything exposing a ``query_name``
+    attribute, such as the entries of a pysam.AlignmentFile) and yields one
+    list per run of consecutive reads with the same name. Reads with the
+    same name that are not adjacent end up in separate lists, so the input
+    is expected to be name sorted.
+
+    Nothing is yielded for an empty input, so callers never receive an
+    empty list.
+    '''
+
+    last_query_name = None
+    output_buffer = list()
+
+    for read in bamfile:
+
+        # a change of name closes the current group
+        if last_query_name is not None and last_query_name != read.query_name:
+            yield output_buffer
+            output_buffer = list()
+
+        last_query_name = read.query_name
+        output_buffer.append(read)
+
+    # flush the final group; skip it when the input had no reads at all,
+    # otherwise the caller would be handed an empty list
+    if output_buffer:
+        yield output_buffer
+
+def copy_tags(tags, read1, read2):
+    '''Transfer the listed tags from read1 onto read2 and return read2.
+
+    Each tag is fetched from read1 together with its value type and set on
+    read2 with that same value and type. A KeyError raised while handling a
+    tag (e.g. read1 does not carry it) simply skips that tag.
+    '''
+
+    for tag_name in tags:
+
+        try:
+            fetched = read1.get_tag(tag_name, with_value_type=True)
+            read2.set_tag(tag_name, value=fetched[0], value_type=fetched[1])
+        except KeyError:
+            continue
+
+    return read2
+
+def pick_mate(read, template_dict, mate_key):
+    '''Choose a mate for read from the reads stored under mate_key.
+
+    The candidates are the reads of the other end (read2 for a read1 and
+    vice versa) stored in template_dict under mate_key. A candidate whose
+    next_reference_name / next_reference_start point back at read is
+    preferred; if several do, the last one in the list is returned. If none
+    points back, the first candidate is returned. Returns None when there
+    are no candidates at all.
+    '''
+
+    # reads of the opposite end stored at the requested key
+    candidates = template_dict[not read.is_read1][mate_key]
+
+    # candidates that name the current read as their own mate
+    reciprocal = [candidate for candidate in candidates
+                  if candidate.next_reference_name == read.reference_name
+                  and candidate.next_reference_start == read.pos]
+
+    if reciprocal:
+        return reciprocal[-1]
+
+    # nothing points back at us: fall back to any alignment at that position
+    # note: this happens when UMI-tools outputs the wrong read as something's pair.
+    if candidates:
+        return candidates[0]
+
+    return None
+
+def main(argv=None):
+    '''Command line entry point.
+
+    Reads SAM/BAM records from --stdin (standard input by default), pairs
+    every alignment with a mate taken from the same template, and writes
+    each pair (read1 immediately followed by read2) to --stdout. The tags
+    listed in --tags are copied from read1 to read2. Alignments for which
+    no mate is found, or that are flagged as neither read1 nor read2, are
+    skipped with a warning and counted in the final log line.
+
+    argv: argument list to parse; defaults to sys.argv.
+    '''
+
+    if argv is None:
+        argv = sys.argv
+
+    # setup command line parser
+    parser = U.OptionParser(version="%prog version: $Id$",
+                            usage=usage,
+                            description=globals()["__doc__"])
+    group = U.OptionGroup(parser, "RSEM preparation specific options")
+
+    group.add_option("--tags", dest="tags", type="string",
+                     default="UG,BX",
+                     help="Comma-seperated list of tags to transfer from read1 to read2")
+    group.add_option("--sam", dest="sam", action="store_true",
+                     default=False,
+                     help="input and output SAM rather than BAM")
+
+    parser.add_option_group(group)
+
+    # add common options (-h/--help, ...) and parse command line
+    (options, args) = U.Start(parser, argv=argv,
+                              add_group_dedup_options=False,
+                              add_umi_grouping_options=False,
+                              add_sam_options=False)
+
+    skipped_stats = Counter()
+
+    # pysam is given a file name ("-" for the standard stream), so an
+    # already-open options.stdin handle is closed before pysam opens the file
+    if options.stdin != sys.stdin:
+        in_name = options.stdin.name
+        options.stdin.close()
+    else:
+        in_name = "-"
+
+    # "b" selects BAM for both input and output, "" selects SAM
+    if options.sam:
+        mode = ""
+    else:
+        mode = "b"
+
+    inbam = pysam.AlignmentFile(in_name, "r"+mode)
+
+    if options.stdout != sys.stdout:
+        out_name = options.stdout.name
+        options.stdout.close()
+    else:
+        out_name = "-"
+
+    outbam = pysam.AlignmentFile(out_name, "w" + mode, template = inbam)
+
+    options.tags = options.tags.split(",")
+
+    for template in chunk_bam(inbam):
+
+        # an input without any reads can produce an empty chunk; there is
+        # nothing to pair in that case
+        if not template:
+            continue
+
+        assert len(set(r.query_name for r in template)) == 1
+
+        # index the alignments of this template by end (True: read1,
+        # False: everything else) and by (reference, position, is primary)
+        current_template = {True: defaultdict(list),
+                            False: defaultdict(list)}
+
+        for read in template:
+            key = (read.reference_name, read.pos, not read.is_secondary)
+            current_template[read.is_read1][key].append(read)
+
+        # string representations of the pairs already written for this template
+        output = set()
+
+        for read in template:
+
+            mate = None
+
+            # Look for the mate at the position this read points to. The keys
+            # built above store "is primary" (not is_secondary) as their third
+            # element, whereas the lookups below pass is_secondary first and
+            # its negation second.
+            # NOTE(review): as written, the first lookup therefore matches a
+            # mate of the opposite primary/secondary status and the second a
+            # mate of the same status, although the upstream comments describe
+            # the reverse order -- confirm the intended preference before
+            # changing it.
+            mate_key = (read.next_reference_name, read.next_reference_start,
+                        read.is_secondary)
+            mate = pick_mate(read, current_template, mate_key)
+
+            # If none was found then try the other primary/secondary status
+            if mate is None:
+                mate_key = (read.next_reference_name, read.next_reference_start,
+                            not read.is_secondary)
+                mate = pick_mate(read, current_template, mate_key)
+
+            # If we still don't have a mate, then there can't be one
+            if mate is None:
+                skipped_stats["no_mate"] += 1
+                U.warn("Alignment {} has no mate -- skipped".format(
+                    "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
+                ))
+                continue
+
+            # Because we might want to make changes to the read, but not have
+            # those changes reflected if we need the read again, we copy the
+            # read (and the mate) via a dict round trip.
+            read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header)
+            mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header)
+
+            # Make it so that if our read is secondary, the mate is also
+            # secondary. We don't make the mate primary if the read is primary
+            # because we would otherwise end up with multiple primary
+            # alignments.
+            if read.is_secondary:
+                mate.is_secondary = True
+
+            # In a situation where there is already one mate for each read,
+            # we will come across each pair twice - once when we scan read1
+            # and once when we scan read2. Thus we need to make sure we don't
+            # output something already output.
+            if read.is_read1:
+
+                mate = copy_tags(options.tags, read, mate)
+                output_key = str(read) + str(mate)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(read)
+                    outbam.write(mate)
+                    skipped_stats["pairs_output"] += 1
+
+            elif read.is_read2:
+
+                # tags always flow from read1 (here: the mate) to read2
+                read = copy_tags(options.tags, mate, read)
+                output_key = str(mate) + str(read)
+
+                if output_key not in output:
+                    output.add(output_key)
+                    outbam.write(mate)
+                    outbam.write(read)
+                    skipped_stats["pairs_output"] += 1
+
+            else:
+                skipped_stats["skipped_not_read12"] += 1
+                U.warn("Alignment {} is neither read1 nor read2 -- skipped".format(
+                    "\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
+                ))
+                continue
+
+    # close the files we opened by name; the standard streams are left alone
+    if not in_name == "-":
+        inbam.close()
+
+    if not out_name == "-":
+        outbam.close()
+
+    U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
+           " Pairs skipped - not read1 or 2: {}".format(
+               skipped_stats["pairs_output"],
+               skipped_stats["no_mate"],
+               skipped_stats["skipped_not_read12"]))
+    U.Stop()
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv))
+
+
+
@@ -204,6 +204,13 @@ if (!params.skip_trimming) {
                 ]
             ]
         }
+
+        withName: 'MULTIQC_TSV_FAIL_TRIMMED' {
+            publishDir = [
+                path: { "${params.outdir}/multiqc" },
+                enabled: false
+            ]
+        }
     }
 }
 
@@ -362,6 +369,7 @@ if (!params.skip_alignment) {
     if (params.with_umi && ['star_salmon','hisat2'].contains(params.aligner)) {
         process {
             withName: '.*:DEDUP_UMI_UMITOOLS_GENOME:UMITOOLS_DEDUP' {
+                ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }
                 ext.prefix = { "${meta.id}.umi_dedup.sorted" }
                 publishDir = [
                     [
@@ -560,11 +568,20 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
                 ext.args   = '-n'
                 ext.prefix = { "${meta.id}.umi_dedup.transcriptome" }
                 publishDir = [
-                    path: { "${params.outdir}/samtools" },
+                    path: { "${params.outdir}/${params.aligner}" },
                     enabled: false
                 ]
             }
 
+            withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORRSEM' {
+                ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" }
+                publishDir = [
+                    path: { "${params.outdir}/${params.aligner}/umitools/log" },
+                    mode: params.publish_dir_mode,
+                    pattern: '*.log'
+                ]
+            }
+
             withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_SAMTOOLS:SAMTOOLS_SORT' {
                 ext.prefix = { "${meta.id}.transcriptome.sorted" }
                 publishDir = [
@@ -588,6 +605,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
             }
 
             withName: '.*:DEDUP_UMI_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP' {
+                ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }
                 ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" }
                 publishDir = [
                     path: { "${params.outdir}/${params.aligner}/umitools" },
Original file line number	Diff line number	Diff line change
`@@ -204,6 +204,13 @@ if (!params.skip_trimming) {`
`204`	`204`	`]`
`205`	`205`	`]`
`206`	`206`	`}`
	`207`	`+`
	`208`	`+ withName: 'MULTIQC_TSV_FAIL_TRIMMED' {`
	`209`	`+ publishDir = [`
	`210`	`+ path: { "${params.outdir}/multiqc" },`
	`211`	`+ enabled: false`
	`212`	`+ ]`
	`213`	`+ }`
`207`	`214`	`}`
`208`	`215`	`}`
`209`	`216`
`@@ -362,6 +369,7 @@ if (!params.skip_alignment) {`
`362`	`369`	`if (params.with_umi && ['star_salmon','hisat2'].contains(params.aligner)) {`
`363`	`370`	`process {`
`364`	`371`	`withName: '.*:DEDUP_UMI_UMITOOLS_GENOME:UMITOOLS_DEDUP' {`
	`372`	`+ ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }`
`365`	`373`	`ext.prefix = { "${meta.id}.umi_dedup.sorted" }`
`366`	`374`	`publishDir = [`
`367`	`375`	`[`
`@@ -560,11 +568,20 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {`
`560`	`568`	`ext.args = '-n'`
`561`	`569`	`ext.prefix = { "${meta.id}.umi_dedup.transcriptome" }`
`562`	`570`	`publishDir = [`
`563`		`- path: { "${params.outdir}/samtools" },`
	`571`	`+ path: { "${params.outdir}/${params.aligner}" },`
`564`	`572`	`enabled: false`
`565`	`573`	`]`
`566`	`574`	`}`
`567`	`575`
	`576`	`+ withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORRSEM' {`
	`577`	`+ ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" }`
	`578`	`+ publishDir = [`
	`579`	`+ path: { "${params.outdir}/${params.aligner}/umitools/log" },`
	`580`	`+ mode: params.publish_dir_mode,`
	`581`	`+ pattern: '*.log'`
	`582`	`+ ]`
	`583`	`+ }`
	`584`	`+`
`568`	`585`	`withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_SAMTOOLS:SAMTOOLS_SORT' {`
`569`	`586`	`ext.prefix = { "${meta.id}.transcriptome.sorted" }`
`570`	`587`	`publishDir = [`
`@@ -588,6 +605,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {`
`588`	`605`	`}`
`589`	`606`
`590`	`607`	`withName: '.*:DEDUP_UMI_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP' {`
	`608`	`+ ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }`
`591`	`609`	`ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" }`
`592`	`610`	`publishDir = [`
`593`	`611`	`path: { "${params.outdir}/${params.aligner}/umitools" },`