Skip to content

Commit 2e3e668

Browse files
committed
Fix #828
1 parent f0eaed4 commit 2e3e668

File tree

5 files changed

+348
-4
lines changed

5 files changed

+348
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
### Enhancements & fixes
99

10+
- [[#828](https://github.com/nf-core/rnaseq/issues/828)] - Filter BAM output of UMI-tools dedup before passing to Salmon quant
1011
- Updated pipeline template to [nf-core/tools 2.4.1](https://github.com/nf-core/tools/releases/tag/2.4.1)
1112

1213
### Parameters

bin/prepare-for-rsem.py

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
#!/usr/bin/env python3
2+
3+
'''
4+
==============================================================================
5+
Credits
6+
==============================================================================
7+
8+
This script is a clone of the "prepare-for-rsem.py" script written by
9+
Ian Sudbury, Tom Smith and other contributors to the UMI-tools package:
10+
https://github.com/CGATOxford/UMI-tools
11+
12+
It has been included here to address problems encountered with
13+
Salmon quant and RSEM as discussed in the issue below:
14+
https://github.com/CGATOxford/UMI-tools/issues/465
15+
16+
When the "umi_tools prepare-for-rsem" command becomes available in an official
17+
UMI-tools release this script will be replaced and deprecated.
18+
19+
Commit:
20+
https://github.com/CGATOxford/UMI-tools/blob/bf8608d6a172c5ca0dcf33c126b4e23429177a72/umi_tools/prepare-for-rsem.py
21+
22+
==============================================================================
23+
prepare_for_rsem - make the output from dedup or group compatible with RSEM
24+
===============================================================================
25+
The SAM format specification states that the mnext and mpos fields should point
26+
to the primary alignment of a read's mate. However, not all aligners adhere to
27+
this standard. In addition, the RSEM software requires that the mate of a read1
28+
appears directly after it in its input BAM. This requires that there is exactly
29+
one read1 alignment for every read2 and vice versa.
30+
31+
In general (except in a few edge cases) UMI tools outputs only the read2 to that
32+
corresponds to the read specified in the mnext and mpos positions of a selected
33+
read1, and only outputs this read once, even if multiple read1s point to it.
34+
This makes UMI-tools outputs incompatible with RSEM. This script takes the output
35+
from dedup or groups and ensures that each read1 has exactly one read2 (and vice
36+
versa), that read2 always appears directly after read1,and that pairs point to
37+
each other (note this is technically not valid SAM format). Copy any specified
38+
tags from read1 to read2 if they are present (by default, UG and BX, the unique
39+
group and correct UMI tags added by _group_)
40+
41+
Input must to name sorted.
42+
43+
44+
https://raw.githubusercontent.com/CGATOxford/UMI-tools/master/LICENSE
45+
46+
'''
47+
48+
from umi_tools import Utilities as U
49+
from collections import defaultdict, Counter
50+
import pysam
51+
import sys
52+
53+
usage = '''
54+
prepare_for_rsem - make output from dedup or group compatible with RSEM
55+
56+
Usage: umi_tools prepare_for_rsem [OPTIONS] [--stdin=IN_BAM] [--stdout=OUT_BAM]
57+
58+
note: If --stdout is ommited, standard out is output. To
59+
generate a valid BAM file on standard out, please
60+
redirect log with --log=LOGFILE or --log2stderr '''
61+
62+
def chunk_bam(bamfile):
63+
'''Take in a iterator of pysam.AlignmentSegment entries and yeild
64+
lists of reads that all share the same name'''
65+
66+
last_query_name = None
67+
output_buffer = list()
68+
69+
for read in bamfile:
70+
71+
if last_query_name is not None and last_query_name != read.query_name:
72+
yield(output_buffer)
73+
output_buffer = list()
74+
75+
last_query_name = read.query_name
76+
output_buffer.append(read)
77+
78+
yield (output_buffer)
79+
80+
def copy_tags(tags, read1, read2):
81+
'''Given a list of tags, copies the values of these tags from read1
82+
to read2, if the tag is set'''
83+
84+
for tag in tags:
85+
86+
try:
87+
read1_tag = read1.get_tag(tag, with_value_type=True)
88+
read2.set_tag(tag, value=read1_tag[0], value_type=read1_tag[1])
89+
except KeyError:
90+
pass
91+
92+
return(read2)
93+
94+
def pick_mate(read, template_dict, mate_key):
95+
'''Find the mate of read in the template dict using key. It will retreieve
96+
all reads at that key, and then scan to pick the one that refers to _read_
97+
as it's mate. If there is no such read, it picks a first one it comes to'''
98+
99+
mate = None
100+
101+
# get a list of secondary reads at the correct alignment position
102+
potential_mates = template_dict[not read.is_read1][mate_key]
103+
104+
# search through one at a time to find a read that points to the current read
105+
# as its mate.
106+
for candidate_mate in potential_mates:
107+
if candidate_mate.next_reference_name == read.reference_name and \
108+
candidate_mate.next_reference_start == read.pos:
109+
mate = candidate_mate
110+
111+
# if no such read is found, then pick any old secondary alignment at that position
112+
# note: this happens when UMI-tools outputs the wrong read as something's pair.
113+
if mate is None and len(potential_mates) >0:
114+
mate = potential_mates[0]
115+
116+
return mate
117+
118+
def main(argv=None):
119+
120+
if argv is None:
121+
argv = sys.argv
122+
123+
# setup command line parser
124+
parser = U.OptionParser(version="%prog version: $Id$",
125+
usage=usage,
126+
description=globals()["__doc__"])
127+
group = U.OptionGroup(parser, "RSEM preparation specific options")
128+
129+
group.add_option("--tags", dest="tags", type="string",
130+
default="UG,BX",
131+
help="Comma-seperated list of tags to transfer from read1 to read2")
132+
group.add_option("--sam", dest="sam", action="store_true",
133+
default=False,
134+
help="input and output SAM rather than BAM")
135+
136+
parser.add_option_group(group)
137+
138+
# add common options (-h/--help, ...) and parse command line
139+
(options, args) = U.Start(parser, argv=argv,
140+
add_group_dedup_options=False,
141+
add_umi_grouping_options=False,
142+
add_sam_options=False)
143+
144+
skipped_stats = Counter()
145+
146+
if options.stdin != sys.stdin:
147+
in_name = options.stdin.name
148+
options.stdin.close()
149+
else:
150+
in_name = "-"
151+
152+
if options.sam:
153+
mode = ""
154+
else:
155+
mode = "b"
156+
157+
inbam = pysam.AlignmentFile(in_name, "r"+mode)
158+
159+
if options.stdout != sys.stdout:
160+
out_name = options.stdout.name
161+
options.stdout.close()
162+
else:
163+
out_name = "-"
164+
165+
outbam = pysam.AlignmentFile(out_name, "w" + mode, template = inbam)
166+
167+
options.tags = options.tags.split(",")
168+
169+
for template in chunk_bam(inbam):
170+
171+
assert len(set(r.query_name for r in template)) == 1
172+
current_template = {True: defaultdict(list),
173+
False: defaultdict(list)}
174+
175+
for read in template:
176+
key = (read.reference_name, read.pos, not read.is_secondary)
177+
current_template[read.is_read1][key].append(read)
178+
179+
output = set()
180+
181+
for read in template:
182+
183+
mate = None
184+
185+
# if this read is a non_primary alignment, we first want to check if it has a mate
186+
# with the non-primary alignment flag set.
187+
188+
mate_key_primary = ( True)
189+
mate_key_secondary = (read.next_reference_name, read.next_reference_start, False)
190+
191+
# First look for a read that has the same primary/secondary status
192+
# as read (i.e. secondary mate for secondary read, and primary mate
193+
# for primary read)
194+
mate_key = (read.next_reference_name, read.next_reference_start,
195+
read.is_secondary)
196+
mate = pick_mate(read, current_template, mate_key)
197+
198+
# If none was found then look for the opposite (primary mate of secondary
199+
# read or seconadary mate of primary read)
200+
if mate is None:
201+
mate_key = (read.next_reference_name, read.next_reference_start,
202+
not read.is_secondary)
203+
mate = pick_mate(read, current_template, mate_key)
204+
205+
# If we still don't have a mate, then their can't be one?
206+
if mate is None:
207+
skipped_stats["no_mate"] += 1
208+
U.warn("Alignment {} has no mate -- skipped".format(
209+
"\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
210+
))
211+
continue
212+
213+
# because we might want to make changes to the read, but not have those changes reflected
214+
# if we need the read again,we copy the read. This is only way I can find to do this.
215+
read = pysam.AlignedSegment().from_dict(read.to_dict(), read.header)
216+
mate = pysam.AlignedSegment().from_dict(mate.to_dict(), read.header)
217+
218+
# Make it so that if our read is secondary, the mate is also secondary. We don't make the
219+
# mate primary if the read is primary because we would otherwise end up with mulitple
220+
# primary alignments.
221+
if read.is_secondary:
222+
mate.is_secondary = True
223+
224+
# In a situation where there is already one mate for each read, then we will come across
225+
# each pair twice - once when we scan read1 and once when we scan read2. Thus we need
226+
# to make sure we don't output something already output.
227+
if read.is_read1:
228+
229+
mate = copy_tags(options.tags, read, mate)
230+
output_key = str(read) + str(mate)
231+
232+
if output_key not in output:
233+
output.add(output_key)
234+
outbam.write(read)
235+
outbam.write(mate)
236+
skipped_stats["pairs_output"] += 1
237+
238+
elif read.is_read2:
239+
240+
read = copy_tags(options.tags, mate, read)
241+
output_key = str(mate) + str(read)
242+
243+
if output_key not in output:
244+
output.add(output_key)
245+
outbam.write(mate)
246+
outbam.write(read)
247+
skipped_stats["pairs_output"] += 1
248+
249+
else:
250+
skipped_stats["skipped_not_read12"] += 1
251+
U.warn("Alignment {} is neither read1 nor read2 -- skipped".format(
252+
"\t".join(map(str, [read.query_name, read.flag, read.reference_name, int(read.pos)]))
253+
))
254+
continue
255+
256+
if not out_name == "-":
257+
outbam.close()
258+
259+
U.info("Total pairs output: {}, Pairs skipped - no mates: {},"
260+
" Pairs skipped - not read1 or 2: {}".format(
261+
skipped_stats["pairs_output"],
262+
skipped_stats["no_mate"],
263+
skipped_stats["skipped_not_read12"]))
264+
U.Stop()
265+
266+
if __name__ == "__main__":
267+
sys.exit(main(sys.argv))
268+
269+
270+

conf/modules.config

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ if (!params.skip_alignment) {
362362
if (params.with_umi && ['star_salmon','hisat2'].contains(params.aligner)) {
363363
process {
364364
withName: '.*:DEDUP_UMI_UMITOOLS_GENOME:UMITOOLS_DEDUP' {
365+
ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }
365366
ext.prefix = { "${meta.id}.umi_dedup.sorted" }
366367
publishDir = [
367368
[
@@ -560,11 +561,20 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
560561
ext.args = '-n'
561562
ext.prefix = { "${meta.id}.umi_dedup.transcriptome" }
562563
publishDir = [
563-
path: { "${params.outdir}/samtools" },
564+
path: { "${params.outdir}/${params.aligner}" },
564565
enabled: false
565566
]
566567
}
567568

569+
withName: 'NFCORE_RNASEQ:RNASEQ:UMITOOLS_PREPAREFORRSEM' {
570+
ext.prefix = { "${meta.id}.umi_dedup.transcriptome.filtered" }
571+
publishDir = [
572+
path: { "${params.outdir}/${params.aligner}/umitools/log" },
573+
mode: params.publish_dir_mode,
574+
pattern: '*.log'
575+
]
576+
}
577+
568578
withName: 'NFCORE_RNASEQ:RNASEQ:BAM_SORT_SAMTOOLS:SAMTOOLS_SORT' {
569579
ext.prefix = { "${meta.id}.transcriptome.sorted" }
570580
publishDir = [
@@ -588,6 +598,7 @@ if (!params.skip_alignment && params.aligner == 'star_salmon') {
588598
}
589599

590600
withName: '.*:DEDUP_UMI_UMITOOLS_TRANSCRIPTOME:UMITOOLS_DEDUP' {
601+
ext.args = { meta.single_end ? '' : '--unpaired-reads=discard --chimeric-pairs=discard' }
591602
ext.prefix = { "${meta.id}.umi_dedup.transcriptome.sorted" }
592603
publishDir = [
593604
path: { "${params.outdir}/${params.aligner}/umitools" },
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
process UMITOOLS_PREPAREFORRSEM {
2+
tag "$meta.id"
3+
label 'process_medium'
4+
5+
conda (params.enable_conda ? "bioconda::umi_tools=1.1.2" : null)
6+
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7+
'https://depot.galaxyproject.org/singularity/umi_tools:1.1.2--py38h4a8c8d9_0' :
8+
'quay.io/biocontainers/umi_tools:1.1.2--py38h4a8c8d9_0' }"
9+
10+
input:
11+
tuple val(meta), path(bam)
12+
13+
output:
14+
tuple val(meta), path('*.bam'), emit: bam
15+
tuple val(meta), path('*.log'), emit: log
16+
path "versions.yml" , emit: versions
17+
18+
when:
19+
task.ext.when == null || task.ext.when
20+
21+
script: // This script is bundled with the pipeline, in nf-core/rnaseq/bin/
22+
def args = task.ext.args ?: ''
23+
def prefix = task.ext.prefix ?: "${meta.id}"
24+
"""
25+
prepare-for-rsem.py \\
26+
--stdin=$bam \\
27+
--stdout=${prefix}.bam \\
28+
--log=${prefix}.prepare_for_rsem.log \\
29+
$args
30+
31+
cat <<-END_VERSIONS > versions.yml
32+
"${task.process}":
33+
umitools: \$(umi_tools --version 2>&1 | sed 's/^.*UMI-tools version://; s/ *\$//')
34+
END_VERSIONS
35+
"""
36+
}

0 commit comments

Comments
 (0)