Skip to content

Commit c36b4e8

Browse files
committed
Add support for OTHER library strategies and limited support for non-10x BAM files
1 parent 97430fc commit c36b4e8

File tree

6 files changed

+871
-7
lines changed

6 files changed

+871
-7
lines changed

rnaseq_pipeline/sources/sra.py

Lines changed: 64 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from dataclasses import dataclass
1111
from datetime import timedelta
1212
from glob import glob
13-
from os.path import join, basename
13+
from os.path import join
1414
from typing import Optional, List
1515

1616
import luigi
@@ -129,7 +129,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
129129
library_source = root.find(
130130
'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\'' + srx + '\']/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE')
131131

132-
if library_strategy is not None and library_strategy.text not in ['RNA-Seq', 'ssRNA-seq']:
132+
if library_strategy is not None and library_strategy.text not in ['RNA-Seq', 'ssRNA-seq', 'OTHER']:
133133
logger.warning('%s Ignoring run with %s library strategy.', srr, library_strategy.text)
134134
continue
135135

@@ -146,9 +146,11 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
146146

147147
sra_10x_bam_files = run.findall('SRAFiles/SRAFile[@semantic_name=\'10X Genomics bam file\']')
148148

149+
sra_bam_files = run.findall('SRAFiles/SRAFile[@semantic_name=\'bam\']')
150+
149151
issues = SraRunIssue(0)
150152

151-
if not sra_fastq_files and not sra_10x_bam_files:
153+
if not sra_fastq_files and not sra_10x_bam_files and not sra_bam_files:
152154
issues |= SraRunIssue.NO_SRA_FILES
153155

154156
# if the data was loaded with fastq-load.py, we can obtain the order of the files from the options
@@ -202,6 +204,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
202204
number_of_spots -= 1
203205
reads.pop(i)
204206
spot_read_lengths.pop(i)
207+
205208
else:
206209
logger.warning(
207210
'%s: No spot statistics found, cannot use the average read lengths to determine the order of the FASTQ files.',
@@ -295,7 +298,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
295298

296299
if flowcells:
297300
logger.info('%s: Detected read types from BAM file %s: %s', srr, bam_file.attrib['filename'],
298-
', '.join(rt.name for rt in bam_read_types))
301+
', '.join(rt.name for rt in bam_read_types))
299302
# FIXME: report FASTQ filenames for all flowcells and lanes
300303
flowcell = next(iter(flowcells.values()))
301304
lane_id, read_types = next(iter(flowcell.items()))
@@ -333,6 +336,51 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
333336
issues=issues))
334337
continue
335338

339+
# check for regular BAM files
340+
elif sra_bam_files:
341+
logger.info('%s: Using regular BAM files to detect read layout.', srr)
342+
343+
if len(sra_bam_files) > 1:
344+
logger.warning('%s: Multiple BAM files found, will only use the first one.', srr)
345+
bam_file = sra_bam_files[0]
346+
347+
if is_single_end:
348+
fastq_filenames = ['R1.fastq.gz']
349+
fastq_file_sizes = None
350+
read_types = [SraReadType.BIOLOGICAL]
351+
use_bamtofastq = True
352+
bam_filenames = [bam_file.attrib['filename']]
353+
bam_file_urls = [bam_file.attrib['url']]
354+
bam_fastq_filenames = ['R1.fastq.gz']
355+
elif is_paired:
356+
fastq_filenames = ['R1.fastq.gz', 'R2.fastq.gz']
357+
fastq_file_sizes = None
358+
read_types = [SraReadType.BIOLOGICAL, SraReadType.BIOLOGICAL]
359+
use_bamtofastq = True
360+
bam_filenames = [bam_file.attrib['filename']]
361+
bam_file_urls = [bam_file.attrib['url']]
362+
bam_fastq_filenames = ['R1.fastq.gz', 'R2.fastq.gz']
363+
else:
364+
logger.warning('%s: Failed to detect read types from BAM file, ignoring that run.', srr)
365+
issues |= SraRunIssue.INVALID_RUN
366+
if include_invalid_runs:
367+
result.append(SraRunMetadata(srx, srr,
368+
is_single_end=is_single_end,
369+
is_paired=is_paired,
370+
fastq_filenames=None,
371+
fastq_file_sizes=None,
372+
read_types=None,
373+
number_of_spots=None,
374+
average_read_lengths=None,
375+
fastq_load_options=None,
376+
use_bamtofastq=False,
377+
bam_filenames=None,
378+
bam_file_urls=None,
379+
bam_fastq_filenames=None,
380+
layout=[],
381+
issues=issues))
382+
continue
383+
336384
# use spot statistics to determine the order of the files by matching their sizes with the sizes of the files
337385
# this is less reliable than using the fastq-load.py options, but it is still better than nothing
338386
# we can only use this strategy if all the read sizes are different and can be related to the file sizes
@@ -400,6 +448,16 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
400448
read_types = None
401449
issues |= SraRunIssue.AMBIGUOUS_READ_SIZES
402450

451+
elif len(sra_fastq_files) == 1:
452+
logger.info('%s: Single FASTQ file found, using it as a single-end dataset.', srr)
453+
fastq_filenames = [sf.attrib['filename'] for sf in sra_fastq_files]
454+
fastq_file_sizes = [int(sf.attrib['size']) for sf in sra_fastq_files]
455+
use_bamtofastq = False
456+
bam_filenames = [bf.attrib['filename'] for bf in sra_10x_bam_files]
457+
bam_file_urls = [bf.attrib['url'] for bf in sra_10x_bam_files]
458+
bam_fastq_filenames = None
459+
read_types = fastq_load_read_types
460+
403461
else:
404462
issues |= SraRunIssue.INVALID_RUN
405463
logger.warning(
@@ -463,7 +521,7 @@ def read_bam_header(srr, filename, url):
463521
logger.info('%s: Using cached 10x BAM header from %s...', srr, bam_header_file)
464522
else:
465523
logger.info('%s: Reading header from 10x BAM file %s from %s to %s...', srr,
466-
filename, url, bam_header_file)
524+
filename, url, bam_header_file)
467525
os.makedirs(os.path.dirname(bam_header_file), exist_ok=True)
468526
with open(bam_header_file, 'w') as f:
469527
# FIXME: use requests
@@ -534,7 +592,7 @@ def run(self):
534592
def output(self):
535593
return DownloadRunTarget(run_id=self.srr,
536594
files=[join(self.output_dir, self.srr + '_' + str(i + 1) + '.fastq.gz') for i in
537-
range(len(self.layout))],
595+
range(len(self.layout))],
538596
layout=self.layout,
539597
output_dir=self.output_dir)
540598

tests/data/SRX1772599.xml

Lines changed: 168 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)