1010from dataclasses import dataclass
1111from datetime import timedelta
1212from glob import glob
13- from os .path import join , basename
13+ from os .path import join
1414from typing import Optional , List
1515
1616import luigi
@@ -129,7 +129,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
129129 library_source = root .find (
130130 'EXPERIMENT_PACKAGE/EXPERIMENT[@accession=\' ' + srx + '\' ]/DESIGN/LIBRARY_DESCRIPTOR/LIBRARY_SOURCE' )
131131
132- if library_strategy is not None and library_strategy .text not in ['RNA-Seq' , 'ssRNA-seq' ]:
132+ if library_strategy is not None and library_strategy .text not in ['RNA-Seq' , 'ssRNA-seq' , 'OTHER' ]:
133133 logger .warning ('%s Ignoring run with %s library strategy.' , srr , library_strategy .text )
134134 continue
135135
@@ -146,9 +146,11 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
146146
147147 sra_10x_bam_files = run .findall ('SRAFiles/SRAFile[@semantic_name=\' 10X Genomics bam file\' ]' )
148148
149+ sra_bam_files = run .findall ('SRAFiles/SRAFile[@semantic_name=\' bam\' ]' )
150+
149151 issues = SraRunIssue (0 )
150152
151- if not sra_fastq_files and not sra_10x_bam_files :
153+ if not sra_fastq_files and not sra_10x_bam_files and not sra_bam_files :
152154 issues |= SraRunIssue .NO_SRA_FILES
153155
154156 # if the data was loaded with fastq-load.py, we can obtain the order of the files from the options
@@ -202,6 +204,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
202204 number_of_spots -= 1
203205 reads .pop (i )
204206 spot_read_lengths .pop (i )
207+
205208 else :
206209 logger .warning (
207210 '%s: No spot statistics found, cannot use the average read lengths to determine the order of the FASTQ files.' ,
@@ -295,7 +298,7 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
295298
296299 if flowcells :
297300 logger .info ('%s: Detected read types from BAM file %s: %s' , srr , bam_file .attrib ['filename' ],
298- ', ' .join (rt .name for rt in bam_read_types ))
301+ ', ' .join (rt .name for rt in bam_read_types ))
299302 # FIXME: report FASTQ filenames for all flowcells and lanes
300303 flowcell = next (iter (flowcells .values ()))
301304 lane_id , read_types = next (iter (flowcell .items ()))
@@ -333,6 +336,51 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
333336 issues = issues ))
334337 continue
335338
339+ # check for regular BAM files
340+ elif sra_bam_files :
341+ logger .info ('%s: Using regular BAM files to detect read layout.' , srr )
342+
343+ if len (sra_bam_files ) > 1 :
344+ logger .warning ('%s: Multiple BAM files found, will only use the first one.' , srr )
345+ bam_file = sra_bam_files [0 ]
346+
347+ if is_single_end :
348+ fastq_filenames = ['R1.fastq.gz' ]
349+ fastq_file_sizes = None
350+ read_types = [SraReadType .BIOLOGICAL ]
351+ use_bamtofastq = True
352+ bam_filenames = [bam_file .attrib ['filename' ]]
353+ bam_file_urls = [bam_file .attrib ['url' ]]
354+ bam_fastq_filenames = ['R1.fastq.gz' ]
355+ elif is_paired :
356+ fastq_filenames = ['R1.fastq.gz' , 'R2.fastq.gz' ]
357+ fastq_file_sizes = None
358+ read_types = [SraReadType .BIOLOGICAL , SraReadType .BIOLOGICAL ]
359+ use_bamtofastq = True
360+ bam_filenames = [bam_file .attrib ['filename' ]]
361+ bam_file_urls = [bam_file .attrib ['url' ]]
362+ bam_fastq_filenames = ['R1.fastq.gz' , 'R2.fastq.gz' ]
363+ else :
364+ logger .warning ('%s: Failed to detect read types from BAM file, ignoring that run.' , srr )
365+ issues |= SraRunIssue .INVALID_RUN
366+ if include_invalid_runs :
367+ result .append (SraRunMetadata (srx , srr ,
368+ is_single_end = is_single_end ,
369+ is_paired = is_paired ,
370+ fastq_filenames = None ,
371+ fastq_file_sizes = None ,
372+ read_types = None ,
373+ number_of_spots = None ,
374+ average_read_lengths = None ,
375+ fastq_load_options = None ,
376+ use_bamtofastq = False ,
377+ bam_filenames = None ,
378+ bam_file_urls = None ,
379+ bam_fastq_filenames = None ,
380+ layout = [],
381+ issues = issues ))
382+ continue
383+
336384 # use spot statistics to determine the order of the files by matching their sizes with the sizes of the files
337385 # this is less reliable than using the fastq-load.py options, but it is still better than nothing
338386 # we can only use this strategy if all the read sizes are different and can be related to the file sizes
@@ -400,6 +448,16 @@ def read_xml_metadata(path, include_invalid_runs=False) -> List[SraRunMetadata]:
400448 read_types = None
401449 issues |= SraRunIssue .AMBIGUOUS_READ_SIZES
402450
451+ elif len (sra_fastq_files ) == 1 :
452+ logger .info ('%s: Single FASTQ file found, using it as a single-end dataset.' , srr )
453+ fastq_filenames = [sf .attrib ['filename' ] for sf in sra_fastq_files ]
454+ fastq_file_sizes = [int (sf .attrib ['size' ]) for sf in sra_fastq_files ]
455+ use_bamtofastq = False
456+ bam_filenames = [bf .attrib ['filename' ] for bf in sra_10x_bam_files ]
457+ bam_file_urls = [bf .attrib ['url' ] for bf in sra_10x_bam_files ]
458+ bam_fastq_filenames = None
459+ read_types = fastq_load_read_types
460+
403461 else :
404462 issues |= SraRunIssue .INVALID_RUN
405463 logger .warning (
@@ -463,7 +521,7 @@ def read_bam_header(srr, filename, url):
463521 logger .info ('%s: Using cached 10x BAM header from %s...' , srr , bam_header_file )
464522 else :
465523 logger .info ('%s: Reading header from 10x BAM file %s from %s to %s...' , srr ,
466- filename , url , bam_header_file )
524+ filename , url , bam_header_file )
467525 os .makedirs (os .path .dirname (bam_header_file ), exist_ok = True )
468526 with open (bam_header_file , 'w' ) as f :
469527 # FIXME: use requests
@@ -534,7 +592,7 @@ def run(self):
534592 def output (self ):
535593 return DownloadRunTarget (run_id = self .srr ,
536594 files = [join (self .output_dir , self .srr + '_' + str (i + 1 ) + '.fastq.gz' ) for i in
537- range (len (self .layout ))],
595+ range (len (self .layout ))],
538596 layout = self .layout ,
539597 output_dir = self .output_dir )
540598
0 commit comments