@@ -18,6 +18,7 @@ def __init__(self, file, db, output, threads=os.cpu_count()):
1818 self .db = db
1919 self .output = output
2020 self .threads = threads
21+ self .outfile = get_filename (self .file , self .output )
2122
2223 self .aset = {'l2' , 'l11' , 'l10e' , 'l15e' , 'l18e' , 's3ae' , 's19e' , 's28e' }
2324 self .bset = {'l2' , 'l11' , 'l20' , 'l27' , 's2' , 's7' , 's9' , 's16' }
@@ -41,8 +42,8 @@ def run_kraken(self, db_kraken):
4142 subprocess .run ([
4243 'kraken2' ,
4344 '--db' , db_kraken ,
44- '--report' , get_filename ( self .file , self . output , '. kraken.report.tmp') ,
45- '--output' , get_filename ( self .file , self . output , '. kraken.output.tmp') ,
45+ '--report' , f' { self .outfile } . kraken.report.tmp' ,
46+ '--output' , f' { self .outfile } . kraken.output.tmp' ,
4647 '--threads' , str (self .threads ),
4748 self .file ,
4849 ], check = True , stdout = subprocess .DEVNULL , stderr = subprocess .DEVNULL )
@@ -55,7 +56,7 @@ def parse_kraken(self):
5556 '''
5657 qseqids , taxids = set (), set ()
5758 record = False
58- with open (get_filename ( self .file , self . output , '. kraken.report.tmp') ) as f :
59+ with open (f' { self .outfile } . kraken.report.tmp' ) as f :
5960 for line in f :
6061 ls = line .rstrip ().split ('\t ' )
6162 if ls [3 ] in {'D' , 'R1' }:
@@ -64,7 +65,7 @@ def parse_kraken(self):
6465 if record :
6566 taxids .add (ls [4 ])
6667
67- with open (get_filename ( self .file , self . output , '. kraken.output.tmp') ) as f :
68+ with open (f' { self .outfile } . kraken.output.tmp' ) as f :
6869 for line in f :
6970 ls = line .rstrip ().split ('\t ' )
7071 if ls [2 ] in taxids :
@@ -81,7 +82,7 @@ def run_diamond(self, max_target_seqs=25, evalue=1e-15, identity=0, subject_cove
8182 'diamond' , 'blastx' ,
8283 '--db' , os .path .join (self .db , 'prot.dmnd' ),
8384 '--query' , self .file ,
84- '--out' , get_filename ( self .file , self . output , '. diamond.tmp') ,
85+ '--out' , f' { self .outfile } . diamond.tmp' ,
8586 '--outfmt' , '6' , * outfmt ,
8687 '--evalue' , str (evalue ), '--subject-cover' , str (subject_cover ), '--id' , str (identity ),
8788 '--range-culling' , '-F' , '15' , '--range-cover' , '25' ,
@@ -95,7 +96,7 @@ def parse_diamond(self):
9596 '''
9697 qcoords = defaultdict (set )
9798
98- with open (get_filename ( self .file , self . output , '. diamond.tmp') ) as f :
99+ with open (f' { self .outfile } . diamond.tmp' ) as f :
99100 for line in f :
100101 ls = line .rstrip ().split ('\t ' )
101102 qseqid , sseqid = ls [0 ], ls [1 ]
@@ -117,27 +118,27 @@ def parse_diamond(self):
117118 ):
118119 ## append qseqid and coordinates for back-tracing
119120 self .hits .append ([qseqid , kingdom , family , qstart , qend ])
120- self .copies [sseqid . split ( '-' )[ - 2 ] ] += 0.125
121+ self .copies [kingdom ] += 0.125
121122
122123 def run_minimap (self , secondary_num = 2147483647 , secondary_ratio = 0.9 ):
123124 '''
124125 Run minimap2 to get taxonomic profiles.
125126 '''
126- with open (get_filename ( self .file , self . output , '. sequence.tmp') , 'w' ) as w :
127+ with open (f' { self .outfile } . sequence.tmp' , 'w' ) as w :
127128 w .write (extract_sequences (self .file , {hit [0 ] for hit in self .hits }))
128129
129130 ## consider each kingdom + family separately
130131 qseqids = defaultdict (set )
131132 for hit in self .hits :
132133 qseqids [hit [1 ] + '.' + hit [2 ].replace ('/' , '_' )].add (hit [0 ])
133134
134- with open (get_filename ( self .file , self . output , '. minimap.tmp') , 'w' ) as w :
135+ with open (f' { self .outfile } . minimap.tmp' , 'w' ) as w :
135136 with logging_redirect_tqdm ():
136137 queue = tqdm (qseqids .items (), leave = False )
137138 for family , qseqid in queue :
138139 queue .set_description (f'==> Processing <{ family } >' )
139140
140- sequences = extract_sequences (get_filename ( self .file , self . output , '. sequence.tmp') , qseqid )
141+ sequences = extract_sequences (f' { self .outfile } . sequence.tmp' , qseqid )
141142 subprocess .run ([
142143 'minimap2' ,
143144 '-cx' , 'map-ont' ,
@@ -238,9 +239,9 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
238239
239240 ## return assignments
240241 assignments = []
241- for row in p_reads .tocsr ():
242- row = row .toarray ().squeeze ()
243- assignments .append (np .where (row == row .max ())[0 ].tolist ())
242+ for p_read in p_reads .tocsr ():
243+ p_read = p_read .toarray ().squeeze ()
244+ assignments .append (np .where (p_read == p_read .max ())[0 ].tolist ())
244245
245246 ties = defaultdict (set )
246247 for qseqid , lineage in enumerate (assignments ):
@@ -252,16 +253,16 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
252253 ## resolve ties for equal probability cases using AS, MS and ID
253254 if ties :
254255 qset = set .union (* (set (qseqid ) for qseqid in ties .values ()))
255- data = [row for row in data if row [0 ] in qset ]
256+ alignments = [alignment for alignment in self . alignments if alignment [0 ] in qset ]
256257
257258 for lineages , qseqids in ties .items ():
258- target = [row for row in data if row [0 ] in qseqids and row [- 1 ] in lineages ]
259+ targets = [alignment for alignment in alignments if alignment [0 ] in qseqids and alignment [- 1 ] in lineages ]
259260
260261 scores = defaultdict (lambda : defaultdict (list ))
261- for row in target :
262- scores [row [- 1 ]]['AS' ].append (row [2 ])
263- scores [row [- 1 ]]['DE' ].append (row [3 ])
264- scores [row [- 1 ]]['ID' ].append (row [4 ])
262+ for target in targets :
263+ scores [target [- 1 ]]['AS' ].append (target [2 ])
264+ scores [target [- 1 ]]['DE' ].append (target [3 ])
265+ scores [target [- 1 ]]['ID' ].append (target [4 ])
265266
266267 ## if all the same, choose the one with known species name
267268 target = sorted ([
@@ -277,7 +278,7 @@ def postprocess(self, max_iteration=1000, epsilon=1e-10):
277278 for qseqid in qseqids :
278279 self .assignments [qseqid ] = target
279280
280- def run (self , db_kraken = None , skip_profile = False , skip_clean = False ,
281+ def run (self , debug = False , db_kraken = None , skip_profile = False , skip_clean = False ,
281282 max_target_seqs = 25 , evalue = 1e-15 , identity = 0 , subject_cover = 75 ,
282283 secondary_num = 2147483647 , secondary_ratio = 0.9 ,
283284 max_iteration = 1000 , epsilon = 1e-10 ):
@@ -286,23 +287,22 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
286287 '''
287288 if db_kraken is not None :
288289 logger .info ('Filtering reads ...' )
289- self .run_kraken (db_kraken )
290+ if not debug : self .run_kraken (db_krake = db_kraken )
290291 self .parse_kraken ()
291- logger .info ('... removed {} putatively non-prokaryotic reads.' . format ( len ( self . nset )) )
292+ logger .info (f '... removed { len ( self . nset ) } putatively non-prokaryotic reads.' )
292293
293294 logger .info ('Estimating genome copies ...' )
294- self .run_diamond (max_target_seqs , evalue , identity , subject_cover )
295+ if not debug : self .run_diamond (max_target_seqs = max_target_seqs , evalue = evalue , identity = identity , subject_cover = subject_cover )
295296 self .parse_diamond ()
296- logger .info ('... found {} copies of genomes (bacteria: {}; archaea: {}).' .format (
297- sum (self .copies .values ()), self .copies ['bacteria' ], self .copies ['archaea' ]))
297+ logger .info (f"... found { sum (self .copies .values ())} copies of genomes (bacteria: { self .copies ['bacteria' ]} ; archaea: { self .copies ['archaea' ]} )." )
298298
299299 if not skip_profile :
300300 logger .info ('Assigning taxonomy ...' )
301- self .run_minimap (secondary_num , secondary_ratio )
301+ if not debug : self .run_minimap (secondary_num = secondary_num , secondary_ratio = secondary_ratio )
302302
303303 logger .info ('Reassigning taxonomy ...' )
304304 self .parse_minimap ()
305- self .postprocess (max_iteration , epsilon )
305+ self .postprocess (max_iteration = max_iteration , epsilon = epsilon )
306306
307307 ## fill missing ones according to hits
308308 replacements = {
@@ -336,24 +336,23 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
336336 ], key = lambda row : (row [8 ], row [10 ], row [7 ]))
337337
338338 richness = {'bacteria' : 0 , 'archaea' : 0 }
339- with open (get_filename ( self .file , self . output , '. tsv') , 'w' ) as w :
339+ with open (f' { self .outfile } . tsv' , 'w' ) as w :
340340 w .write ('\t ' .join (self .ranks + ['copy' , 'abundance' , 'identity' ]) + '\n ' )
341341
342342 for row in self .profile :
343343 if not re .search ('unclassified (Bacteria|Archaea) species' , row [6 ]):
344344 richness [row [0 ].split ('|' )[- 1 ].lower ()] += 1
345345 w .write ('\t ' .join (row [:7 ] + [f'{ row [7 ]:.3f} ' , f'{ row [8 ]:e} ' , f'{ row [9 ]:.4f} /{ row [10 ]:.4f} ' ]) + '\n ' )
346346
347- logger .info ('... found {} unique species (bacteria: {}; archaea: {}).' .format (
348- sum (richness .values ()), richness ['bacteria' ], richness ['archaea' ]))
347+ logger .info (f"... found { sum (richness .values ())} unique species (bacteria: { richness ['bacteria' ]} ; archaea: { richness ['archaea' ]} )." )
349348
350349 else :
351350 self .profile = sorted ([
352351 ['Bacteria' , self .copies ['bacteria' ], self .copies ['bacteria' ] / sum (self .copies .values ())],
353352 ['Archaea' , self .copies ['archaea' ], self .copies ['archaea' ] / sum (self .copies .values ())]
354353 ], key = lambda row : (row [- 2 ], row [- 3 ]))
355354
356- with open (get_filename ( self .file , self . output , '. tsv') , 'w' ) as w :
355+ with open (f' { self .outfile } . tsv' , 'w' ) as w :
357356 w .write ('\t ' .join (['superkingdom' , 'copy' , 'abundance' ]) + '\n ' )
358357
359358 for row in self .profile :
@@ -369,10 +368,10 @@ def run(self, db_kraken=None, skip_profile=False, skip_clean=False,
369368 reads .update ({hit [0 ]: {
370369 'remark' : 'marker-gene-containing' , 'lineage' : hit [1 ].capitalize ()} for hit in self .hits })
371370
372- with open (get_filename ( self .file , self . output , '. json') , 'w' ) as w :
371+ with open (f' { self .outfile } . json' , 'w' ) as w :
373372 json .dump (dict (sorted (reads .items ())), w , indent = 4 )
374373
375374 ## clean up
376375 if not skip_clean :
377- for file in glob .glob (get_filename ( self .file , self . output , '. *.tmp') ):
376+ for file in glob .glob (f' { self .outfile } . *.tmp' ):
378377 os .remove (file )
0 commit comments