1515def parse_arguments ():
1616 parser = argparse .ArgumentParser (
1717 prog = "CAT bins" ,
18- description = "Run Bin Annotation Tool (BAT) on a set of bins ." ,
19- usage = "CAT bins -b DIR -d DIR -t DIR [options] [-h / --help]" ,
18+ description = "Run Bin Annotation Tool (BAT)." ,
19+ usage = "CAT bins -b DIR / FILE -d DIR -t DIR [options] [-h / --help]" ,
2020 add_help = False
2121 )
2222
2323 required = parser .add_argument_group ("Required arguments" )
24- shared .add_argument (required , "bin_folder " , True )
24+ shared .add_argument (required , "bin_fasta_or_folder " , True )
2525 shared .add_argument (required , "database_folder" , True )
2626 shared .add_argument (required , "taxonomy_folder" , True )
2727
@@ -30,28 +30,8 @@ def parse_arguments():
3030 shared .add_argument (optional , "r" , False , default = decimal .Decimal (5 ))
3131 shared .add_argument (optional , "f" , False , default = decimal .Decimal (0.3 ))
3232 shared .add_argument (optional , "out_prefix" , False , default = "./out.BAT" )
33- shared .add_argument (
34- optional ,
35- "proteins_fasta" ,
36- False ,
37- help_ = (
38- "Path to concatenated predicted proteins fasta file generated "
39- "during an earlier run of BAT on the same bins. If supplied, BAT "
40- "will skip the protein prediction step."
41- )
42- )
43- shared .add_argument (
44- optional ,
45- "alignment_file" ,
46- False ,
47- help_ = (
48- "Path to alignment table generated during an earlier run of BAT "
49- "on the same bins. If supplied, BAT will skip the alignment step "
50- "and directly classify the bins. A concatenated predicted "
51- "proteins fasta file should also be supplied with argument "
52- "[-p / --proteins]."
53- )
54- )
33+ shared .add_argument (optional , "proteins_fasta" , False )
34+ shared .add_argument (optional , "alignment_file" , False )
5535 shared .add_argument (
5636 optional , "path_to_prodigal" , False , default = "prodigal" )
5737 shared .add_argument (optional , "path_to_diamond" , False , default = "diamond" )
@@ -102,7 +82,7 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
10282 shared .give_user_feedback (message , log_file , quiet )
10383
10484 bin2contigs = {}
105- contig_names = set ()
85+ contig2bin = {}
10686
10787 for file_ in os .listdir (bin_folder ):
10888 if file_ .startswith ("." ):
@@ -127,30 +107,30 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
127107 if line .startswith (">" ):
128108 contig = line .split ()[0 ].rstrip ().lstrip (">" )
129109
130- # Add bin name in front of the contig name.
131- new_contig_name = "{0}_{1}" .format (bin_ , contig )
132-
133- if new_contig_name in contig_names :
110+ if contig in contig2bin :
134111 message = (
135- "BAT has encountered {0} twice in bin {1}. Each "
136- "fasta header should be unique in each bin."
137- "" .format (contig , bin_ )
112+ "BAT has encountered {0} twice, in {1} and in "
113+ "{2}. Fasta headers should be unique across bins, "
114+ "please remove or rename duplicates."
115+ "" .format (contig , contig2bin [contig ], bin_ )
138116 )
139117 shared .give_user_feedback (
140118 message , log_file , quiet , error = True )
141119
142120 sys .exit (1 )
143121
144- contig_names . add ( new_contig_name )
122+ contig2bin . setdefault ( contig , bin_ )
145123
146- bin2contigs [bin_ ].append (new_contig_name )
124+ bin2contigs [bin_ ].append (contig )
147125
148126 if len (bin2contigs ) == 1 :
149127 message = "1 bin found!"
150128 else :
151129 message = "{0:,d} bins found!" .format (len (bin2contigs ))
152130 shared .give_user_feedback (message , log_file , quiet )
153131
132+ contig_names = set (contig2bin )
133+
154134 return (bin2contigs , contig_names )
155135
156136
@@ -166,8 +146,7 @@ def make_concatenated_fasta(
166146 if line .startswith (">" ):
167147 contig = line .split ()[0 ].rstrip ().lstrip (">" )
168148
169- # add bin name in front of the contig name.
170- outf1 .write (">{0}_{1}\n " .format (bin_ , contig ))
149+ outf1 .write (">{0}\n " .format (contig ))
171150 else :
172151 outf1 .write (line )
173152
@@ -215,10 +194,9 @@ def run():
215194 message , args .log_file , args .quiet , show_time = False )
216195 elif not args .proteins_fasta and args .alignment_file :
217196 message = (
218- "if you want BAT to directly classify a set of bins , you should "
197+ "if you want BAT to directly do the classification , you should "
219198 "not only supply a DIAMOND alignment table but also a "
220- "concatenated predicted protein fasta file with argument "
221- "[-p / --proteins]."
199+ "predicted protein fasta file with argument [-p / --proteins]."
222200 )
223201 shared .give_user_feedback (
224202 message , args .log_file , args .quiet , error = True )
@@ -231,15 +209,19 @@ def run():
231209 message = (
232210 "Rarw!\n \n "
233211 "Supplied command: {0}\n \n "
234- "Bin folder: {1}\n "
235- "Taxonomy folder: {2}\n "
236- "Database folder: {3}\n "
237- "Parameter r: {4}\n "
238- "Parameter f: {5}\n "
239- "Log file: {6}\n \n "
212+ "" .format (" " .join (sys .argv ))
213+ )
214+ if "bin_folder" in args :
215+ message += "Bin folder: {0}\n " .format (args .bin_folder )
216+ if "bin_fasta" in args :
217+ message += "Bin fasta: {0}\n " .format (args .bin_fasta )
218+ message += (
219+ "Taxonomy folder: {0}\n "
220+ "Database folder: {1}\n "
221+ "Parameter r: {2}\n "
222+ "Parameter f: {3}\n "
223+ "Log file: {4}\n \n "
240224 "-----------------\n " .format (
241- " " .join (sys .argv ),
242- args .bin_folder ,
243225 args .taxonomy_folder ,
244226 args .database_folder ,
245227 int (args .r ),
@@ -257,10 +239,15 @@ def run():
257239
258240 errors = []
259241
260- errors .append (
261- check .check_bin_folder (
262- args .bin_folder , args .bin_suffix , args .log_file , args .quiet )
263- )
242+ if "bin_folder" in args :
243+ errors .append (
244+ check .check_bin_folder (
245+ args .bin_folder , args .bin_suffix , args .log_file , args .quiet )
246+ )
247+
248+ if "bin_fasta" in args :
249+ errors .append (
250+ check .check_bin_fasta (args .bin_fasta , args .log_file , args .quiet ))
264251
265252 errors .append (
266253 check .check_out_prefix (args .out_prefix , args .log_file , args .quiet ))
@@ -377,14 +364,26 @@ def run():
377364 message , args .log_file , args .quiet , show_time = False )
378365
379366 # Start BAT.
380- (bin2contigs , contig_names ) = import_bins (
381- args .bin_folder , args .bin_suffix , args .log_file , args .quiet )
367+ if "bin_folder" in args :
368+ (bin2contigs , contig_names ) = import_bins (
369+ args .bin_folder , args .bin_suffix , args .log_file , args .quiet )
370+
371+ bin_folder = args .bin_folder
372+ else :
373+ contig_names = shared .import_contig_names (
374+ args .bin_fasta , args .log_file , args .quiet )
375+
376+ bin_folder , bin_ = args .bin_fasta .rsplit ('/' , 1 )
377+ bin_folder += '/'
378+
379+ bin2contigs = {}
380+ bin2contigs [bin_ ] = sorted (contig_names )
382381
383382 if "predict_proteins" in step_list :
384383 make_concatenated_fasta (
385384 args .concatenated_fasta ,
386385 bin2contigs ,
387- args . bin_folder ,
386+ bin_folder , # Note: not in args.
388387 args .log_file ,
389388 args .quiet
390389 )
@@ -426,10 +425,12 @@ def run():
426425 n_classified_bins = 0
427426
428427 with open (args .bin2classification_output_file , "w" ) as outf1 , open (args .ORF2LCA_output_file , "w" ) as outf2 :
429- outf1 .write ("# bin\t classification\t reason\t lineage\t lineage scores\n " )
428+ outf1 .write ("# bin\t classification\t reason\t lineage\t "
429+ "lineage scores (f: {0:.2f})\n " .format (args .f ))
430+
431+ outf2 .write ("# ORF\t bin\t number of hits (r: {0})\t lineage\t "
432+ "top bit-score\n " .format (args .r ))
430433
431- outf2 .write ("# ORF\t bin\t number of hits\t lineage\t top bit-score\n " )
432-
433434 for bin_ in sorted (bin2contigs ):
434435 LCAs_ORFs = []
435436
@@ -440,7 +441,7 @@ def run():
440441 for ORF in contig2ORFs [contig ]:
441442 if ORF not in ORF2hits :
442443 outf2 .write ("{0}\t {1}\t ORF has no hit to database\n "
443- "" .format (ORF , bin_ ))
444+ "" .format (ORF , bin_ ))
444445
445446 continue
446447
@@ -451,10 +452,8 @@ def run():
451452 ORF2hits [ORF ], fastaid2LCAtaxid , taxid2parent )
452453
453454 if taxid .startswith ("no taxid found" ):
454- outf2 .write ("{0}\t {1}\t {2}\t {3}\t {4}\n "
455- "" .format (
456- ORF , bin_ , n_hits , taxid , top_bitscore )
457- )
455+ outf2 .write ("{0}\t {1}\t {2}\t {3}\t {4}\n " .format (
456+ ORF , bin_ , n_hits , taxid , top_bitscore ))
458457 else :
459458 lineage = tax .find_lineage (taxid , taxid2parent )
460459
@@ -474,7 +473,7 @@ def run():
474473
475474 if len (LCAs_ORFs ) == 0 :
476475 outf1 .write ("{0}\t no taxid assigned\t no hits to database\n "
477- "" .format (bin_ ))
476+ "" .format (bin_ ))
478477
479478 continue
480479
@@ -485,7 +484,7 @@ def run():
485484
486485 if lineages == "no ORFs with taxids found." :
487486 outf1 .write ("{0}\t no taxid assigned\t "
488- "hits not found in taxonomy files\n " .format (bin_ ))
487+ "hits not found in taxonomy files\n " .format (bin_ ))
489488
490489 continue
491490
@@ -512,7 +511,7 @@ def run():
512511 lineage , taxids_with_multiple_offspring )
513512
514513 scores = ["{0:.2f}" .format (score ) for
515- score in lineages_scores [i ]]
514+ score in lineages_scores [i ]]
516515
517516 if len (lineages ) == 1 :
518517 # There is only one classification.
@@ -547,21 +546,25 @@ def run():
547546
548547 message = (
549548 "\n -----------------\n \n "
550- "{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned."
551- "" .format (shared .timestamp (), n_classified_bins , len (bin2contigs ))
549+ "{0} BAT is done! {1:,d}/{2:,d} bins ({3:.2f}%) have "
550+ "taxonomy assigned." .format (
551+ shared .timestamp (),
552+ n_classified_bins ,
553+ len (bin2contigs ),
554+ n_classified_bins / len (bin2contigs ) * 100
555+ )
552556 )
553557 shared .give_user_feedback (
554558 message , args .log_file , args .quiet , show_time = False )
555559
556560 if args .f < 0.5 :
557561 message = ("since f is set to smaller than 0.5, one bin "
558- "may have multiple classifications." )
562+ "may have multiple classifications." )
559563 shared .give_user_feedback (
560564 message , args .log_file , args .quiet , show_time = False , warning = True )
561565
562566 return
563567
564568
565569if __name__ == "__main__" :
566- sys .exit ("Run \' CAT bins\' to run Bin Annotation Tool (BAT) on a "
567- "set of bins." )
570+ sys .exit ("Run \' CAT bins\' to run Bin Annotation Tool (BAT)." )
0 commit comments