Skip to content

Commit 3e2bf36

Browse files
Merge pull request #96 from MGXlab/master
CAT v5.3
2 parents 8a75be2 + 1dc53b0 commit 3e2bf36

File tree

9 files changed

+146
-597
lines changed

9 files changed

+146
-597
lines changed

CAT_pack/CAT

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -8,16 +8,15 @@ import bins
88
import contigs
99
import download
1010
import prepare
11-
import single_bin
1211
import summarise
1312

1413

1514
def usage():
1615
message = (
17-
"usage: CAT (download | prepare | contigs | bin | bins | add_names | "
16+
"usage: CAT (download | prepare | contigs | bins | add_names | "
1817
"summarise) [-v / --version] [-h / --help]\n"
1918
"CAT: error: one of the arguments "
20-
"download prepare contigs bin bins add_names summarise "
19+
"download prepare contigs bins add_names summarise "
2120
"is required"
2221
)
2322

@@ -45,8 +44,7 @@ def help():
4544
" download\t\tDownload and preprocess data from NCBI nr or GTDB.\n"
4645
" prepare\t\tConstruct database files.\n"
4746
" contigs\t\tRun CAT.\n"
48-
" bin\t\t\tRun BAT on a single bin.\n"
49-
" bins\t\t\tRun BAT on a set of bins.\n"
47+
" bins\t\t\tRun BAT.\n"
5048
" add_names\t\tAdd taxonomic names to CAT or BAT output files.\n"
5149
" summarise\t\tSummarise a named CAT or BAT classification file."
5250
"\n\n"
@@ -69,8 +67,6 @@ def main():
6967
prepare.run()
7068
elif sys.argv[1] == "contigs":
7169
contigs.run()
72-
elif sys.argv[1] == "bin":
73-
single_bin.run()
7470
elif sys.argv[1] == "bins":
7571
bins.run()
7672
elif sys.argv[1] == "add_names":

CAT_pack/about.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
#!/usr/bin/env python3
22

33
__author__ = "F. A. Bastiaan von Meijenfeldt"
4-
__version__ = "5.2.3"
5-
__date__ = "10 February, 2021"
4+
__version__ = "5.3"
5+
__date__ = "4 November, 2023"

CAT_pack/add_names.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ def run():
9090
lineage_index = line.index("lineage")
9191
else:
9292
message = ("{0} is not a supported classification file."
93-
"".format(input_file))
93+
"".format(args.input_file))
9494
shared.give_user_feedback(
9595
message, args.log_file, args.quiet, error=True)
9696

CAT_pack/bins.py

Lines changed: 74 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -15,13 +15,13 @@
1515
def parse_arguments():
1616
parser = argparse.ArgumentParser(
1717
prog="CAT bins",
18-
description="Run Bin Annotation Tool (BAT) on a set of bins.",
19-
usage="CAT bins -b DIR -d DIR -t DIR [options] [-h / --help]",
18+
description="Run Bin Annotation Tool (BAT).",
19+
usage="CAT bins -b DIR / FILE -d DIR -t DIR [options] [-h / --help]",
2020
add_help=False
2121
)
2222

2323
required = parser.add_argument_group("Required arguments")
24-
shared.add_argument(required, "bin_folder", True)
24+
shared.add_argument(required, "bin_fasta_or_folder", True)
2525
shared.add_argument(required, "database_folder", True)
2626
shared.add_argument(required, "taxonomy_folder", True)
2727

@@ -30,28 +30,8 @@ def parse_arguments():
3030
shared.add_argument(optional, "r", False, default=decimal.Decimal(5))
3131
shared.add_argument(optional, "f", False, default=decimal.Decimal(0.3))
3232
shared.add_argument(optional, "out_prefix", False, default="./out.BAT")
33-
shared.add_argument(
34-
optional,
35-
"proteins_fasta",
36-
False,
37-
help_=(
38-
"Path to concatenated predicted proteins fasta file generated "
39-
"during an earlier run of BAT on the same bins. If supplied, BAT "
40-
"will skip the protein prediction step."
41-
)
42-
)
43-
shared.add_argument(
44-
optional,
45-
"alignment_file",
46-
False,
47-
help_=(
48-
"Path to alignment table generated during an earlier run of BAT "
49-
"on the same bins. If supplied, BAT will skip the alignment step "
50-
"and directly classify the bins. A concatenated predicted "
51-
"proteins fasta file should also be supplied with argument "
52-
"[-p / --proteins]."
53-
)
54-
)
33+
shared.add_argument(optional, "proteins_fasta", False)
34+
shared.add_argument(optional, "alignment_file", False)
5535
shared.add_argument(
5636
optional, "path_to_prodigal", False, default="prodigal")
5737
shared.add_argument(optional, "path_to_diamond", False, default="diamond")
@@ -102,7 +82,7 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
10282
shared.give_user_feedback(message, log_file, quiet)
10383

10484
bin2contigs = {}
105-
contig_names = set()
85+
contig2bin = {}
10686

10787
for file_ in os.listdir(bin_folder):
10888
if file_.startswith("."):
@@ -127,30 +107,30 @@ def import_bins(bin_folder, bin_suffix, log_file, quiet):
127107
if line.startswith(">"):
128108
contig = line.split()[0].rstrip().lstrip(">")
129109

130-
# Add bin name in front of the contig name.
131-
new_contig_name = "{0}_{1}".format(bin_, contig)
132-
133-
if new_contig_name in contig_names:
110+
if contig in contig2bin:
134111
message = (
135-
"BAT has encountered {0} twice in bin {1}. Each "
136-
"fasta header should be unique in each bin."
137-
"".format(contig, bin_)
112+
"BAT has encountered {0} twice, in {1} and in "
113+
"{2}. Fasta headers should be unique across bins, "
114+
"please remove or rename duplicates."
115+
"".format(contig, contig2bin[contig], bin_)
138116
)
139117
shared.give_user_feedback(
140118
message, log_file, quiet, error=True)
141119

142120
sys.exit(1)
143121

144-
contig_names.add(new_contig_name)
122+
contig2bin.setdefault(contig, bin_)
145123

146-
bin2contigs[bin_].append(new_contig_name)
124+
bin2contigs[bin_].append(contig)
147125

148126
if len(bin2contigs) == 1:
149127
message = "1 bin found!"
150128
else:
151129
message = "{0:,d} bins found!".format(len(bin2contigs))
152130
shared.give_user_feedback(message, log_file, quiet)
153131

132+
contig_names = set(contig2bin)
133+
154134
return (bin2contigs, contig_names)
155135

156136

@@ -166,8 +146,7 @@ def make_concatenated_fasta(
166146
if line.startswith(">"):
167147
contig = line.split()[0].rstrip().lstrip(">")
168148

169-
# add bin name in front of the contig name.
170-
outf1.write(">{0}_{1}\n".format(bin_, contig))
149+
outf1.write(">{0}\n".format(contig))
171150
else:
172151
outf1.write(line)
173152

@@ -215,10 +194,9 @@ def run():
215194
message, args.log_file, args.quiet, show_time=False)
216195
elif not args.proteins_fasta and args.alignment_file:
217196
message = (
218-
"if you want BAT to directly classify a set of bins, you should "
197+
"if you want BAT to directly do the classification, you should "
219198
"not only supply a DIAMOND alignment table but also a "
220-
"concatenated predicted protein fasta file with argument "
221-
"[-p / --proteins]."
199+
"predicted protein fasta file with argument [-p / --proteins]."
222200
)
223201
shared.give_user_feedback(
224202
message, args.log_file, args.quiet, error=True)
@@ -231,15 +209,19 @@ def run():
231209
message = (
232210
"Rarw!\n\n"
233211
"Supplied command: {0}\n\n"
234-
"Bin folder: {1}\n"
235-
"Taxonomy folder: {2}\n"
236-
"Database folder: {3}\n"
237-
"Parameter r: {4}\n"
238-
"Parameter f: {5}\n"
239-
"Log file: {6}\n\n"
212+
"".format(" ".join(sys.argv))
213+
)
214+
if "bin_folder" in args:
215+
message += "Bin folder: {0}\n".format(args.bin_folder)
216+
if "bin_fasta" in args:
217+
message += "Bin fasta: {0}\n".format(args.bin_fasta)
218+
message += (
219+
"Taxonomy folder: {0}\n"
220+
"Database folder: {1}\n"
221+
"Parameter r: {2}\n"
222+
"Parameter f: {3}\n"
223+
"Log file: {4}\n\n"
240224
"-----------------\n".format(
241-
" ".join(sys.argv),
242-
args.bin_folder,
243225
args.taxonomy_folder,
244226
args.database_folder,
245227
int(args.r),
@@ -257,10 +239,15 @@ def run():
257239

258240
errors = []
259241

260-
errors.append(
261-
check.check_bin_folder(
262-
args.bin_folder, args.bin_suffix, args.log_file, args.quiet)
263-
)
242+
if "bin_folder" in args:
243+
errors.append(
244+
check.check_bin_folder(
245+
args.bin_folder, args.bin_suffix, args.log_file, args.quiet)
246+
)
247+
248+
if "bin_fasta" in args:
249+
errors.append(
250+
check.check_bin_fasta(args.bin_fasta, args.log_file, args.quiet))
264251

265252
errors.append(
266253
check.check_out_prefix(args.out_prefix, args.log_file, args.quiet))
@@ -377,14 +364,26 @@ def run():
377364
message, args.log_file, args.quiet, show_time=False)
378365

379366
# Start BAT.
380-
(bin2contigs, contig_names) = import_bins(
381-
args.bin_folder, args.bin_suffix, args.log_file, args.quiet)
367+
if "bin_folder" in args:
368+
(bin2contigs, contig_names) = import_bins(
369+
args.bin_folder, args.bin_suffix, args.log_file, args.quiet)
370+
371+
bin_folder = args.bin_folder
372+
else:
373+
contig_names = shared.import_contig_names(
374+
args.bin_fasta, args.log_file, args.quiet)
375+
376+
bin_folder, bin_ = args.bin_fasta.rsplit('/', 1)
377+
bin_folder += '/'
378+
379+
bin2contigs = {}
380+
bin2contigs[bin_] = sorted(contig_names)
382381

383382
if "predict_proteins" in step_list:
384383
make_concatenated_fasta(
385384
args.concatenated_fasta,
386385
bin2contigs,
387-
args.bin_folder,
386+
bin_folder, # Note: not in args.
388387
args.log_file,
389388
args.quiet
390389
)
@@ -426,10 +425,12 @@ def run():
426425
n_classified_bins = 0
427426

428427
with open(args.bin2classification_output_file, "w") as outf1, open(args.ORF2LCA_output_file, "w") as outf2:
429-
outf1.write("# bin\tclassification\treason\tlineage\tlineage scores\n")
428+
outf1.write("# bin\tclassification\treason\tlineage\t"
429+
"lineage scores (f: {0:.2f})\n".format(args.f))
430+
431+
outf2.write("# ORF\tbin\tnumber of hits (r: {0})\tlineage\t"
432+
"top bit-score\n".format(args.r))
430433

431-
outf2.write("# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n")
432-
433434
for bin_ in sorted(bin2contigs):
434435
LCAs_ORFs = []
435436

@@ -440,7 +441,7 @@ def run():
440441
for ORF in contig2ORFs[contig]:
441442
if ORF not in ORF2hits:
442443
outf2.write("{0}\t{1}\tORF has no hit to database\n"
443-
"".format(ORF, bin_))
444+
"".format(ORF, bin_))
444445

445446
continue
446447

@@ -451,10 +452,8 @@ def run():
451452
ORF2hits[ORF], fastaid2LCAtaxid, taxid2parent)
452453

453454
if taxid.startswith("no taxid found"):
454-
outf2.write("{0}\t{1}\t{2}\t{3}\t{4}\n"
455-
"".format(
456-
ORF, bin_, n_hits, taxid, top_bitscore)
457-
)
455+
outf2.write("{0}\t{1}\t{2}\t{3}\t{4}\n".format(
456+
ORF, bin_, n_hits, taxid, top_bitscore))
458457
else:
459458
lineage = tax.find_lineage(taxid, taxid2parent)
460459

@@ -474,7 +473,7 @@ def run():
474473

475474
if len(LCAs_ORFs) == 0:
476475
outf1.write("{0}\tno taxid assigned\tno hits to database\n"
477-
"".format(bin_))
476+
"".format(bin_))
478477

479478
continue
480479

@@ -485,7 +484,7 @@ def run():
485484

486485
if lineages == "no ORFs with taxids found.":
487486
outf1.write("{0}\tno taxid assigned\t"
488-
"hits not found in taxonomy files\n".format(bin_))
487+
"hits not found in taxonomy files\n".format(bin_))
489488

490489
continue
491490

@@ -512,7 +511,7 @@ def run():
512511
lineage, taxids_with_multiple_offspring)
513512

514513
scores = ["{0:.2f}".format(score) for
515-
score in lineages_scores[i]]
514+
score in lineages_scores[i]]
516515

517516
if len(lineages) == 1:
518517
# There is only one classification.
@@ -547,21 +546,25 @@ def run():
547546

548547
message = (
549548
"\n-----------------\n\n"
550-
"{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned."
551-
"".format(shared.timestamp(), n_classified_bins, len(bin2contigs))
549+
"{0} BAT is done! {1:,d}/{2:,d} bins ({3:.2f}%) have "
550+
"taxonomy assigned.".format(
551+
shared.timestamp(),
552+
n_classified_bins,
553+
len(bin2contigs),
554+
n_classified_bins / len(bin2contigs) * 100
555+
)
552556
)
553557
shared.give_user_feedback(
554558
message, args.log_file, args.quiet, show_time=False)
555559

556560
if args.f < 0.5:
557561
message = ("since f is set to smaller than 0.5, one bin "
558-
"may have multiple classifications.")
562+
"may have multiple classifications.")
559563
shared.give_user_feedback(
560564
message, args.log_file, args.quiet, show_time=False, warning=True)
561565

562566
return
563567

564568

565569
if __name__ == "__main__":
566-
sys.exit("Run \'CAT bins\' to run Bin Annotation Tool (BAT) on a "
567-
"set of bins.")
570+
sys.exit("Run \'CAT bins\' to run Bin Annotation Tool (BAT).")

0 commit comments

Comments
 (0)