Skip to content

Commit 6244725

Browse files
Mike LeeMike Lee
authored andcommitted
modularized and added tests for bit-mutate-seqs
1 parent 099527e commit 6244725

File tree

12 files changed

+458
-274
lines changed

12 files changed

+458
-274
lines changed

CHANGELOG.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,18 @@
1414
-->
1515

1616

17-
## v1.13.5 (NOT RELEASED YET)
17+
## v1.13.5 (31-Dec-2025)
1818

1919
### Added
2020

2121
### Changed
2222
- `bit-count-bases-per-seq` has been removed with its function combined into `bit-count-bases`
2323
- if input fasta has one sequence, it prints the length to the terminal; if it has 2 or more, it will print out summary stats; in either of the two prior cases, if an output file is specified, the program will additionally write lengths of all sequences to that specified file
2424
- modularized, test added
25+
- modularized and added unit tests for `bit-lineage-to-tsv`
26+
- `bit-mutate-seqs`
27+
- `--seed-for-randomization` long-parameter shortened to just `--seed`
28+
- modularized and test added
2529

2630
### Fixed
2731
- `bit-assemble` now properly filters spades-assembled contigs based on user-specific min-contig length

bit/cli/lineage_to_tsv.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import os
2+
import sys
3+
import argparse
4+
from bit.cli.common import (CustomRichHelpFormatter, add_help)
5+
from bit.modules.lineage_to_tsv import convert_lineage_to_tsv
6+
from bit.modules.general import check_files_are_found
7+
8+
def build_parser():
9+
10+
desc = """
11+
This script converts lineages (in, e.g., this format: "d__Bacteria;p__Campylobacterota;c__Campylobacteria",
12+
only including up to the standard 7 ranks d,p,c,o,f,g,s) into tsv format (e.g., the previous would be output
13+
tab-separated as "Bacteria\tCampylobacterota\tCampylobacteria\tNA\tNA\tNA\tNA"). It expects as input a 2-column
14+
tab-delimited file with column 1 holding an identifier and column 2 holding the lineage. For version info,
15+
run `bit-version`.
16+
"""
17+
18+
parser = argparse.ArgumentParser(
19+
description=desc,
20+
epilog="Ex. usage: `bit-lineage-to-tsv -i input-lineages.tsv -o formatted-tax.tsv`",
21+
formatter_class=CustomRichHelpFormatter,
22+
add_help=False
23+
)
24+
25+
required = parser.add_argument_group("REQUIRED PARAMETERS")
26+
optional = parser.add_argument_group("OPTIONAL PARAMETERS")
27+
28+
required.add_argument(
29+
"-i",
30+
"--input-lineage",
31+
metavar="<FILE>",
32+
help="Input assembly file",
33+
required=True
34+
)
35+
36+
optional.add_argument(
37+
"-o",
38+
"--output-tsv",
39+
metavar="<FILE>",
40+
help='Name of output standard taxonomy tsv file (default: formatted-tax.tsv)',
41+
default="formatted-tax.tsv"
42+
)
43+
44+
optional.add_argument(
45+
"--make-taxid",
46+
help="Provide this flag to make a unique taxid (string of all rank fields) for each lineage (" \
47+
"will be added as second column of output).",
48+
action="store_true"
49+
)
50+
51+
add_help(optional)
52+
53+
return parser
54+
55+
56+
def main():
57+
58+
parser = build_parser()
59+
60+
if len(sys.argv) == 1: # pragma: no cover
61+
parser.print_help(sys.stderr)
62+
sys.exit(0)
63+
64+
args = parser.parse_args()
65+
66+
check_files_are_found([args.input_lineage])
67+
68+
convert_lineage_to_tsv(args.input_lineage, args.output_tsv, args.make_taxid)

bit/cli/mutate_seqs.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
import sys
2+
import argparse
3+
import datetime
4+
import random
5+
from bit.cli.common import CustomRichHelpFormatter, add_help
6+
from bit.modules.general import check_files_are_found
7+
from Bio import SeqIO
8+
from bit.modules.seqs import mutate_seq
9+
10+
11+
def build_parser():
12+
13+
desc = """
14+
This script will mutate all sequences of a nucleotide or amino-acid multifasta with
15+
the specified mutation rate. It does not take into consideration transition/transversion
16+
rates. By default it only swaps bases, but it can optionally introduce indels also.
17+
For version info, run `bit-version`.
18+
"""
19+
20+
parser = argparse.ArgumentParser(
21+
description=desc,
22+
epilog="Ex. usage: `bit-mutate-seqs -i input.fasta`",
23+
formatter_class=CustomRichHelpFormatter,
24+
add_help=False
25+
)
26+
27+
required = parser.add_argument_group("REQUIRED PARAMETERS")
28+
optional = parser.add_argument_group("OPTIONAL PARAMETERS")
29+
30+
required.add_argument(
31+
"-i",
32+
"--input-fasta",
33+
metavar = "<FILE>",
34+
help = "Starting fasta file",
35+
action = "store",
36+
required = True
37+
)
38+
39+
optional.add_argument(
40+
"-m",
41+
"--mutation-rate",
42+
metavar = "<FLOAT>",
43+
help = 'Wanted mutation rate, must be a float between 0 and 1 (default: "0.01")',
44+
action = "store",
45+
type = float,
46+
default = 0.01
47+
)
48+
49+
optional.add_argument(
50+
"-t",
51+
"--molecule-type",
52+
help = "'NT' for nucleotides or 'AA' for amino acids (default = 'NT')",
53+
action = "store",
54+
choices = ["NT", "AA"],
55+
default = 'NT'
56+
)
57+
58+
optional.add_argument(
59+
"-o",
60+
"--output-fasta",
61+
metavar = "<FILE>",
62+
help = 'Output mutated fasta file (default: "mutated.fasta").',
63+
action = "store",
64+
default = "mutated.fasta"
65+
)
66+
67+
optional.add_argument(
68+
"-l",
69+
"--output-log",
70+
metavar = "<FILE>",
71+
help = 'Output file reporting the seed used and how many characters were changed in each input sequence (default: "mutated.log")',
72+
action = "store",
73+
default = "mutated.log"
74+
)
75+
76+
optional.add_argument(
77+
"-I",
78+
"--indel-rate",
79+
metavar = "<FLOAT>",
80+
help = """
81+
Specify the frequency of indels wanted, must be a float between 0 and 1. This
82+
is a subset of the overall mutation rate provided via the '-m' argument.
83+
E.g., providing '-f 0.2' means 1 out of every 5 generated mutations will be an indel.
84+
By default, no indels are incorporated (default = 0.0). The likelihood
85+
of adding an insertion/deletion is 50/50, with all characters equally likely
86+
to be inserted.
87+
""",
88+
action = "store",
89+
type = float,
90+
default = 0.0
91+
)
92+
93+
optional.add_argument(
94+
"-s",
95+
"--seed",
96+
metavar = "<INT>",
97+
help = "Optionally set the randomization seed",
98+
action = "store",
99+
type = int
100+
)
101+
102+
add_help(optional)
103+
104+
return parser
105+
106+
107+
def main():
108+
109+
parser = build_parser()
110+
111+
if len(sys.argv) == 1: # pragma: no cover
112+
parser.print_help(sys.stderr)
113+
sys.exit(0)
114+
115+
args = parser.parse_args()
116+
117+
available_substitutions, seed = preflight_checks_and_setup(args)
118+
119+
# doing mutations per sequence
120+
with (open (args.input_fasta, "r") as in_fasta,
121+
open(args.output_fasta, "w") as out_fasta,
122+
open(args.output_log, "w") as log):
123+
124+
# starting log file
125+
log.write(f"# seed used: {seed}\n")
126+
log.write(f"# indel frequency utilized: {args.indel_rate}\n")
127+
log.write("# the rest below is a tab-delimited table summarizing the changes introduced to each input sequence\n")
128+
log.write("seq_id\tseq_length\tnum_total_changes_made\tnum_substitutions\tnum_total_indels\tnum_insertions\tnum_deletions\n")
129+
130+
for seq_record in SeqIO.parse(in_fasta, "fasta"):
131+
132+
(seq, total_num_mutations, num_substitutions, num_indels,
133+
num_insertions, num_deletions) = mutate_seq(seq_record.seq, available_substitutions,
134+
args.mutation_rate, args.indel_rate)
135+
136+
out_fasta.write(f">{seq_record.id}\n")
137+
out_fasta.write(f"{seq}\n")
138+
139+
# writing out log file
140+
log.write(f"{seq_record.id}\t{len(seq_record)}\t{total_num_mutations}\t{num_substitutions}\t{num_indels}\t{num_insertions}\t{num_deletions}\n")
141+
142+
143+
def preflight_checks_and_setup(args):
144+
145+
check_files_are_found([args.input_fasta])
146+
147+
# checking mutation rate is within 0 and 1
148+
validate_mutation_rate(args.mutation_rate)
149+
150+
# checking indel frequency rate is within 0 and 1
151+
validate_indel_rate(args.indel_rate)
152+
153+
# setting substitution type
154+
available_substitutions = get_available_substitutions(args.molecule_type)
155+
156+
# setting seed if provided, and recording it for the output log either way
157+
seed = set_seed(args.seed)
158+
159+
return available_substitutions, seed
160+
161+
162+
def validate_mutation_rate(mutation_rate):
163+
164+
if not 0 <= mutation_rate <= 1:
165+
print("\n The '--mutation-rate' argument needs to be between 0 and and 1.\n")
166+
print(" Exiting for now.\n")
167+
sys.exit(1)
168+
169+
170+
def validate_indel_rate(indel_rate):
171+
172+
if not 0 <= indel_rate <= 1:
173+
print("\n The '--indel-rate' argument needs to be between 0 and and 1.\n")
174+
print(" Exiting for now.\n")
175+
sys.exit(1)
176+
177+
178+
def get_available_substitutions(molecule_type):
179+
180+
if molecule_type == 'NT':
181+
return ['A', 'T', 'C', 'G']
182+
183+
else:
184+
return ['A', 'R', 'N', 'D',
185+
'C', 'E', 'Q', 'G',
186+
'H', 'I', 'L', 'K',
187+
'M', 'F', 'P', 'S',
188+
'T', 'W', 'Y', 'V']
189+
190+
191+
def set_seed(input_seed):
192+
193+
if input_seed:
194+
195+
seed = input_seed
196+
197+
else:
198+
# setting seed manually (pseudo-randomly) BECAUSE it seems if you don't set it, you can't
199+
# pull it out otherwise (and i want to log it whether it was specified or not by the user)
200+
time = datetime.datetime.now()
201+
seed = time.hour * 10000 + time.minute * 100 + time.second
202+
203+
random.seed(seed)
204+
205+
return seed

bit/modules/get_cov_stats.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,7 @@
55
from dataclasses import dataclass, field
66
from Bio import SeqIO #type: ignore
77
from colorama import Fore, init
8-
from bit.modules.general import (check_files_are_found,
9-
notify_premature_exit,
10-
report_message)
8+
from bit.modules.general import check_files_are_found
119

1210

1311
init(autoreset=True)

bit/modules/lineage_to_tsv.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import re
2+
3+
def get_rank(lineage, prefix):
4+
5+
if lineage.startswith(prefix):
6+
7+
curr_rank = lineage.split(";")[0].replace(prefix, "", 1)
8+
9+
lineage = re.sub(f"^{prefix}{curr_rank};", "", lineage)
10+
11+
else:
12+
13+
curr_rank = "NA"
14+
15+
return(lineage, curr_rank)
16+
17+
18+
def convert_lineage_to_tsv(input_tsv, output_tsv, make_taxid):
19+
20+
with open(output_tsv, "w") as out_tab:
21+
22+
if make_taxid:
23+
out_tab.write("seq_ID\ttaxid\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
24+
else:
25+
out_tab.write("seq_ID\tdomain\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
26+
27+
for line in open(input_tsv):
28+
29+
line = line.strip().split("\t")
30+
ID = line[0]
31+
32+
# this is if there is no second column (no lineage)
33+
if len(line) == 1:
34+
35+
out_line = f"{ID}\tNA\tNA\tNA\tNA\tNA\tNA\tNA"
36+
37+
else:
38+
39+
lineage = line[1]
40+
41+
# removing "root" if that's at the start
42+
if lineage.startswith("root;"):
43+
lineage = re.sub("^root;", "", lineage)
44+
45+
# getting all ranks present, setting to NA if not
46+
lineage, t_domain = get_rank(lineage, "d__")
47+
lineage, t_phylum = get_rank(lineage, "p__")
48+
lineage, t_class = get_rank(lineage, "c__")
49+
lineage, t_order = get_rank(lineage, "o__")
50+
lineage, t_family = get_rank(lineage, "f__")
51+
lineage, t_genus = get_rank(lineage, "g__")
52+
lineage, t_species = get_rank(lineage, "s__")
53+
54+
if make_taxid:
55+
56+
taxid_string = "_".join([t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species]).replace(" ", "_")
57+
out_line = "\t".join([ID, taxid_string, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species])
58+
59+
else:
60+
61+
out_line = "\t".join([ID, t_domain, t_phylum, t_class, t_order, t_family, t_genus, t_species])
62+
63+
out_tab.write(out_line + "\n")

0 commit comments

Comments
 (0)