Skip to content

Commit 295bd82

Browse files
Mike LeeMike Lee
authored andcommitted
modularizing and adding test for bit-calc-variation-in-msa
1 parent f96bd69 commit 295bd82

File tree

5 files changed

+138
-52
lines changed

5 files changed

+138
-52
lines changed

CHANGELOG.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,16 @@
1212
1313
-->
1414

15-
## Not Yet Released
15+
## v1.13.3 (29-Sep-2025)
1616

1717
### Added
1818
- more test coverage of `bit-ez-screen`
19-
- unit tests for `bit-gen-kraken2-tax-plots` and `bit-kraken2-to-taxon-summaries`
19+
- unit tests for `bit-gen-kraken2-tax-plots`, `bit-kraken2-to-taxon-summaries`, and `bit-calc-variation-in-msa`
2020
- integration test for `bit-cov-analyzer`
2121

22+
2223
### Changed
24+
- modularized `bit-calc-variation-in-msa`
2325
- updates to `bit-gen-kraken2-tax-plots`
2426
- modularized
2527
- appropriately adds domain letter to plots from GTDB tax kraken2 reports now

bit/cli/calc_variation_in_msa.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/usr/bin/env python
2+
import sys
3+
import argparse
4+
from bit.modules.seqs import calc_variation_in_msa
5+
from bit.cli.common import (CustomRichHelpFormatter,
6+
add_help)
7+
8+
def main():
9+
10+
desc = """
11+
This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column
12+
(using: https://scikit.bio/docs/dev/generated/skbio.alignment.TabularMSA.html). In output, a "variation" value of 0 would
13+
mean the same character in all sequences for that position (highest conservation); 1 would mean equal probability of any character
14+
(greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters.
15+
For version info, run `bit-version`.
16+
"""
17+
18+
parser = argparse.ArgumentParser(
19+
description=desc,
20+
epilog="Ex. usage: `bit-calc-variation-in-msa -i alignment.fasta`",
21+
formatter_class=CustomRichHelpFormatter,
22+
add_help=False
23+
)
24+
25+
required = parser.add_argument_group("REQUIRED PARAMETERS")
26+
optional = parser.add_argument_group("OPTIONAL PARAMETERS")
27+
28+
required.add_argument(
29+
"-i",
30+
"--input-alignment-fasta",
31+
metavar="<FILE>",
32+
help="Input alignment fasta file",
33+
required=True,
34+
)
35+
36+
optional.add_argument(
37+
"-o",
38+
"--output-tsv",
39+
metavar="<FILE>",
40+
help='Name of output tab-separated file (default: "variation.tsv")',
41+
action="store",
42+
default="variation.tsv"
43+
)
44+
45+
optional.add_argument(
46+
"-t",
47+
"--type",
48+
metavar="<STR>",
49+
help='Either "DNA" or "Protein" (default: "Protein")',
50+
choices=["DNA", "Protein"],
51+
action="store",
52+
default="Protein"
53+
)
54+
55+
optional.add_argument(
56+
"-g",
57+
"--gap-treatment",
58+
metavar="<STR>",
59+
help='How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")',
60+
choices=["nan", "ignore", "error", "include"],
61+
action="store",
62+
default="ignore"
63+
)
64+
65+
add_help(optional)
66+
67+
if len(sys.argv) == 1: # pragma: no cover
68+
parser.print_help(sys.stderr)
69+
sys.exit(0)
70+
71+
args = parser.parse_args()
72+
73+
df = calc_variation_in_msa(args)
74+
df.to_csv(args.output_tsv, sep="\t", index=False)

bit/modules/seqs.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from Bio import SeqIO
2+
from skbio import TabularMSA, DNA, Protein
3+
import pandas as pd
24

35
def calc_gc_per_seq(input_fasta):
46
"""
@@ -75,3 +77,26 @@ def filter_fasta_by_length(in_fasta, out_fasta, min_length, max_length):
7577
out_file.write(">" + str(seq_record.description) + "\n" + str(seq_record.seq) + "\n")
7678

7779
return (num_initial_seqs, num_seqs_retained, num_initial_bases, num_bases_retained)
80+
81+
82+
def calc_variation_in_msa(args):
83+
84+
msa = TabularMSA.read(args.input_alignment_fasta, constructor=eval(args.type), lowercase=True)
85+
86+
list_of_cleaned_seqs = []
87+
88+
# converting degenerate bases to gaps
89+
for seq in msa:
90+
91+
seq = seq.replace(seq.degenerates(), "-")
92+
list_of_cleaned_seqs.append(seq)
93+
94+
clean_msa = TabularMSA(list_of_cleaned_seqs)
95+
96+
conserved = clean_msa.conservation(gap_mode=args.gap_treatment)
97+
indexes = list(range(1,clean_msa.shape[1] + 1))
98+
99+
df = pd.DataFrame({"position": indexes, "variation":1 - conserved, "conservation": conserved})
100+
101+
return df
102+
Lines changed: 3 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,6 @@
11
#!/usr/bin/env python
22

3-
from skbio import TabularMSA, DNA, Protein
4-
import pandas as pd
5-
import argparse
6-
import sys
3+
from bit.cli.calc_variation_in_msa import main
74

8-
parser = argparse.ArgumentParser(description='This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column \
9-
using: http://scikit-bio.org/docs/0.5.3/generated/skbio.alignment.TabularMSA.conservation.html. In output "variation" column: 0 is \
10-
same character in all sequences for that position (highest conservation); 1 is equal probability of any character \
11-
(greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters. \
12-
For version info, run `bit-version`.')
13-
14-
required = parser.add_argument_group('required arguments')
15-
16-
required.add_argument("-i", "--input_alignment_fasta", metavar = "<FILE>", help = "Input alignment fasta file", action = "store", dest = "input_alignment_fasta", required = True)
17-
18-
parser.add_argument("-g", "--gap_treatment", metavar = "<STR>", help = 'How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")', choices = ["nan", "ignore", "error", "include"], action = "store", dest = "gap_treatment", default = "ignore")
19-
parser.add_argument("-t", "--type", metavar = "<STR>", help = 'Either "DNA" or "Protein" (default: "Protein")', choices = ["DNA", "Protein"], action = "store", dest = "type", default = "Protein")
20-
parser.add_argument("-o", "--output_file", metavar = "<FILE>", help = 'Name of output tab-separated file (default: "variation.tsv")', action = "store", dest = "output_tsv", default = "variation.tsv")
21-
22-
if len(sys.argv)==1:
23-
parser.print_help(sys.stderr)
24-
sys.exit(0)
25-
26-
args = parser.parse_args()
27-
28-
# i'm not certain unequal alignments are all that would throw this error, so i'm leaving this out for now so skbio just spits out their problem if they have one reading in the alignment
29-
# try:
30-
# msa = TabularMSA.read(args.input_alignment_fasta, constructor=DNA)
31-
# except ValueError:
32-
# print('\n\tSorry, it seems not all sequences in the alignment are the same length... :(\n')
33-
# sys.exit(1)
34-
35-
msa = TabularMSA.read(args.input_alignment_fasta, constructor=eval(args.type), lowercase=True)
36-
37-
list_of_cleaned_seqs = []
38-
39-
# converting degenerate bases to gaps
40-
for seq in msa:
41-
42-
seq = seq.replace(seq.degenerates(), "-")
43-
list_of_cleaned_seqs.append(seq)
44-
45-
clean_msa = TabularMSA(list_of_cleaned_seqs)
46-
47-
conserved = clean_msa.conservation(gap_mode=args.gap_treatment)
48-
indexes = list(range(1,clean_msa.shape[1] + 1))
49-
50-
df = pd.DataFrame({"position": indexes, "variation":1 - conserved, "conservation": conserved})
51-
52-
df.to_csv(args.output_tsv, sep="\t", index=False)
5+
if __name__ == "__main__":
6+
main()

bit/tests/test_seqs.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
from bit.modules.general import get_package_path
22
import bit.modules.seqs as seqs
33
from Bio import SeqIO
4-
4+
import pandas as pd
5+
from types import SimpleNamespace
56

67
test_targets_fasta = get_package_path("tests/data/ez-screen-targets.fasta")
78

@@ -66,3 +67,33 @@ def test_filter_fasta_by_length(tmp_path):
6667
assert str(records[0].seq) == "ATGCGT"
6768
assert records[1].id == "seq3"
6869
assert str(records[1].seq) == "ATGCGTAA"
70+
71+
72+
def test_calc_variation_in_msa(tmp_path):
73+
fasta_file = tmp_path / "test.fasta"
74+
fasta_file.write_text(""">seq1
75+
ATGCATGC
76+
>seq2
77+
ATGCATGA
78+
""")
79+
80+
output_file = tmp_path / "variation.tsv"
81+
82+
# mocking args
83+
args = SimpleNamespace(
84+
input_alignment_fasta=str(fasta_file),
85+
output_tsv=str(output_file),
86+
type="DNA",
87+
gap_treatment="ignore"
88+
)
89+
90+
df = seqs.calc_variation_in_msa(args)
91+
92+
assert set(df.columns) == {"position", "variation", "conservation"}
93+
assert len(df) == 8
94+
95+
for i, row in df.iterrows():
96+
assert abs(row["variation"] + row["conservation"] - 1) < 1e-6
97+
98+
row_8 = df[df["position"] == 8].iloc[0]
99+
assert abs(row_8["variation"] - 0.5) < 1e-6

0 commit comments

Comments
 (0)