modularizing and adding test for bit-calc-variation-in-msa

Mike Lee · Mike Lee · commit 295bd82d7e5c · 2025-09-29T13:18:51.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,14 +12,16 @@
 
 -->
 
-## Not Yet Released
+## v1.13.3 (29-Sep-2025)
 
 ### Added
 - more test coverage of `bit-ez-screen`
-- unit tests for `bit-gen-kraken2-tax-plots` and `bit-kraken2-to-taxon-summaries`
+- unit tests for `bit-gen-kraken2-tax-plots`, `bit-kraken2-to-taxon-summaries`, and `bit-calc-variation-in-msa`
 - integration test for `bit-cov-analyzer`
 
+
 ### Changed
+- modularized `bit-calc-variation-in-msa`
 - updates to `bit-gen-kraken2-tax-plots`
   - modularized
   - appropriately adds domain letter to plots from GTDB tax kraken2 reports now
diff --git a/bit/cli/calc_variation_in_msa.py b/bit/cli/calc_variation_in_msa.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+import sys
+import argparse
+from bit.modules.seqs import calc_variation_in_msa
+from bit.cli.common import (CustomRichHelpFormatter,
+                            add_help)
+
+def main():
+    """Parse command-line arguments, compute per-column MSA variation, and write it as a TSV."""
+
+    description_text = """
+        This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column
+        (using: https://scikit.bio/docs/dev/generated/skbio.alignment.TabularMSA.html). In output, a "variation" value of 0 would
+        mean the same character in all sequences for that position (highest conservation); 1 would mean equal probability of any character
+        (greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters.
+        For version info, run `bit-version`.
+        """
+
+    arg_parser = argparse.ArgumentParser(
+        description=description_text,
+        epilog="Ex. usage: `bit-calc-variation-in-msa -i alignment.fasta`",
+        formatter_class=CustomRichHelpFormatter,
+        add_help=False,  # the help flag is registered via add_help() below
+    )
+
+    # the two argument groups used to organize the options
+    required_group = arg_parser.add_argument_group("REQUIRED PARAMETERS")
+    optional_group = arg_parser.add_argument_group("OPTIONAL PARAMETERS")
+
+    required_group.add_argument(
+        "-i", "--input-alignment-fasta",
+        metavar="<FILE>",
+        help="Input alignment fasta file",
+        required=True,
+    )
+
+    optional_group.add_argument(
+        "-o", "--output-tsv",
+        metavar="<FILE>",
+        help='Name of output tab-separated file (default: "variation.tsv")',
+        action="store",
+        default="variation.tsv",
+    )
+
+    optional_group.add_argument(
+        "-t", "--type",
+        metavar="<STR>",
+        help='Either "DNA" or "Protein" (default: "Protein")',
+        choices=["DNA", "Protein"],
+        action="store",
+        default="Protein",
+    )
+
+    optional_group.add_argument(
+        "-g", "--gap-treatment",
+        metavar="<STR>",
+        help='How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")',
+        choices=["nan", "ignore", "error", "include"],
+        action="store",
+        default="ignore",
+    )
+
+    add_help(optional_group)
+
+    # with no arguments at all, print the help text to stderr and exit with status 0
+    if len(sys.argv) == 1:  # pragma: no cover
+        arg_parser.print_help(sys.stderr)
+        sys.exit(0)
+
+    cli_args = arg_parser.parse_args()
+
+    # the computation lives in bit.modules.seqs; this wrapper only handles the file output
+    variation_table = calc_variation_in_msa(cli_args)
+    variation_table.to_csv(cli_args.output_tsv, sep="\t", index=False)
diff --git a/bit/modules/seqs.py b/bit/modules/seqs.py
@@ -1,4 +1,6 @@
 from Bio import SeqIO
+from skbio import TabularMSA, DNA, Protein
+import pandas as pd
 
 def calc_gc_per_seq(input_fasta):
     """
@@ -75,3 +77,26 @@ def filter_fasta_by_length(in_fasta, out_fasta, min_length, max_length):
                 out_file.write(">" + str(seq_record.description) + "\n" + str(seq_record.seq) + "\n")
 
     return (num_initial_seqs, num_seqs_retained, num_initial_bases, num_bases_retained)
+
+
+def calc_variation_in_msa(args):
+    """
+    Return a DataFrame of per-column "position", "variation", and "conservation" for an alignment.
+
+    Reads args.input_alignment_fasta as args.type ("DNA" or "Protein"), converts degenerate
+    characters to gaps, and passes args.gap_treatment to skbio's conservation() as gap_mode.
+    Raises ValueError if args.type is not "DNA" or "Protein".
+    """
+    # explicit lookup instead of eval(), so args.type can never be executed as code
+    constructors = {"DNA": DNA, "Protein": Protein}
+    if args.type not in constructors:
+        raise ValueError(f'type must be "DNA" or "Protein", got: {args.type!r}')
+    msa = TabularMSA.read(args.input_alignment_fasta, constructor=constructors[args.type], lowercase=True)
+
+    # converting degenerate bases to gaps
+    clean_msa = TabularMSA([seq.replace(seq.degenerates(), "-") for seq in msa])
+
+    conserved = clean_msa.conservation(gap_mode=args.gap_treatment)
+    positions = list(range(1, clean_msa.shape[1] + 1))
+    return pd.DataFrame({"position": positions, "variation": 1 - conserved, "conservation": conserved})
+
diff --git a/bit/scripts/bit-calc-variation-in-msa b/bit/scripts/bit-calc-variation-in-msa
@@ -1,52 +1,6 @@
 #!/usr/bin/env python
 
-from skbio import TabularMSA, DNA, Protein
-import pandas as pd
-import argparse
-import sys
+from bit.cli.calc_variation_in_msa import main
 
-parser = argparse.ArgumentParser(description='This script takes an alignment in fasta format as input and returns the Shannon uncertainty values for each column \
-                                              using: http://scikit-bio.org/docs/0.5.3/generated/skbio.alignment.TabularMSA.conservation.html. In output "variation" column: 0 is \
-                                              same character in all sequences for that position (highest conservation); 1 is equal probability of any character \
-                                              (greatest variability). "Conservation" column is inverse. As written, any ambiguous bases or residues are converted to gap characters. \
-                                              For version info, run `bit-version`.')
-
-required = parser.add_argument_group('required arguments')
-
-required.add_argument("-i", "--input_alignment_fasta", metavar = "<FILE>", help = "Input alignment fasta file", action = "store", dest = "input_alignment_fasta", required = True)
-
-parser.add_argument("-g", "--gap_treatment", metavar = "<STR>", help = 'How to treat gaps, either "nan", "ignore", "error", "include" (default: "ignore")', choices = ["nan", "ignore", "error", "include"], action = "store", dest = "gap_treatment", default = "ignore")
-parser.add_argument("-t", "--type", metavar = "<STR>", help = 'Either "DNA" or "Protein" (default: "Protein")', choices = ["DNA", "Protein"], action = "store", dest = "type", default = "Protein")
-parser.add_argument("-o", "--output_file", metavar = "<FILE>", help = 'Name of output tab-separated file (default: "variation.tsv")', action = "store", dest = "output_tsv", default = "variation.tsv")
-
-if len(sys.argv)==1:
-    parser.print_help(sys.stderr)
-    sys.exit(0)
-
-args = parser.parse_args()
-
-# i'm not certain unequal alignments are all that would throw this error, so i'm leaving this out for now so skbio just spits out their problem if they have one reading in the alignment
-# try:
-    # msa = TabularMSA.read(args.input_alignment_fasta, constructor=DNA)
-# except ValueError:
-#     print('\n\tSorry, it seems not all sequences in the alignment are the same length... :(\n')
-#     sys.exit(1)
-
-msa = TabularMSA.read(args.input_alignment_fasta, constructor=eval(args.type), lowercase=True)
-
-list_of_cleaned_seqs = []
-
-# converting degenerate bases to gaps
-for seq in msa:
-
-    seq = seq.replace(seq.degenerates(), "-")
-    list_of_cleaned_seqs.append(seq)
-
-clean_msa = TabularMSA(list_of_cleaned_seqs)
-
-conserved = clean_msa.conservation(gap_mode=args.gap_treatment)
-indexes = list(range(1,clean_msa.shape[1] + 1))
-
-df = pd.DataFrame({"position": indexes, "variation":1 - conserved, "conservation": conserved})
-
-df.to_csv(args.output_tsv, sep="\t", index=False)
+if __name__ == "__main__":
+    main()
diff --git a/bit/tests/test_seqs.py b/bit/tests/test_seqs.py
@@ -1,7 +1,8 @@
 from bit.modules.general import get_package_path
 import bit.modules.seqs as seqs
 from Bio import SeqIO
-
+import pandas as pd
+from types import SimpleNamespace
 
 test_targets_fasta = get_package_path("tests/data/ez-screen-targets.fasta")
 
@@ -66,3 +67,33 @@ def test_filter_fasta_by_length(tmp_path):
     assert str(records[0].seq) == "ATGCGT"
     assert records[1].id == "seq3"
     assert str(records[1].seq) == "ATGCGTAA"
+
+
+def test_calc_variation_in_msa(tmp_path):
+    """
+    Check calc_variation_in_msa on two 8-column DNA sequences that differ only at position 8.
+    """
+    alignment_path = tmp_path / "test.fasta"
+    alignment_path.write_text(">seq1\nATGCATGC\n>seq2\nATGCATGA\n")
+
+    # stand-in for the argparse namespace the CLI would pass
+    mock_args = SimpleNamespace(
+        input_alignment_fasta=str(alignment_path),
+        output_tsv=str(tmp_path / "variation.tsv"),
+        type="DNA",
+        gap_treatment="ignore",
+    )
+
+    result = seqs.calc_variation_in_msa(mock_args)
+
+    # one row per alignment column, with exactly the three expected columns
+    assert set(result.columns) == {"position", "variation", "conservation"}
+    assert len(result) == 8
+
+    # variation and conservation must sum to 1 on every row
+    for variation, conservation in zip(result["variation"], result["conservation"]):
+        assert abs(variation + conservation - 1) < 1e-6
+
+    # position 8 holds C in one sequence and A in the other
+    last_position = result[result["position"] == 8].iloc[0]
+    assert abs(last_position["variation"] - 0.5) < 1e-6