Refactor simulation tests

jeromekelleher · jeromekelleher · commit 3157637a0b2d · 2025-03-19T10:28:47.000Z
diff --git a/tests/test_simulated_data.py b/tests/test_simulated_data.py
@@ -1,5 +1,6 @@
 import sys
 
+import msprime
 import numpy.testing as nt
 import pysam
 import pytest
@@ -8,32 +9,37 @@
 from bio2zarr import vcf2zarr
 
 
+def run_simulation(ploidy=1):  # tree sequence with one A>T site per sample node
+    ts = msprime.sim_ancestry(  # NOTE(review): import now runs even when skipped
+        2,
+        population_size=10**4,
+        ploidy=ploidy,
+        sequence_length=100_000,
+        random_seed=42,  # fixed seed keeps the simulation reproducible
+    )
+    tables = ts.dump_tables()  # sites/mutations are added to these tables
+    for u in ts.samples():
+        site = tables.sites.add_row(u + 1, "A")  # site at position u + 1, state "A"
+        tables.mutations.add_row(site, derived_state="T", node=u)
+    return tables.tree_sequence()
+
+
+def write_vcf(ts, vcf_path, contig_id="1"):  # returns vcf_path with ".gz" appended
+    with open(vcf_path, "w") as f:
+        ts.write_vcf(f, contig_id=contig_id)
+    # Index with tabix; this also compresses the input file
+    pysam.tabix_index(str(vcf_path), preset="vcf")
+    return vcf_path.with_suffix(vcf_path.suffix + ".gz")  # sim.vcf -> sim.vcf.gz
+
+
 @pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
 class TestTskitRoundTripVcf:
     @pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
     def test_ploidy(self, ploidy, tmp_path):
-        # FIXME importing here so pytest.skip avoids importing msprime.
-        import msprime
-
-        ts = msprime.sim_ancestry(
-            2,
-            population_size=10**4,
-            ploidy=ploidy,
-            sequence_length=100_000,
-            random_seed=42,
-        )
-        tables = ts.dump_tables()
-        for u in ts.samples():
-            site = tables.sites.add_row(u + 1, "A")
-            tables.mutations.add_row(site, derived_state="T", node=u)
-        ts = tables.tree_sequence()
-        vcf_file = tmp_path / "sim.vcf"
-        with open(vcf_file, "w") as f:
-            ts.write_vcf(f)
-        # This also compresses the input file
-        pysam.tabix_index(str(vcf_file), preset="vcf")
+        ts = run_simulation(ploidy=ploidy)
+        vcf_path = write_vcf(ts, tmp_path / "sim.vcf")
         out = tmp_path / "example.vcf.zarr"
-        vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
+        vcf2zarr.convert([vcf_path], out)
         ds = sg.load_dataset(out)
         assert ds.sizes["ploidy"] == ploidy
         assert ds.sizes["variants"] == ts.num_sites
@@ -46,3 +52,28 @@ def test_ploidy(self, ploidy, tmp_path):
         nt.assert_equal(ds.variant_allele[:, 0].values, "A")
         nt.assert_equal(ds.variant_allele[:, 1].values, "T")
         nt.assert_equal(ds.variant_position, ts.sites_position)
+
+    # @pytest.mark.parametrize("contig_ids", [["A"], ["1", "2"]])
+    # def test_multi_contig(self, contig_ids, tmp_path):
+    #     ts = run_simulation()
+    #     # for contig in range(num_contigs):
+    #     #     vcf_file = tmp_path / "sim.vcf"
+    #     #     with open(vcf_file, "w") as f:
+    #     #         ts.write_vcf(f, contig_id=)
+    #     #     # This also compresses the input file
+    #     #     pysam.tabix_index(str(vcf_file), preset="vcf")
+
+    #     out = tmp_path / "example.vcf.zarr"
+    #     vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
+    #     ds = sg.load_dataset(out)
+    #     assert ds.sizes["ploidy"] == ploidy
+    #     assert ds.sizes["variants"] == ts.num_sites
+    #     assert ds.sizes["samples"] == ts.num_individuals
+    #     # Msprime guarantees that this will be true.
+    #     nt.assert_array_equal(
+    #         ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
+    #         ds.call_genotype.values,
+    #     )
+    #     nt.assert_equal(ds.variant_allele[:, 0].values, "A")
+    #     nt.assert_equal(ds.variant_allele[:, 1].values, "T")
+    #     nt.assert_equal(ds.variant_position, ts.sites_position)