Add support for multiple contigs across headers

jeromekelleher · jeromekelleher · commit 1f9274b78434 · 2025-03-20T09:29:16.000Z
Closes #334
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 0.1.5 2025-03-xx
+
+- Add support for merging contig IDs across multiple VCFs (#342)
+
 # 0.1.4 2025-03-10
 
 - Fix bug in handling all-missing genotypes (#328)
diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py
@@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
     # are compatible.
     all_partitions = []
     total_records = 0
+    contigs = {}
     for metadata, _ in results:
         for partition in metadata.partitions:
             logger.debug(f"Scanned partition {partition}")
             all_partitions.append(partition)
+        for contig in metadata.contigs:
+            if contig.id in contigs:
+                if contig != contigs[contig.id]:
+                    raise ValueError(
+                        "Incompatible contig definitions: "
+                        f"{contig} != {contigs[contig.id]}"
+                    )
+            else:
+                contigs[contig.id] = contig
         total_records += metadata.num_records
         metadata.num_records = 0
         metadata.partitions = []
 
+    contig_union = list(contigs.values())
+    for metadata, _ in results:
+        metadata.contigs = contig_union
+
     icf_metadata, header = results[0]
     for metadata, _ in results[1:]:
         if metadata != icf_metadata:
diff --git a/tests/test_simulated_data.py b/tests/test_simulated_data.py
@@ -8,41 +8,101 @@
 from bio2zarr import vcf2zarr
 
 
+def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000):
+    # Import here to avoid problems on OSX (see below)
+    # https://github.com/sgkit-dev/bio2zarr/issues/336
+    import msprime
+
+    ts = msprime.sim_ancestry(
+        num_samples,
+        population_size=10**4,
+        ploidy=ploidy,
+        sequence_length=sequence_length,
+        random_seed=seed,
+    )
+    tables = ts.dump_tables()
+    # Lazy hard coding of states here to make things simpler
+    for u in range(ts.num_nodes - 1):
+        site = tables.sites.add_row(u + 1, "A")
+        tables.mutations.add_row(site, derived_state="T", node=u)
+    return tables.tree_sequence()
+
+
+def assert_ts_ds_equal(ts, ds, ploidy=1):
+    assert ds.sizes["ploidy"] == ploidy
+    assert ds.sizes["variants"] == ts.num_sites
+    assert ds.sizes["samples"] == ts.num_individuals
+    # Msprime guarantees that this will be true.
+    nt.assert_array_equal(
+        ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
+        ds.call_genotype.values,
+    )
+    nt.assert_equal(ds.variant_allele[:, 0].values, "A")
+    nt.assert_equal(ds.variant_allele[:, 1].values, "T")
+    nt.assert_equal(ds.variant_position, ts.sites_position)
+
+
+def write_vcf(ts, vcf_path, contig_id="1"):
+    with open(vcf_path, "w") as f:
+        ts.write_vcf(f, contig_id=contig_id)
+    # This also compresses the input file
+    pysam.tabix_index(str(vcf_path), preset="vcf")
+    return vcf_path.with_suffix(vcf_path.suffix + ".gz")
+
+
+# https://github.com/sgkit-dev/bio2zarr/issues/336
 @pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
 class TestTskitRoundTripVcf:
     @pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
     def test_ploidy(self, ploidy, tmp_path):
-        # FIXME importing here so pytest.skip avoids importing msprime.
-        import msprime
-
-        ts = msprime.sim_ancestry(
-            2,
-            population_size=10**4,
-            ploidy=ploidy,
-            sequence_length=100_000,
-            random_seed=42,
-        )
-        tables = ts.dump_tables()
-        for u in ts.samples():
-            site = tables.sites.add_row(u + 1, "A")
-            tables.mutations.add_row(site, derived_state="T", node=u)
-        ts = tables.tree_sequence()
-        vcf_file = tmp_path / "sim.vcf"
-        with open(vcf_file, "w") as f:
-            ts.write_vcf(f)
-        # This also compresses the input file
-        pysam.tabix_index(str(vcf_file), preset="vcf")
+        ts = run_simulation(ploidy=ploidy)
+        vcf_path = write_vcf(ts, tmp_path / "sim.vcf")
         out = tmp_path / "example.vcf.zarr"
-        vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
+        vcf2zarr.convert([vcf_path], out)
         ds = sg.load_dataset(out)
-        assert ds.sizes["ploidy"] == ploidy
-        assert ds.sizes["variants"] == ts.num_sites
-        assert ds.sizes["samples"] == ts.num_individuals
-        # Msprime guarantees that this will be true.
-        nt.assert_array_equal(
-            ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
-            ds.call_genotype.values,
+        assert_ts_ds_equal(ts, ds, ploidy)
+
+    @pytest.mark.parametrize(
+        "contig_ids",
+        [["ContigName"], ["1", "2"], ["2", "3", "1"], ["a", "b", "c", "d", "e"]],
+    )
+    def test_multi_contig(self, contig_ids, tmp_path):
+        vcfs = []
+        tss = {}
+        for seed, contig_id in enumerate(contig_ids, 1):
+            ts = run_simulation(num_samples=8, seed=seed)
+            vcf_path = write_vcf(ts, tmp_path / f"{contig_id}.vcf", contig_id=contig_id)
+            vcfs.append(vcf_path)
+            tss[contig_id] = ts
+
+        out = tmp_path / "example.vcf.zarr"
+        vcf2zarr.convert(vcfs, out)
+        ds = sg.load_dataset(out).set_index(
+            variants=("variant_contig", "variant_position")
+        )
+        assert ds.sizes["ploidy"] == 1
+        assert ds.sizes["contigs"] == len(contig_ids)
+        # Files processed in order of sorted filename, and the contigs therefore
+        # get sorted into this order.
+        nt.assert_equal(ds["contig_id"].values, sorted(contig_ids))
+        nt.assert_equal(
+            ds["contig_length"].values, [ts.sequence_length for ts in tss.values()]
         )
-        nt.assert_equal(ds.variant_allele[:, 0].values, "A")
-        nt.assert_equal(ds.variant_allele[:, 1].values, "T")
-        nt.assert_equal(ds.variant_position, ts.sites_position)
+        for contig_id in contig_ids:
+            contig = list(ds.contig_id).index(contig_id)
+            dss = ds.sel(variants=(contig, slice(0, None)))
+            assert_ts_ds_equal(tss[contig_id], dss)
+
+
+# https://github.com/sgkit-dev/bio2zarr/issues/336
+@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
+class TestIncompatibleContigs:
+    def test_different_lengths(self, tmp_path):
+        vcfs = []
+        for length in [100_000, 99_999]:
+            ts = run_simulation(sequence_length=length)
+            vcf_path = write_vcf(ts, tmp_path / f"{length}.vcf", contig_id="1")
+            vcfs.append(vcf_path)
+        out = tmp_path / "example.vcf.zarr"
+        with pytest.raises(ValueError, match="Incompatible contig definitions"):
+            vcf2zarr.convert(vcfs, out)