Skip to content

Commit 1f9274b

Browse files
Add support for multiple contigs across headers
Closes #334
1 parent 8f7c672 commit 1f9274b

File tree

3 files changed

+109
-31
lines changed

3 files changed

+109
-31
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# 0.1.5 2025-03-xx
2+
3+
- Add support for merging contig IDs across multiple VCFs (#342)
4+
15
# 0.1.4 2025-03-10
26

37
- Fix bug in handling all-missing genotypes (#328)

bio2zarr/vcf2zarr/icf.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
324324
# are compatible.
325325
all_partitions = []
326326
total_records = 0
327+
contigs = {}
327328
for metadata, _ in results:
328329
for partition in metadata.partitions:
329330
logger.debug(f"Scanned partition {partition}")
330331
all_partitions.append(partition)
332+
for contig in metadata.contigs:
333+
if contig.id in contigs:
334+
if contig != contigs[contig.id]:
335+
raise ValueError(
336+
"Incompatible contig definitions: "
337+
f"{contig} != {contigs[contig.id]}"
338+
)
339+
else:
340+
contigs[contig.id] = contig
331341
total_records += metadata.num_records
332342
metadata.num_records = 0
333343
metadata.partitions = []
334344

345+
contig_union = list(contigs.values())
346+
for metadata, _ in results:
347+
metadata.contigs = contig_union
348+
335349
icf_metadata, header = results[0]
336350
for metadata, _ in results[1:]:
337351
if metadata != icf_metadata:

tests/test_simulated_data.py

Lines changed: 91 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,41 +8,101 @@
88
from bio2zarr import vcf2zarr
99

1010

11+
def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000):
12+
# Import here to avoid problems on OSX (see below)
13+
# https://github.com/sgkit-dev/bio2zarr/issues/336
14+
import msprime
15+
16+
ts = msprime.sim_ancestry(
17+
num_samples,
18+
population_size=10**4,
19+
ploidy=ploidy,
20+
sequence_length=sequence_length,
21+
random_seed=seed,
22+
)
23+
tables = ts.dump_tables()
24+
# Lazy hard coding of states here to make things simpler
25+
for u in range(ts.num_nodes - 1):
26+
site = tables.sites.add_row(u + 1, "A")
27+
tables.mutations.add_row(site, derived_state="T", node=u)
28+
return tables.tree_sequence()
29+
30+
31+
def assert_ts_ds_equal(ts, ds, ploidy=1):
32+
assert ds.sizes["ploidy"] == ploidy
33+
assert ds.sizes["variants"] == ts.num_sites
34+
assert ds.sizes["samples"] == ts.num_individuals
35+
# Msprime guarantees that this will be true.
36+
nt.assert_array_equal(
37+
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
38+
ds.call_genotype.values,
39+
)
40+
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
41+
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
42+
nt.assert_equal(ds.variant_position, ts.sites_position)
43+
44+
45+
def write_vcf(ts, vcf_path, contig_id="1"):
46+
with open(vcf_path, "w") as f:
47+
ts.write_vcf(f, contig_id=contig_id)
48+
# This also compresses the input file
49+
pysam.tabix_index(str(vcf_path), preset="vcf")
50+
return vcf_path.with_suffix(vcf_path.suffix + ".gz")
51+
52+
53+
# https://github.com/sgkit-dev/bio2zarr/issues/336
1154
@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
1255
class TestTskitRoundTripVcf:
1356
@pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
1457
def test_ploidy(self, ploidy, tmp_path):
15-
# FIXME importing here so pytest.skip avoids importing msprime.
16-
import msprime
17-
18-
ts = msprime.sim_ancestry(
19-
2,
20-
population_size=10**4,
21-
ploidy=ploidy,
22-
sequence_length=100_000,
23-
random_seed=42,
24-
)
25-
tables = ts.dump_tables()
26-
for u in ts.samples():
27-
site = tables.sites.add_row(u + 1, "A")
28-
tables.mutations.add_row(site, derived_state="T", node=u)
29-
ts = tables.tree_sequence()
30-
vcf_file = tmp_path / "sim.vcf"
31-
with open(vcf_file, "w") as f:
32-
ts.write_vcf(f)
33-
# This also compresses the input file
34-
pysam.tabix_index(str(vcf_file), preset="vcf")
58+
ts = run_simulation(ploidy=ploidy)
59+
vcf_path = write_vcf(ts, tmp_path / "sim.vcf")
3560
out = tmp_path / "example.vcf.zarr"
36-
vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
61+
vcf2zarr.convert([vcf_path], out)
3762
ds = sg.load_dataset(out)
38-
assert ds.sizes["ploidy"] == ploidy
39-
assert ds.sizes["variants"] == ts.num_sites
40-
assert ds.sizes["samples"] == ts.num_individuals
41-
# Msprime guarantees that this will be true.
42-
nt.assert_array_equal(
43-
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
44-
ds.call_genotype.values,
63+
assert_ts_ds_equal(ts, ds, ploidy)
64+
65+
@pytest.mark.parametrize(
66+
"contig_ids",
67+
[["ContigName"], ["1", "2"], ["2", "3", "1"], ["a", "b", "c", "d", "e"]],
68+
)
69+
def test_multi_contig(self, contig_ids, tmp_path):
70+
vcfs = []
71+
tss = {}
72+
for seed, contig_id in enumerate(contig_ids, 1):
73+
ts = run_simulation(num_samples=8, seed=seed)
74+
vcf_path = write_vcf(ts, tmp_path / f"{contig_id}.vcf", contig_id=contig_id)
75+
vcfs.append(vcf_path)
76+
tss[contig_id] = ts
77+
78+
out = tmp_path / "example.vcf.zarr"
79+
vcf2zarr.convert(vcfs, out)
80+
ds = sg.load_dataset(out).set_index(
81+
variants=("variant_contig", "variant_position")
82+
)
83+
assert ds.sizes["ploidy"] == 1
84+
assert ds.sizes["contigs"] == len(contig_ids)
85+
# Files processed in order of sorted filename, and the contigs therefore
86+
# get sorted into this order.
87+
nt.assert_equal(ds["contig_id"].values, sorted(contig_ids))
88+
nt.assert_equal(
89+
ds["contig_length"].values, [ts.sequence_length for ts in tss.values()]
4590
)
46-
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
47-
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
48-
nt.assert_equal(ds.variant_position, ts.sites_position)
91+
for contig_id in contig_ids:
92+
contig = list(ds.contig_id).index(contig_id)
93+
dss = ds.sel(variants=(contig, slice(0, None)))
94+
assert_ts_ds_equal(tss[contig_id], dss)
95+
96+
97+
# https://github.com/sgkit-dev/bio2zarr/issues/336
98+
@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
99+
class TestIncompatibleContigs:
100+
def test_different_lengths(self, tmp_path):
101+
vcfs = []
102+
for length in [100_000, 99_999]:
103+
ts = run_simulation(sequence_length=length)
104+
vcf_path = write_vcf(ts, tmp_path / f"{length}.vcf", contig_id="1")
105+
vcfs.append(vcf_path)
106+
out = tmp_path / "example.vcf.zarr"
107+
with pytest.raises(ValueError, match="Incompatible contig definitions"):
108+
vcf2zarr.convert(vcfs, out)

0 commit comments

Comments
 (0)