Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# 0.1.5 2025-03-xx

- Add support for merging contig IDs across multiple VCFs (#342)

# 0.1.4 2025-03-10

- Fix bug in handling all-missing genotypes (#328)
Expand Down
14 changes: 14 additions & 0 deletions bio2zarr/vcf2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1):
# are compatible.
all_partitions = []
total_records = 0
contigs = {}
for metadata, _ in results:
for partition in metadata.partitions:
logger.debug(f"Scanned partition {partition}")
all_partitions.append(partition)
for contig in metadata.contigs:
if contig.id in contigs:
if contig != contigs[contig.id]:
raise ValueError(
"Incompatible contig definitions: "
f"{contig} != {contigs[contig.id]}"
)
else:
contigs[contig.id] = contig
total_records += metadata.num_records
metadata.num_records = 0
metadata.partitions = []

contig_union = list(contigs.values())
for metadata, _ in results:
metadata.contigs = contig_union

icf_metadata, header = results[0]
for metadata, _ in results[1:]:
if metadata != icf_metadata:
Expand Down
122 changes: 91 additions & 31 deletions tests/test_simulated_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,101 @@
from bio2zarr import vcf2zarr


def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000):
# Import here to avoid problems on OSX (see below)
# https://github.com/sgkit-dev/bio2zarr/issues/336
import msprime

ts = msprime.sim_ancestry(
num_samples,
population_size=10**4,
ploidy=ploidy,
sequence_length=sequence_length,
random_seed=seed,
)
tables = ts.dump_tables()
# Lazy hard coding of states here to make things simpler
for u in range(ts.num_nodes - 1):
site = tables.sites.add_row(u + 1, "A")
tables.mutations.add_row(site, derived_state="T", node=u)
return tables.tree_sequence()


def assert_ts_ds_equal(ts, ds, ploidy=1):
assert ds.sizes["ploidy"] == ploidy
assert ds.sizes["variants"] == ts.num_sites
assert ds.sizes["samples"] == ts.num_individuals
# Msprime guarantees that this will be true.
nt.assert_array_equal(
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
ds.call_genotype.values,
)
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
nt.assert_equal(ds.variant_position, ts.sites_position)


def write_vcf(ts, vcf_path, contig_id="1"):
with open(vcf_path, "w") as f:
ts.write_vcf(f, contig_id=contig_id)
# This also compresses the input file
pysam.tabix_index(str(vcf_path), preset="vcf")
return vcf_path.with_suffix(vcf_path.suffix + ".gz")


# https://github.com/sgkit-dev/bio2zarr/issues/336
@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
class TestTskitRoundTripVcf:
@pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
def test_ploidy(self, ploidy, tmp_path):
# FIXME importing here so pytest.skip avoids importing msprime.
import msprime

ts = msprime.sim_ancestry(
2,
population_size=10**4,
ploidy=ploidy,
sequence_length=100_000,
random_seed=42,
)
tables = ts.dump_tables()
for u in ts.samples():
site = tables.sites.add_row(u + 1, "A")
tables.mutations.add_row(site, derived_state="T", node=u)
ts = tables.tree_sequence()
vcf_file = tmp_path / "sim.vcf"
with open(vcf_file, "w") as f:
ts.write_vcf(f)
# This also compresses the input file
pysam.tabix_index(str(vcf_file), preset="vcf")
ts = run_simulation(ploidy=ploidy)
vcf_path = write_vcf(ts, tmp_path / "sim.vcf")
out = tmp_path / "example.vcf.zarr"
vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
vcf2zarr.convert([vcf_path], out)
ds = sg.load_dataset(out)
assert ds.sizes["ploidy"] == ploidy
assert ds.sizes["variants"] == ts.num_sites
assert ds.sizes["samples"] == ts.num_individuals
# Msprime guarantees that this will be true.
nt.assert_array_equal(
ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
ds.call_genotype.values,
assert_ts_ds_equal(ts, ds, ploidy)

@pytest.mark.parametrize(
"contig_ids",
[["ContigName"], ["1", "2"], ["2", "3", "1"], ["a", "b", "c", "d", "e"]],
)
def test_multi_contig(self, contig_ids, tmp_path):
vcfs = []
tss = {}
for seed, contig_id in enumerate(contig_ids, 1):
ts = run_simulation(num_samples=8, seed=seed)
vcf_path = write_vcf(ts, tmp_path / f"{contig_id}.vcf", contig_id=contig_id)
vcfs.append(vcf_path)
tss[contig_id] = ts

out = tmp_path / "example.vcf.zarr"
vcf2zarr.convert(vcfs, out)
ds = sg.load_dataset(out).set_index(
variants=("variant_contig", "variant_position")
)
assert ds.sizes["ploidy"] == 1
assert ds.sizes["contigs"] == len(contig_ids)
# Files processed in order of sorted filename, and the contigs therefore
# get sorted into this order.
nt.assert_equal(ds["contig_id"].values, sorted(contig_ids))
nt.assert_equal(
ds["contig_length"].values, [ts.sequence_length for ts in tss.values()]
)
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
nt.assert_equal(ds.variant_position, ts.sites_position)
for contig_id in contig_ids:
contig = list(ds.contig_id).index(contig_id)
dss = ds.sel(variants=(contig, slice(0, None)))
assert_ts_ds_equal(tss[contig_id], dss)


# https://github.com/sgkit-dev/bio2zarr/issues/336
@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
class TestIncompatibleContigs:
def test_different_lengths(self, tmp_path):
vcfs = []
for length in [100_000, 99_999]:
ts = run_simulation(sequence_length=length)
vcf_path = write_vcf(ts, tmp_path / f"{length}.vcf", contig_id="1")
vcfs.append(vcf_path)
out = tmp_path / "example.vcf.zarr"
with pytest.raises(ValueError, match="Incompatible contig definitions"):
vcf2zarr.convert(vcfs, out)