diff --git a/CHANGELOG.md b/CHANGELOG.md index ccfb30bf..fdbe8a8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 0.1.5 2025-03-xx + +- Add support for merging contig IDs across multiple VCFs (#342) + # 0.1.4 2025-03-10 - Fix bug in handling all-missing genotypes (#328) diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py index bd124cb2..32c4b9a0 100644 --- a/bio2zarr/vcf2zarr/icf.py +++ b/bio2zarr/vcf2zarr/icf.py @@ -324,14 +324,28 @@ def scan_vcfs(paths, show_progress, target_num_partitions, worker_processes=1): # are compatible. all_partitions = [] total_records = 0 + contigs = {} for metadata, _ in results: for partition in metadata.partitions: logger.debug(f"Scanned partition {partition}") all_partitions.append(partition) + for contig in metadata.contigs: + if contig.id in contigs: + if contig != contigs[contig.id]: + raise ValueError( + "Incompatible contig definitions: " + f"{contig} != {contigs[contig.id]}" + ) + else: + contigs[contig.id] = contig total_records += metadata.num_records metadata.num_records = 0 metadata.partitions = [] + contig_union = list(contigs.values()) + for metadata, _ in results: + metadata.contigs = contig_union + icf_metadata, header = results[0] for metadata, _ in results[1:]: if metadata != icf_metadata: diff --git a/tests/test_simulated_data.py b/tests/test_simulated_data.py index 9067ab81..fe78ffa2 100644 --- a/tests/test_simulated_data.py +++ b/tests/test_simulated_data.py @@ -8,41 +8,101 @@ from bio2zarr import vcf2zarr +def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000): + # Import here to avoid problems on OSX (see below) + # https://github.com/sgkit-dev/bio2zarr/issues/336 + import msprime + + ts = msprime.sim_ancestry( + num_samples, + population_size=10**4, + ploidy=ploidy, + sequence_length=sequence_length, + random_seed=seed, + ) + tables = ts.dump_tables() + # Lazy hard coding of states here to make things simpler + for u in range(ts.num_nodes - 1): + site = tables.sites.add_row(u + 1, "A") + tables.mutations.add_row(site, derived_state="T", node=u) + return tables.tree_sequence() + + +def assert_ts_ds_equal(ts, ds, ploidy=1): + assert ds.sizes["ploidy"] == ploidy + assert ds.sizes["variants"] == ts.num_sites + assert ds.sizes["samples"] == ts.num_individuals + # Msprime guarantees that this will be true. + nt.assert_array_equal( + ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)), + ds.call_genotype.values, + ) + nt.assert_equal(ds.variant_allele[:, 0].values, "A") + nt.assert_equal(ds.variant_allele[:, 1].values, "T") + nt.assert_equal(ds.variant_position, ts.sites_position) + + +def write_vcf(ts, vcf_path, contig_id="1"): + with open(vcf_path, "w") as f: + ts.write_vcf(f, contig_id=contig_id) + # This also compresses the input file + pysam.tabix_index(str(vcf_path), preset="vcf") + return vcf_path.with_suffix(vcf_path.suffix + ".gz") + + +# https://github.com/sgkit-dev/bio2zarr/issues/336 @pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken") class TestTskitRoundTripVcf: @pytest.mark.parametrize("ploidy", [1, 2, 3, 4]) def test_ploidy(self, ploidy, tmp_path): - # FIXME importing here so pytest.skip avoids importing msprime. - import msprime - - ts = msprime.sim_ancestry( - 2, - population_size=10**4, - ploidy=ploidy, - sequence_length=100_000, - random_seed=42, - ) - tables = ts.dump_tables() - for u in ts.samples(): - site = tables.sites.add_row(u + 1, "A") - tables.mutations.add_row(site, derived_state="T", node=u) - ts = tables.tree_sequence() - vcf_file = tmp_path / "sim.vcf" - with open(vcf_file, "w") as f: - ts.write_vcf(f) - # This also compresses the input file - pysam.tabix_index(str(vcf_file), preset="vcf") + ts = run_simulation(ploidy=ploidy) + vcf_path = write_vcf(ts, tmp_path / "sim.vcf") out = tmp_path / "example.vcf.zarr" - vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out) + vcf2zarr.convert([vcf_path], out) ds = sg.load_dataset(out) - assert ds.sizes["ploidy"] == ploidy - assert ds.sizes["variants"] == ts.num_sites - assert ds.sizes["samples"] == ts.num_individuals - # Msprime guarantees that this will be true. - nt.assert_array_equal( - ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)), - ds.call_genotype.values, + assert_ts_ds_equal(ts, ds, ploidy) + + @pytest.mark.parametrize( + "contig_ids", + [["ContigName"], ["1", "2"], ["2", "3", "1"], ["a", "b", "c", "d", "e"]], + ) + def test_multi_contig(self, contig_ids, tmp_path): + vcfs = [] + tss = {} + for seed, contig_id in enumerate(contig_ids, 1): + ts = run_simulation(num_samples=8, seed=seed) + vcf_path = write_vcf(ts, tmp_path / f"{contig_id}.vcf", contig_id=contig_id) + vcfs.append(vcf_path) + tss[contig_id] = ts + + out = tmp_path / "example.vcf.zarr" + vcf2zarr.convert(vcfs, out) + ds = sg.load_dataset(out).set_index( + variants=("variant_contig", "variant_position") + ) + assert ds.sizes["ploidy"] == 1 + assert ds.sizes["contigs"] == len(contig_ids) + # Files processed in order of sorted filename, and the contigs therefore + # get sorted into this order. + nt.assert_equal(ds["contig_id"].values, sorted(contig_ids)) + nt.assert_equal( + ds["contig_length"].values, [ts.sequence_length for ts in tss.values()] ) - nt.assert_equal(ds.variant_allele[:, 0].values, "A") - nt.assert_equal(ds.variant_allele[:, 1].values, "T") - nt.assert_equal(ds.variant_position, ts.sites_position) + for contig_id in contig_ids: + contig = list(ds.contig_id).index(contig_id) + dss = ds.sel(variants=(contig, slice(0, None))) + assert_ts_ds_equal(tss[contig_id], dss) + + +# https://github.com/sgkit-dev/bio2zarr/issues/336 +@pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken") +class TestIncompatibleContigs: + def test_different_lengths(self, tmp_path): + vcfs = [] + for length in [100_000, 99_999]: + ts = run_simulation(sequence_length=length) + vcf_path = write_vcf(ts, tmp_path / f"{length}.vcf", contig_id="1") + vcfs.append(vcf_path) + out = tmp_path / "example.vcf.zarr" + with pytest.raises(ValueError, match="Incompatible contig definitions"): + vcf2zarr.convert(vcfs, out)