 from bio2zarr import vcf2zarr
 
 
+def run_simulation(num_samples=2, ploidy=1, seed=42, sequence_length=100_000):
+    # Import here to avoid problems on OSX (see below)
+    # https://github.com/sgkit-dev/bio2zarr/issues/336
+    import msprime
+
+    ts = msprime.sim_ancestry(
+        num_samples,
+        population_size=10**4,
+        ploidy=ploidy,
+        sequence_length=sequence_length,
+        random_seed=seed,
+    )
+    tables = ts.dump_tables()
+    # Lazy hard coding of states here to make things simpler
+    for u in range(ts.num_nodes - 1):
+        site = tables.sites.add_row(u + 1, "A")
+        tables.mutations.add_row(site, derived_state="T", node=u)
+    return tables.tree_sequence()
+
+
+def assert_ts_ds_equal(ts, ds, ploidy=1):
+    assert ds.sizes["ploidy"] == ploidy
+    assert ds.sizes["variants"] == ts.num_sites
+    assert ds.sizes["samples"] == ts.num_individuals
+    # Msprime guarantees that this will be true.
+    nt.assert_array_equal(
+        ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
+        ds.call_genotype.values,
+    )
+    nt.assert_equal(ds.variant_allele[:, 0].values, "A")
+    nt.assert_equal(ds.variant_allele[:, 1].values, "T")
+    nt.assert_equal(ds.variant_position, ts.sites_position)
+
+
+def write_vcf(ts, vcf_path, contig_id="1"):
+    with open(vcf_path, "w") as f:
+        ts.write_vcf(f, contig_id=contig_id)
+    # This also compresses the input file
+    pysam.tabix_index(str(vcf_path), preset="vcf")
+    return vcf_path.with_suffix(vcf_path.suffix + ".gz")
+
+
+# https://github.com/sgkit-dev/bio2zarr/issues/336
 @pytest.mark.skipif(sys.platform == "darwin", reason="msprime OSX pip packages broken")
 class TestTskitRoundTripVcf:
     @pytest.mark.parametrize("ploidy", [1, 2, 3, 4])
     def test_ploidy(self, ploidy, tmp_path):
-        # FIXME importing here so pytest.skip avoids importing msprime.
-        import msprime
-
-        ts = msprime.sim_ancestry(
-            2,
-            population_size=10**4,
-            ploidy=ploidy,
-            sequence_length=100_000,
-            random_seed=42,
-        )
-        tables = ts.dump_tables()
-        for u in ts.samples():
-            site = tables.sites.add_row(u + 1, "A")
-            tables.mutations.add_row(site, derived_state="T", node=u)
-        ts = tables.tree_sequence()
-        vcf_file = tmp_path / "sim.vcf"
-        with open(vcf_file, "w") as f:
-            ts.write_vcf(f)
-        # This also compresses the input file
-        pysam.tabix_index(str(vcf_file), preset="vcf")
+        ts = run_simulation(ploidy=ploidy)
+        vcf_path = write_vcf(ts, tmp_path / "sim.vcf")
         out = tmp_path / "example.vcf.zarr"
-        vcf2zarr.convert([tmp_path / "sim.vcf.gz"], out)
+        vcf2zarr.convert([vcf_path], out)
         ds = sg.load_dataset(out)
-        assert ds.sizes["ploidy"] == ploidy
-        assert ds.sizes["variants"] == ts.num_sites
-        assert ds.sizes["samples"] == ts.num_individuals
-        # Msprime guarantees that this will be true.
-        nt.assert_array_equal(
-            ts.genotype_matrix().reshape((ts.num_sites, ts.num_individuals, ploidy)),
-            ds.call_genotype.values,
+        assert_ts_ds_equal(ts, ds, ploidy)
+
+    @pytest.mark.parametrize(
+        "contig_ids",
+        [["ContigName"], ["1", "2"], ["2", "3", "1"], ["a", "b", "c", "d", "e"]],
+    )
+    def test_multi_contig(self, contig_ids, tmp_path):
+        vcfs = []
+        tss = {}
+        for seed, contig_id in enumerate(contig_ids, 1):
+            ts = run_simulation(num_samples=8, seed=seed)
+            vcf_path = write_vcf(ts, tmp_path / f"{contig_id}.vcf", contig_id=contig_id)
+            vcfs.append(vcf_path)
+            tss[contig_id] = ts
+
+        out = tmp_path / "example.vcf.zarr"
+        vcf2zarr.convert(vcfs, out)
+        ds = sg.load_dataset(out).set_index(
+            variants=("variant_contig", "variant_position")
+        )
+        assert ds.sizes["ploidy"] == 1
+        assert ds.sizes["contigs"] == len(contig_ids)
+        # Files are processed in sorted filename order, and the contigs are
+        # therefore sorted into this order.
+        nt.assert_equal(ds["contig_id"].values, sorted(contig_ids))
+        nt.assert_equal(
+            ds["contig_length"].values, [ts.sequence_length for ts in tss.values()]
         )
-        nt.assert_equal(ds.variant_allele[:, 0].values, "A")
-        nt.assert_equal(ds.variant_allele[:, 1].values, "T")
-        nt.assert_equal(ds.variant_position, ts.sites_position)
+        for contig_id in contig_ids:
+            contig = list(ds.contig_id).index(contig_id)
+            dss = ds.sel(variants=(contig, slice(0, None)))
+            assert_ts_ds_equal(tss[contig_id], dss)
+
+
+class TestIncompatibleContigs:
+    def test_different_lengths(self, tmp_path):
+        vcfs = []
+        for length in [100_000, 99_999]:
+            ts = run_simulation(sequence_length=length)
+            vcf_path = write_vcf(ts, tmp_path / f"{length}.vcf", contig_id="1")
+            vcfs.append(vcf_path)
+        out = tmp_path / "example.vcf.zarr"
+        with pytest.raises(ValueError, match="Incompatible contig definitions"):
+            vcf2zarr.convert(vcfs, out)
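
For reference, a minimal sketch of the round trip these tests exercise, chaining the helpers added above outside of pytest. Illustrative only: the scratch directory name is made up, and it assumes msprime, pysam, bio2zarr and sgkit are all installed.

    # Illustrative sketch, not part of the diff: reuses run_simulation, write_vcf
    # and assert_ts_ds_equal from the test module above.
    import pathlib

    import sgkit as sg
    from bio2zarr import vcf2zarr

    workdir = pathlib.Path("roundtrip-example")  # made-up scratch directory
    workdir.mkdir(exist_ok=True)

    ts = run_simulation(num_samples=4, ploidy=2)  # simulated ts with sites/mutations added
    vcf_gz = write_vcf(ts, workdir / "sim.vcf")  # writes, bgzips and tabix-indexes the VCF
    vcf2zarr.convert([vcf_gz], workdir / "sim.vcf.zarr")
    ds = sg.load_dataset(workdir / "sim.vcf.zarr")
    assert_ts_ds_equal(ts, ds, ploidy=2)  # same checks the tests apply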