Skip to content

Commit cfe6b87

Browse files
committed
Add a test to check sgkit can read vcf2zarr datasets
1 parent 51a8ffa commit cfe6b87

File tree

6 files changed

+254
-5
lines changed

6 files changed

+254
-5
lines changed

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ rechunker
1515
cbgen < 1.0.5; platform_system != "Windows"
1616
cbgen == 1.0.1; platform_system == "Windows"
1717
cyvcf2; platform_system != "Windows"
18+
bio2zarr; platform_system != "Windows"
1819
yarl
1920
matplotlib
2021
asv

requirements-doc.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,3 @@ matplotlib
88
seaborn
99
ablog!=0.10.27
1010
pickleshare
11-
bio2zarr

sgkit/model.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,15 @@ def num_contigs(ds: xr.Dataset) -> ArrayLike:
171171
"""Return the number of contigs in a dataset."""
172172
if DIM_CONTIG in ds.sizes:
173173
return ds.sizes[DIM_CONTIG]
174-
else:
174+
else: # pragma: no cover
175175
return len(ds.attrs["contigs"])
176176

177177

178178
def get_contigs(ds: xr.Dataset) -> ArrayLike:
179179
"""Return the contigs in a dataset."""
180180
if "contig_id" in ds:
181181
return ds["contig_id"].values
182-
else:
182+
else: # pragma: no cover
183183
warnings.warn(
184184
"The 'contigs' VCF Zarr group attribute is deprecated and should be converted to a 'contig_id' array.",
185185
DeprecationWarning,
@@ -191,11 +191,11 @@ def get_filters(ds: xr.Dataset) -> Optional[ArrayLike]:
191191
"""Return the filters in a dataset."""
192192
if "filter_id" in ds:
193193
return ds["filter_id"].values
194-
elif "filters" in ds.attrs:
194+
elif "filters" in ds.attrs: # pragma: no cover
195195
warnings.warn(
196196
"The 'filters' VCF Zarr group attribute is deprecated and should be converted to a 'filter_id' array.",
197197
DeprecationWarning,
198198
)
199199
return np.array(ds.attrs["filters"], dtype="S")
200-
else:
200+
else: # pragma: no cover
201201
return None

sgkit/tests/io/data/sample.vcf.gz

954 Bytes
Binary file not shown.
185 Bytes
Binary file not shown.
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
import numpy as np
2+
import pytest
3+
4+
pytest.importorskip("bio2zarr")
5+
from bio2zarr import vcf2zarr
6+
from bio2zarr.constants import (
7+
FLOAT32_FILL,
8+
FLOAT32_MISSING,
9+
INT_FILL,
10+
INT_MISSING,
11+
STR_FILL,
12+
STR_MISSING,
13+
)
14+
from numpy.testing import assert_array_almost_equal, assert_array_equal
15+
16+
from sgkit import load_dataset, save_dataset
17+
from sgkit.model import get_contigs, get_filters, num_contigs
18+
from sgkit.tests.io.test_dataset import assert_identical
19+
20+
21+
@pytest.mark.filterwarnings("ignore::xarray.coding.variables.SerializationWarning")
22+
def test_vcf2zarr_compat(shared_datadir, tmp_path):
23+
vcf_path = shared_datadir / "sample.vcf.gz"
24+
vcz_path = tmp_path.joinpath("sample.vcz").as_posix()
25+
26+
vcf2zarr.convert(
27+
[vcf_path],
28+
vcz_path,
29+
variants_chunk_size=5,
30+
samples_chunk_size=2,
31+
worker_processes=0,
32+
)
33+
34+
ds = load_dataset(vcz_path)
35+
36+
assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"])
37+
assert_array_equal(get_filters(ds), ["PASS", "s50", "q10"]) # utility function
38+
assert_array_equal(
39+
ds["variant_filter"],
40+
[
41+
[False, False, False],
42+
[False, False, False],
43+
[True, False, False],
44+
[False, False, True],
45+
[True, False, False],
46+
[True, False, False],
47+
[True, False, False],
48+
[False, False, False],
49+
[True, False, False],
50+
],
51+
)
52+
assert num_contigs(ds) == 3
53+
assert_array_equal(ds["contig_id"], ["19", "20", "X"])
54+
assert_array_equal(get_contigs(ds), ["19", "20", "X"]) # utility function
55+
assert "contig_length" not in ds
56+
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
57+
assert ds["variant_contig"].chunks[0][0] == 5
58+
59+
assert_array_equal(
60+
ds["variant_position"],
61+
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
62+
)
63+
assert ds["variant_position"].chunks[0][0] == 5
64+
65+
im = INT_MISSING
66+
if_ = INT_FILL
67+
fm = FLOAT32_MISSING
68+
ff = FLOAT32_FILL
69+
sm = STR_MISSING
70+
sf = STR_FILL
71+
72+
assert_array_equal(
73+
ds["variant_NS"],
74+
[im, im, 3, 3, 2, 3, 3, im, im],
75+
)
76+
assert ds["variant_NS"].chunks[0][0] == 5
77+
78+
assert_array_equal(
79+
ds["variant_AN"],
80+
[im, im, im, im, im, im, 6, im, im],
81+
)
82+
assert ds["variant_AN"].chunks[0][0] == 5
83+
84+
assert_array_equal(
85+
ds["variant_AA"],
86+
[
87+
sm,
88+
sm,
89+
sm,
90+
sm,
91+
"T",
92+
"T",
93+
"G",
94+
sm,
95+
sm,
96+
],
97+
)
98+
assert ds["variant_AN"].chunks[0][0] == 5
99+
100+
assert_array_equal(
101+
ds["variant_DB"],
102+
[
103+
False,
104+
False,
105+
True,
106+
False,
107+
True,
108+
False,
109+
False,
110+
False,
111+
False,
112+
],
113+
)
114+
assert ds["variant_AN"].chunks[0][0] == 5
115+
116+
variant_AF = np.array(
117+
[
118+
[fm, fm],
119+
[fm, fm],
120+
[0.5, ff],
121+
[0.017, ff],
122+
[0.333, 0.667],
123+
[fm, fm],
124+
[fm, fm],
125+
[fm, fm],
126+
[fm, fm],
127+
],
128+
dtype=np.float32,
129+
)
130+
values = ds["variant_AF"].values
131+
assert_array_almost_equal(values, variant_AF, 3)
132+
nans = np.isnan(variant_AF)
133+
assert_array_equal(variant_AF.view(np.int32)[nans], values.view(np.int32)[nans])
134+
assert ds["variant_AF"].chunks[0][0] == 5
135+
136+
assert_array_equal(
137+
ds["variant_AC"],
138+
[
139+
[im, im],
140+
[im, im],
141+
[im, im],
142+
[im, im],
143+
[im, im],
144+
[im, im],
145+
[3, 1],
146+
[im, im],
147+
[im, im],
148+
],
149+
)
150+
assert ds["variant_AC"].chunks[0][0] == 5
151+
152+
assert_array_equal(
153+
ds["variant_allele"].values.tolist(),
154+
[
155+
["A", "C", sf, sf],
156+
["A", "G", sf, sf],
157+
["G", "A", sf, sf],
158+
["T", "A", sf, sf],
159+
["A", "G", "T", sf],
160+
["T", sf, sf, sf],
161+
["G", "GA", "GAC", sf],
162+
["T", sf, sf, sf],
163+
["AC", "A", "ATG", "C"],
164+
],
165+
)
166+
assert ds["variant_allele"].chunks[0][0] == 5
167+
assert ds["variant_allele"].dtype == "O"
168+
assert_array_equal(
169+
ds["variant_id"].values.tolist(),
170+
[sm, sm, "rs6054257", sm, "rs6040355", sm, "microsat1", sm, "rsTest"],
171+
)
172+
assert ds["variant_id"].chunks[0][0] == 5
173+
assert ds["variant_id"].dtype == "O"
174+
assert_array_equal(
175+
ds["variant_id_mask"],
176+
[True, True, False, True, False, True, False, True, False],
177+
)
178+
assert ds["variant_id_mask"].chunks[0][0] == 5
179+
180+
assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])
181+
assert ds["sample_id"].chunks[0][0] == 2
182+
183+
call_genotype = np.array(
184+
[
185+
[[0, 0], [0, 0], [0, 1]],
186+
[[0, 0], [0, 0], [0, 1]],
187+
[[0, 0], [1, 0], [1, 1]],
188+
[[0, 0], [0, 1], [0, 0]],
189+
[[1, 2], [2, 1], [2, 2]],
190+
[[0, 0], [0, 0], [0, 0]],
191+
[[0, 1], [0, 2], [im, im]],
192+
[[0, 0], [0, 0], [im, im]],
193+
[[0, if_], [0, 1], [0, 2]],
194+
],
195+
dtype="i1",
196+
)
197+
call_genotype_phased = np.array(
198+
[
199+
[True, True, False],
200+
[True, True, False],
201+
[True, True, False],
202+
[True, True, False],
203+
[True, True, False],
204+
[True, True, False],
205+
[False, False, False],
206+
[False, True, False],
207+
[True, False, True],
208+
],
209+
dtype=bool,
210+
)
211+
call_DP = [
212+
[im, im, im],
213+
[im, im, im],
214+
[1, 8, 5],
215+
[3, 5, 3],
216+
[6, 0, 4],
217+
[im, 4, 2],
218+
[4, 2, 3],
219+
[im, im, im],
220+
[im, im, im],
221+
]
222+
call_HQ = [
223+
[[10, 15], [10, 10], [3, 3]],
224+
[[10, 10], [10, 10], [3, 3]],
225+
[[51, 51], [51, 51], [im, im]],
226+
[[58, 50], [65, 3], [im, im]],
227+
[[23, 27], [18, 2], [im, im]],
228+
[[56, 60], [51, 51], [im, im]],
229+
[[im, im], [im, im], [im, im]],
230+
[[im, im], [im, im], [im, im]],
231+
[[im, im], [im, im], [im, im]],
232+
]
233+
234+
assert_array_equal(ds["call_genotype"], call_genotype)
235+
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
236+
assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
237+
assert_array_equal(ds["call_DP"], call_DP)
238+
assert_array_equal(ds["call_HQ"], call_HQ)
239+
240+
for name in ["call_genotype", "call_genotype_mask", "call_HQ"]:
241+
assert ds[name].chunks == ((5, 4), (2, 1), (2,))
242+
243+
for name in ["call_genotype_phased", "call_DP"]:
244+
assert ds[name].chunks == ((5, 4), (2, 1))
245+
246+
# save and load again to test https://github.com/pydata/xarray/issues/3476
247+
path2 = tmp_path / "ds2.zarr"
248+
save_dataset(ds, path2)
249+
assert_identical(ds, load_dataset(path2))

0 commit comments

Comments
 (0)