Skip to content

Commit fbe8c4d

Browse files
committed
Add a test to check sgkit can read vcf2zarr datasets
1 parent d522c0f commit fbe8c4d

File tree

5 files changed

+244
-1
lines changed

5 files changed

+244
-1
lines changed

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,4 @@ networkx
2222
aiohttp
2323
requests
2424
graphviz
25+
bio2zarr

requirements-doc.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,3 @@ matplotlib
88
seaborn
99
ablog!=0.10.27
1010
pickleshare
11-
bio2zarr

sgkit/tests/io/data/sample.vcf.gz

954 Bytes
Binary file not shown.
185 Bytes
Binary file not shown.
Lines changed: 243 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,243 @@
1+
import numpy as np
2+
import pytest
3+
from bio2zarr import vcf2zarr
4+
from bio2zarr.constants import (
5+
FLOAT32_FILL,
6+
FLOAT32_MISSING,
7+
INT_FILL,
8+
INT_MISSING,
9+
STR_FILL,
10+
STR_MISSING,
11+
)
12+
from numpy.testing import assert_array_almost_equal, assert_array_equal
13+
14+
from sgkit import load_dataset, save_dataset
15+
from sgkit.tests.io.test_dataset import assert_identical
16+
17+
18+
@pytest.mark.filterwarnings("ignore::xarray.coding.variables.SerializationWarning")
19+
def test_vcf2zarr_compat(shared_datadir, tmp_path):
20+
vcf_path = shared_datadir / "sample.vcf.gz"
21+
vcz_path = tmp_path.joinpath("sample.vcz").as_posix()
22+
23+
vcf2zarr.convert(
24+
[vcf_path],
25+
vcz_path,
26+
variants_chunk_size=5,
27+
samples_chunk_size=2,
28+
worker_processes=0,
29+
)
30+
31+
ds = load_dataset(vcz_path)
32+
33+
assert_array_equal(ds["filter_id"], ["PASS", "s50", "q10"])
34+
assert_array_equal(
35+
ds["variant_filter"],
36+
[
37+
[False, False, False],
38+
[False, False, False],
39+
[True, False, False],
40+
[False, False, True],
41+
[True, False, False],
42+
[True, False, False],
43+
[True, False, False],
44+
[False, False, False],
45+
[True, False, False],
46+
],
47+
)
48+
assert_array_equal(ds["contig_id"], ["19", "20", "X"])
49+
assert "contig_length" not in ds
50+
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
51+
assert ds["variant_contig"].chunks[0][0] == 5
52+
53+
assert_array_equal(
54+
ds["variant_position"],
55+
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
56+
)
57+
assert ds["variant_position"].chunks[0][0] == 5
58+
59+
im = INT_MISSING
60+
if_ = INT_FILL
61+
fm = FLOAT32_MISSING
62+
ff = FLOAT32_FILL
63+
sm = STR_MISSING
64+
sf = STR_FILL
65+
66+
assert_array_equal(
67+
ds["variant_NS"],
68+
[im, im, 3, 3, 2, 3, 3, im, im],
69+
)
70+
assert ds["variant_NS"].chunks[0][0] == 5
71+
72+
assert_array_equal(
73+
ds["variant_AN"],
74+
[im, im, im, im, im, im, 6, im, im],
75+
)
76+
assert ds["variant_AN"].chunks[0][0] == 5
77+
78+
assert_array_equal(
79+
ds["variant_AA"],
80+
[
81+
sm,
82+
sm,
83+
sm,
84+
sm,
85+
"T",
86+
"T",
87+
"G",
88+
sm,
89+
sm,
90+
],
91+
)
92+
assert ds["variant_AN"].chunks[0][0] == 5
93+
94+
assert_array_equal(
95+
ds["variant_DB"],
96+
[
97+
False,
98+
False,
99+
True,
100+
False,
101+
True,
102+
False,
103+
False,
104+
False,
105+
False,
106+
],
107+
)
108+
assert ds["variant_AN"].chunks[0][0] == 5
109+
110+
variant_AF = np.array(
111+
[
112+
[fm, fm],
113+
[fm, fm],
114+
[0.5, ff],
115+
[0.017, ff],
116+
[0.333, 0.667],
117+
[fm, fm],
118+
[fm, fm],
119+
[fm, fm],
120+
[fm, fm],
121+
],
122+
dtype=np.float32,
123+
)
124+
values = ds["variant_AF"].values
125+
assert_array_almost_equal(values, variant_AF, 3)
126+
nans = np.isnan(variant_AF)
127+
assert_array_equal(variant_AF.view(np.int32)[nans], values.view(np.int32)[nans])
128+
assert ds["variant_AF"].chunks[0][0] == 5
129+
130+
assert_array_equal(
131+
ds["variant_AC"],
132+
[
133+
[im, im],
134+
[im, im],
135+
[im, im],
136+
[im, im],
137+
[im, im],
138+
[im, im],
139+
[3, 1],
140+
[im, im],
141+
[im, im],
142+
],
143+
)
144+
assert ds["variant_AC"].chunks[0][0] == 5
145+
146+
assert_array_equal(
147+
ds["variant_allele"].values.tolist(),
148+
[
149+
["A", "C", sf, sf],
150+
["A", "G", sf, sf],
151+
["G", "A", sf, sf],
152+
["T", "A", sf, sf],
153+
["A", "G", "T", sf],
154+
["T", sf, sf, sf],
155+
["G", "GA", "GAC", sf],
156+
["T", sf, sf, sf],
157+
["AC", "A", "ATG", "C"],
158+
],
159+
)
160+
assert ds["variant_allele"].chunks[0][0] == 5
161+
assert ds["variant_allele"].dtype == "O"
162+
assert_array_equal(
163+
ds["variant_id"].values.tolist(),
164+
[sm, sm, "rs6054257", sm, "rs6040355", sm, "microsat1", sm, "rsTest"],
165+
)
166+
assert ds["variant_id"].chunks[0][0] == 5
167+
assert ds["variant_id"].dtype == "O"
168+
assert_array_equal(
169+
ds["variant_id_mask"],
170+
[True, True, False, True, False, True, False, True, False],
171+
)
172+
assert ds["variant_id_mask"].chunks[0][0] == 5
173+
174+
assert_array_equal(ds["sample_id"], ["NA00001", "NA00002", "NA00003"])
175+
assert ds["sample_id"].chunks[0][0] == 2
176+
177+
call_genotype = np.array(
178+
[
179+
[[0, 0], [0, 0], [0, 1]],
180+
[[0, 0], [0, 0], [0, 1]],
181+
[[0, 0], [1, 0], [1, 1]],
182+
[[0, 0], [0, 1], [0, 0]],
183+
[[1, 2], [2, 1], [2, 2]],
184+
[[0, 0], [0, 0], [0, 0]],
185+
[[0, 1], [0, 2], [im, im]],
186+
[[0, 0], [0, 0], [im, im]],
187+
[[0, if_], [0, 1], [0, 2]],
188+
],
189+
dtype="i1",
190+
)
191+
call_genotype_phased = np.array(
192+
[
193+
[True, True, False],
194+
[True, True, False],
195+
[True, True, False],
196+
[True, True, False],
197+
[True, True, False],
198+
[True, True, False],
199+
[False, False, False],
200+
[False, True, False],
201+
[True, False, True],
202+
],
203+
dtype=bool,
204+
)
205+
call_DP = [
206+
[im, im, im],
207+
[im, im, im],
208+
[1, 8, 5],
209+
[3, 5, 3],
210+
[6, 0, 4],
211+
[im, 4, 2],
212+
[4, 2, 3],
213+
[im, im, im],
214+
[im, im, im],
215+
]
216+
call_HQ = [
217+
[[10, 15], [10, 10], [3, 3]],
218+
[[10, 10], [10, 10], [3, 3]],
219+
[[51, 51], [51, 51], [im, im]],
220+
[[58, 50], [65, 3], [im, im]],
221+
[[23, 27], [18, 2], [im, im]],
222+
[[56, 60], [51, 51], [im, im]],
223+
[[im, im], [im, im], [im, im]],
224+
[[im, im], [im, im], [im, im]],
225+
[[im, im], [im, im], [im, im]],
226+
]
227+
228+
assert_array_equal(ds["call_genotype"], call_genotype)
229+
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
230+
assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
231+
assert_array_equal(ds["call_DP"], call_DP)
232+
assert_array_equal(ds["call_HQ"], call_HQ)
233+
234+
for name in ["call_genotype", "call_genotype_mask", "call_HQ"]:
235+
assert ds[name].chunks == ((5, 4), (2, 1), (2,))
236+
237+
for name in ["call_genotype_phased", "call_DP"]:
238+
assert ds[name].chunks == ((5, 4), (2, 1))
239+
240+
# save and load again to test https://github.com/pydata/xarray/issues/3476
241+
path2 = tmp_path / "ds2.zarr"
242+
save_dataset(ds, path2)
243+
assert_identical(ds, load_dataset(path2))

0 commit comments

Comments
 (0)