Skip to content

Commit a2134a2

Browse files
Switch to VCF allele ordering for plink
Closes #391
1 parent fbbd878 commit a2134a2

File tree

2 files changed

+64
-100
lines changed

2 files changed

+64
-100
lines changed

bio2zarr/plink.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(self, prefix):
3636
bim_location=paths.bim_path,
3737
fam_location=paths.fam_path,
3838
num_threads=1,
39-
count_A1=False,
39+
count_A1=True,
4040
)
4141

4242
@property
@@ -216,14 +216,16 @@ def convert(
216216

217217
# FIXME do this more efficiently - currently reading the whole thing
218218
# in for convenience, and also comparing call-by-call
219+
# TODO we should remove this function from the API - it's a test function
220+
# and should be moved into the suite
219221
@core.requires_optional_dependency("bed_reader", "plink")
220222
def validate(bed_path, zarr_path):
221223
import bed_reader
222224

223225
root = zarr.open(store=zarr_path, mode="r")
224226
call_genotype = root["call_genotype"][:]
225227

226-
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=False, num_threads=1)
228+
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1)
227229

228230
assert call_genotype.shape[0] == bed.sid_count
229231
assert call_genotype.shape[1] == bed.iid_count

tests/test_plink.py

Lines changed: 60 additions & 98 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,33 @@
55
import numpy.testing as nt
66
import pytest
77
import sgkit as sg
8-
import sgkit.io.plink
98
import xarray.testing as xt
109

1110
from bio2zarr import plink
1211

1312

13+
def test_missing_dependency():
14+
with mock.patch(
15+
"importlib.import_module",
16+
side_effect=ImportError("No module named 'bed_reader'"),
17+
):
18+
with pytest.raises(ImportError) as exc_info:
19+
plink.convert(
20+
"UNUSED_PATH",
21+
"UNUSED_PATH",
22+
)
23+
assert (
24+
"This process requires the optional bed_reader module. "
25+
"Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
26+
)
27+
28+
1429
class TestSmallExample:
1530
@pytest.fixture(scope="class")
1631
def bed_path(self, tmp_path_factory):
1732
tmp_path = tmp_path_factory.mktemp("data")
1833
path = tmp_path / "example.bed"
1934
# 7 sites x 3 samples
20-
# These are counts of allele 1, so we have to flip them
21-
# to get the values we expect
2235
dosages = np.array(
2336
[
2437
[0, 1, 2],
@@ -56,13 +69,13 @@ def test_genotypes(self, ds):
5669
nt.assert_array_equal(
5770
call_genotype,
5871
[
59-
[[1, 1], [1, 0], [0, 0]],
60-
[[1, 0], [1, 1], [0, 0]],
61-
[[1, 1], [1, 1], [1, 1]],
62-
[[-1, -1], [1, 1], [1, 1]],
63-
[[0, 0], [1, 1], [1, 1]],
64-
[[-1, -1], [-1, -1], [-1, -1]],
72+
[[0, 0], [1, 0], [1, 1]],
73+
[[1, 0], [0, 0], [1, 1]],
6574
[[0, 0], [0, 0], [0, 0]],
75+
[[-1, -1], [0, 0], [0, 0]],
76+
[[1, 1], [0, 0], [0, 0]],
77+
[[-1, -1], [-1, -1], [-1, -1]],
78+
[[1, 1], [1, 1], [1, 1]],
6679
],
6780
)
6881

@@ -82,63 +95,6 @@ def test_variant_allele(self, ds):
8295
def test_sample_id(self, ds):
8396
nt.assert_array_equal(ds["sample_id"], ["s0", "s1", "s2"])
8497

85-
def test_missing_dependency(self):
86-
with mock.patch(
87-
"importlib.import_module",
88-
side_effect=ImportError("No module named 'bed_reader'"),
89-
):
90-
with pytest.raises(ImportError) as exc_info:
91-
plink.convert(
92-
"UNUSED_PATH",
93-
"UNUSED_PATH",
94-
)
95-
assert (
96-
"This process requires the optional bed_reader module. "
97-
"Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
98-
)
99-
100-
101-
class TestEqualSgkit:
102-
def test_simulated_example(self, tmp_path):
103-
prefix = "tests/data/plink/plink_sim_10s_100v_10pmiss"
104-
sg_ds = sgkit.io.plink.read_plink(
105-
bed_path=prefix + ".bed", fam_path=prefix + ".fam", bim_path=prefix + ".bim"
106-
)
107-
out = tmp_path / "example.plink.zarr"
108-
plink.convert(prefix=prefix, out=out)
109-
ds = sg.load_dataset(out)
110-
nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
111-
nt.assert_array_equal(
112-
ds.call_genotype_mask.values, sg_ds.call_genotype_mask.values
113-
)
114-
# sgkit doesn't have phased
115-
nt.assert_array_equal(ds.variant_position.values, sg_ds.variant_position.values)
116-
# nt.assert_array_equal(
117-
# ds.variant_allele.values, sg_ds.variant_allele.values.astype("U")
118-
# )
119-
nt.assert_array_equal(ds.variant_contig.values, sg_ds.variant_contig.values)
120-
nt.assert_array_equal(ds.variant_id.values, sg_ds.variant_id.values)
121-
# print(sg_ds.variant_id.values)
122-
123-
# Can't compare to sgkit because of
124-
# https://github.com/sgkit-dev/sgkit/issues/1314
125-
nt.assert_array_equal(
126-
ds.sample_id.values,
127-
[
128-
"000",
129-
"001",
130-
"002",
131-
"003",
132-
"004",
133-
"005",
134-
"006",
135-
"007",
136-
"008",
137-
"009",
138-
],
139-
)
140-
# We don't do the additional sample_ fields yet
141-
14298

14399
class TestExample:
144100
"""
@@ -155,6 +111,10 @@ class TestExample:
155111
Base-pair coordinate (1-based; limited to 231-2)
156112
Allele 1 (corresponding to clear bits in .bed; usually minor)
157113
Allele 2 (corresponding to set bits in .bed; usually major)
114+
115+
116+
Note, the VCF interpretation here can be derived by running
117+
plink1.9 --bfile tests/data/plink/example --export vcf
158118
"""
159119

160120
@pytest.fixture(scope="class")
@@ -171,7 +131,7 @@ def test_variant_position(self, ds):
171131
nt.assert_array_equal(ds.variant_position, [10, 20])
172132

173133
def test_variant_allele(self, ds):
174-
nt.assert_array_equal(ds.variant_allele, [["A", "GG"], ["TTT", "C"]])
134+
nt.assert_array_equal(ds.variant_allele, [["GG", "A"], ["C", "TTT"]])
175135

176136
def test_variant_length(self, ds):
177137
nt.assert_array_equal(ds.variant_length, [2, 3])
@@ -193,42 +153,32 @@ def test_genotypes(self, ds):
193153
call_genotype,
194154
[
195155
[
196-
[0, 0],
197-
[0, 0],
198-
[0, 0],
199-
[1, 1],
200-
[1, 1],
201156
[1, 1],
202157
[1, 1],
203158
[1, 1],
204-
[1, 1],
205-
[1, 1],
206-
],
207-
[
208159
[0, 0],
209160
[0, 0],
210161
[0, 0],
211162
[0, 0],
163+
[0, 0],
164+
[0, 0],
165+
[0, 0],
166+
],
167+
[
212168
[1, 1],
213169
[1, 1],
214170
[1, 1],
215171
[1, 1],
216-
[1, 1],
217-
[1, 1],
172+
[0, 0],
173+
[0, 0],
174+
[0, 0],
175+
[0, 0],
176+
[0, 0],
177+
[0, 0],
218178
],
219179
],
220180
)
221181

222-
def test_sgkit(self, ds):
223-
path = "tests/data/plink/example"
224-
sg_ds = sgkit.io.plink.read_plink(path=path)
225-
# Can't compare the full dataset yet
226-
nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
227-
# https://github.com/pystatgen/sgkit/issues/1209
228-
nt.assert_array_equal(ds.variant_allele, sg_ds.variant_allele.astype(str))
229-
nt.assert_array_equal(ds.variant_position, sg_ds.variant_position)
230-
nt.assert_array_equal(ds.sample_id, sg_ds.sample_id)
231-
232182

233183
class TestSimulatedExample:
234184
@pytest.fixture(scope="class")
@@ -259,14 +209,14 @@ def test_genotypes(self, ds):
259209
[
260210
[1, 0],
261211
[1, 0],
262-
[0, 0],
263-
[0, 0],
264-
[-1, -1],
265212
[1, 1],
266213
[1, 1],
214+
[-1, -1],
215+
[0, 0],
267216
[0, 0],
268217
[1, 1],
269-
[1, 1],
218+
[0, 0],
219+
[0, 0],
270220
]
271221
)
272222
gt = ds["call_genotype"].values
@@ -364,7 +314,6 @@ def multi_contig_bed_path(self, tmp_path_factory):
364314
# - 1 variant on chromosome 2
365315
# - 2 variants on chromosome X
366316
# - 1 variant on chromosome Y
367-
# values are counts of allele 1, will be flipped by the converter
368317
dosages = np.array(
369318
[
370319
[0, 1, 2, 0], # chr1
@@ -406,6 +355,19 @@ def ds(self, tmp_path_factory, multi_contig_bed_path):
406355
def test_contig_ids(self, ds):
407356
nt.assert_array_equal(ds.contig_id, ["1", "2", "X", "Y"])
408357

358+
def test_variant_allele(self, ds):
359+
nt.assert_array_equal(
360+
ds.variant_allele,
361+
[
362+
["G", "A"],
363+
["C", "T"],
364+
["G", "A"],
365+
["T", "C"],
366+
["A", "G"],
367+
["C", "T"],
368+
],
369+
)
370+
409371
def test_variant_contig(self, ds):
410372
nt.assert_array_equal(
411373
ds.variant_contig, [0, 0, 1, 2, 2, 3]
@@ -417,12 +379,12 @@ def test_genotypes(self, ds):
417379
nt.assert_array_equal(
418380
call_genotype,
419381
[
420-
[[1, 1], [1, 0], [0, 0], [1, 1]], # chr1
421-
[[1, 0], [1, 1], [0, 0], [1, 0]], # chr1
422-
[[1, 1], [1, 1], [1, 1], [0, 0]], # chr2
423-
[[0, 0], [1, 1], [1, 0], [1, 1]], # chrX
382+
[[0, 0], [1, 0], [1, 1], [0, 0]], # chr1
383+
[[1, 0], [0, 0], [1, 1], [1, 0]], # chr1
384+
[[0, 0], [0, 0], [0, 0], [1, 1]], # chr2
385+
[[1, 1], [0, 0], [1, 0], [0, 0]], # chrX
424386
[[1, 0], [1, 0], [1, 0], [1, 0]], # chrX
425-
[[1, 1], [0, 0], [1, 1], [0, 0]], # chrY
387+
[[0, 0], [1, 1], [0, 0], [1, 1]], # chrY
426388
],
427389
)
428390

0 commit comments

Comments
 (0)