55import numpy .testing as nt
66import pytest
77import sgkit as sg
8- import sgkit .io .plink
98import xarray .testing as xt
109
1110from bio2zarr import plink
1211
1312
13+ def test_missing_dependency ():
14+ with mock .patch (
15+ "importlib.import_module" ,
16+ side_effect = ImportError ("No module named 'bed_reader'" ),
17+ ):
18+ with pytest .raises (ImportError ) as exc_info :
19+ plink .convert (
20+ "UNUSED_PATH" ,
21+ "UNUSED_PATH" ,
22+ )
23+ assert (
24+ "This process requires the optional bed_reader module. "
25+ "Install it with: pip install bio2zarr[plink]" in str (exc_info .value )
26+ )
27+
28+
1429class TestSmallExample :
1530 @pytest .fixture (scope = "class" )
1631 def bed_path (self , tmp_path_factory ):
1732 tmp_path = tmp_path_factory .mktemp ("data" )
1833 path = tmp_path / "example.bed"
1934 # 7 sites x 3 samples
20- # These are counts of allele 1, so we have to flip them
21- # to get the values we expect
2235 dosages = np .array (
2336 [
2437 [0 , 1 , 2 ],
@@ -56,13 +69,13 @@ def test_genotypes(self, ds):
5669 nt .assert_array_equal (
5770 call_genotype ,
5871 [
59- [[1 , 1 ], [1 , 0 ], [0 , 0 ]],
60- [[1 , 0 ], [1 , 1 ], [0 , 0 ]],
61- [[1 , 1 ], [1 , 1 ], [1 , 1 ]],
62- [[- 1 , - 1 ], [1 , 1 ], [1 , 1 ]],
63- [[0 , 0 ], [1 , 1 ], [1 , 1 ]],
64- [[- 1 , - 1 ], [- 1 , - 1 ], [- 1 , - 1 ]],
72+ [[0 , 0 ], [1 , 0 ], [1 , 1 ]],
73+ [[1 , 0 ], [0 , 0 ], [1 , 1 ]],
6574 [[0 , 0 ], [0 , 0 ], [0 , 0 ]],
75+ [[- 1 , - 1 ], [0 , 0 ], [0 , 0 ]],
76+ [[1 , 1 ], [0 , 0 ], [0 , 0 ]],
77+ [[- 1 , - 1 ], [- 1 , - 1 ], [- 1 , - 1 ]],
78+ [[1 , 1 ], [1 , 1 ], [1 , 1 ]],
6679 ],
6780 )
6881
@@ -82,63 +95,6 @@ def test_variant_allele(self, ds):
8295 def test_sample_id (self , ds ):
8396 nt .assert_array_equal (ds ["sample_id" ], ["s0" , "s1" , "s2" ])
8497
85- def test_missing_dependency (self ):
86- with mock .patch (
87- "importlib.import_module" ,
88- side_effect = ImportError ("No module named 'bed_reader'" ),
89- ):
90- with pytest .raises (ImportError ) as exc_info :
91- plink .convert (
92- "UNUSED_PATH" ,
93- "UNUSED_PATH" ,
94- )
95- assert (
96- "This process requires the optional bed_reader module. "
97- "Install it with: pip install bio2zarr[plink]" in str (exc_info .value )
98- )
99-
100-
101- class TestEqualSgkit :
102- def test_simulated_example (self , tmp_path ):
103- prefix = "tests/data/plink/plink_sim_10s_100v_10pmiss"
104- sg_ds = sgkit .io .plink .read_plink (
105- bed_path = prefix + ".bed" , fam_path = prefix + ".fam" , bim_path = prefix + ".bim"
106- )
107- out = tmp_path / "example.plink.zarr"
108- plink .convert (prefix = prefix , out = out )
109- ds = sg .load_dataset (out )
110- nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
111- nt .assert_array_equal (
112- ds .call_genotype_mask .values , sg_ds .call_genotype_mask .values
113- )
114- # sgkit doesn't have phased
115- nt .assert_array_equal (ds .variant_position .values , sg_ds .variant_position .values )
116- # nt.assert_array_equal(
117- # ds.variant_allele.values, sg_ds.variant_allele.values.astype("U")
118- # )
119- nt .assert_array_equal (ds .variant_contig .values , sg_ds .variant_contig .values )
120- nt .assert_array_equal (ds .variant_id .values , sg_ds .variant_id .values )
121- # print(sg_ds.variant_id.values)
122-
123- # Can't compare to sgkit because of
124- # https://github.com/sgkit-dev/sgkit/issues/1314
125- nt .assert_array_equal (
126- ds .sample_id .values ,
127- [
128- "000" ,
129- "001" ,
130- "002" ,
131- "003" ,
132- "004" ,
133- "005" ,
134- "006" ,
135- "007" ,
136- "008" ,
137- "009" ,
138- ],
139- )
140- # We don't do the additional sample_ fields yet
141-
14298
14399class TestExample :
144100 """
@@ -155,6 +111,10 @@ class TestExample:
155111 Base-pair coordinate (1-based; limited to 231-2)
156112 Allele 1 (corresponding to clear bits in .bed; usually minor)
157113 Allele 2 (corresponding to set bits in .bed; usually major)
114+
115+
116+ Note, the VCF interpretation here can be derived by running
117+ plink1.9 --bfile tests/data/plink/example --export vcf
158118 """
159119
160120 @pytest .fixture (scope = "class" )
@@ -171,7 +131,7 @@ def test_variant_position(self, ds):
171131 nt .assert_array_equal (ds .variant_position , [10 , 20 ])
172132
173133 def test_variant_allele (self , ds ):
174- nt .assert_array_equal (ds .variant_allele , [["A " , "GG " ], ["TTT " , "C " ]])
134+ nt .assert_array_equal (ds .variant_allele , [["GG " , "A " ], ["C " , "TTT " ]])
175135
176136 def test_variant_length (self , ds ):
177137 nt .assert_array_equal (ds .variant_length , [2 , 3 ])
@@ -193,42 +153,32 @@ def test_genotypes(self, ds):
193153 call_genotype ,
194154 [
195155 [
196- [0 , 0 ],
197- [0 , 0 ],
198- [0 , 0 ],
199- [1 , 1 ],
200- [1 , 1 ],
201156 [1 , 1 ],
202157 [1 , 1 ],
203158 [1 , 1 ],
204- [1 , 1 ],
205- [1 , 1 ],
206- ],
207- [
208159 [0 , 0 ],
209160 [0 , 0 ],
210161 [0 , 0 ],
211162 [0 , 0 ],
163+ [0 , 0 ],
164+ [0 , 0 ],
165+ [0 , 0 ],
166+ ],
167+ [
212168 [1 , 1 ],
213169 [1 , 1 ],
214170 [1 , 1 ],
215171 [1 , 1 ],
216- [1 , 1 ],
217- [1 , 1 ],
172+ [0 , 0 ],
173+ [0 , 0 ],
174+ [0 , 0 ],
175+ [0 , 0 ],
176+ [0 , 0 ],
177+ [0 , 0 ],
218178 ],
219179 ],
220180 )
221181
222- def test_sgkit (self , ds ):
223- path = "tests/data/plink/example"
224- sg_ds = sgkit .io .plink .read_plink (path = path )
225- # Can't compare the full dataset yet
226- nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
227- # https://github.com/pystatgen/sgkit/issues/1209
228- nt .assert_array_equal (ds .variant_allele , sg_ds .variant_allele .astype (str ))
229- nt .assert_array_equal (ds .variant_position , sg_ds .variant_position )
230- nt .assert_array_equal (ds .sample_id , sg_ds .sample_id )
231-
232182
233183class TestSimulatedExample :
234184 @pytest .fixture (scope = "class" )
@@ -259,14 +209,14 @@ def test_genotypes(self, ds):
259209 [
260210 [1 , 0 ],
261211 [1 , 0 ],
262- [0 , 0 ],
263- [0 , 0 ],
264- [- 1 , - 1 ],
265212 [1 , 1 ],
266213 [1 , 1 ],
214+ [- 1 , - 1 ],
215+ [0 , 0 ],
267216 [0 , 0 ],
268217 [1 , 1 ],
269- [1 , 1 ],
218+ [0 , 0 ],
219+ [0 , 0 ],
270220 ]
271221 )
272222 gt = ds ["call_genotype" ].values
@@ -364,7 +314,6 @@ def multi_contig_bed_path(self, tmp_path_factory):
364314 # - 1 variant on chromosome 2
365315 # - 2 variants on chromosome X
366316 # - 1 variant on chromosome Y
367- # values are counts of allele 1, will be flipped by the converter
368317 dosages = np .array (
369318 [
370319 [0 , 1 , 2 , 0 ], # chr1
@@ -406,6 +355,19 @@ def ds(self, tmp_path_factory, multi_contig_bed_path):
406355 def test_contig_ids (self , ds ):
407356 nt .assert_array_equal (ds .contig_id , ["1" , "2" , "X" , "Y" ])
408357
358+ def test_variant_allele (self , ds ):
359+ nt .assert_array_equal (
360+ ds .variant_allele ,
361+ [
362+ ["G" , "A" ],
363+ ["C" , "T" ],
364+ ["G" , "A" ],
365+ ["T" , "C" ],
366+ ["A" , "G" ],
367+ ["C" , "T" ],
368+ ],
369+ )
370+
409371 def test_variant_contig (self , ds ):
410372 nt .assert_array_equal (
411373 ds .variant_contig , [0 , 0 , 1 , 2 , 2 , 3 ]
@@ -417,12 +379,12 @@ def test_genotypes(self, ds):
417379 nt .assert_array_equal (
418380 call_genotype ,
419381 [
420- [[1 , 1 ], [1 , 0 ], [0 , 0 ], [1 , 1 ]], # chr1
421- [[1 , 0 ], [1 , 1 ], [0 , 0 ], [1 , 0 ]], # chr1
422- [[1 , 1 ], [1 , 1 ], [1 , 1 ], [0 , 0 ]], # chr2
423- [[0 , 0 ], [1 , 1 ], [1 , 0 ], [1 , 1 ]], # chrX
382+ [[0 , 0 ], [1 , 0 ], [1 , 1 ], [0 , 0 ]], # chr1
383+ [[1 , 0 ], [0 , 0 ], [1 , 1 ], [1 , 0 ]], # chr1
384+ [[0 , 0 ], [0 , 0 ], [0 , 0 ], [1 , 1 ]], # chr2
385+ [[1 , 1 ], [0 , 0 ], [1 , 0 ], [0 , 0 ]], # chrX
424386 [[1 , 0 ], [1 , 0 ], [1 , 0 ], [1 , 0 ]], # chrX
425- [[1 , 1 ], [ 0 , 0 ], [1 , 1 ], [0 , 0 ]], # chrY
387+ [[0 , 0 ], [1 , 1 ], [0 , 0 ], [ 1 , 1 ]], # chrY
426388 ],
427389 )
428390
0 commit comments