6
6
import pytest
7
7
import sgkit as sg
8
8
import xarray .testing as xt
9
+ import zarr
9
10
10
11
from bio2zarr import plink , vcf
11
12
@@ -69,8 +70,8 @@ def test_genotypes(self, ds):
69
70
nt .assert_array_equal (
70
71
call_genotype ,
71
72
[
72
- [[0 , 0 ], [1 , 0 ], [1 , 1 ]],
73
- [[1 , 0 ], [0 , 0 ], [1 , 1 ]],
73
+ [[0 , 0 ], [0 , 1 ], [1 , 1 ]],
74
+ [[0 , 1 ], [0 , 0 ], [1 , 1 ]],
74
75
[[0 , 0 ], [0 , 0 ], [0 , 0 ]],
75
76
[[- 1 , - 1 ], [0 , 0 ], [0 , 0 ]],
76
77
[[1 , 1 ], [0 , 0 ], [0 , 0 ]],
@@ -134,7 +135,7 @@ def test_variant_allele(self, ds):
134
135
nt .assert_array_equal (ds .variant_allele , [["GG" , "A" ], ["C" , "TTT" ]])
135
136
136
137
def test_variant_length (self , ds ):
137
- nt .assert_array_equal (ds .variant_length , [2 , 3 ])
138
+ nt .assert_array_equal (ds .variant_length , [2 , 1 ])
138
139
139
140
def test_contig_id (self , ds ):
140
141
"""Test that contig identifiers are correctly extracted and stored."""
@@ -207,8 +208,8 @@ def test_genotypes(self, ds):
207
208
)
208
209
expected = np .array (
209
210
[
210
- [1 , 0 ],
211
- [1 , 0 ],
211
+ [0 , 1 ],
212
+ [0 , 1 ],
212
213
[1 , 1 ],
213
214
[1 , 1 ],
214
215
[- 1 , - 1 ],
@@ -276,6 +277,37 @@ def test_chunk_size(
276
277
# TODO check array chunks
277
278
278
279
280
+ def validate (bed_path , zarr_path ):
281
+ root = zarr .open (store = zarr_path , mode = "r" )
282
+ call_genotype = root ["call_genotype" ][:]
283
+
284
+ bed = bed_reader .open_bed (bed_path + ".bed" , count_A1 = True , num_threads = 1 )
285
+
286
+ assert call_genotype .shape [0 ] == bed .sid_count
287
+ assert call_genotype .shape [1 ] == bed .iid_count
288
+ bed_genotypes = bed .read (dtype = "int8" ).T
289
+ assert call_genotype .shape [0 ] == bed_genotypes .shape [0 ]
290
+ assert call_genotype .shape [1 ] == bed_genotypes .shape [1 ]
291
+ assert call_genotype .shape [2 ] == 2
292
+
293
+ row_id = 0
294
+ for bed_row , zarr_row in zip (bed_genotypes , call_genotype ):
295
+ # print("ROW", row_id)
296
+ # print(bed_row, zarr_row)
297
+ row_id += 1
298
+ for bed_call , zarr_call in zip (bed_row , zarr_row ):
299
+ if bed_call == - 127 :
300
+ assert list (zarr_call ) == [- 1 , - 1 ]
301
+ elif bed_call == 0 :
302
+ assert list (zarr_call ) == [0 , 0 ]
303
+ elif bed_call == 1 :
304
+ assert list (zarr_call ) == [0 , 1 ]
305
+ elif bed_call == 2 :
306
+ assert list (zarr_call ) == [1 , 1 ]
307
+ else : # pragma no cover
308
+ raise AssertionError (f"Unexpected bed call { bed_call } " )
309
+
310
+
279
311
@pytest .mark .parametrize (
280
312
("variants_chunk_size" , "samples_chunk_size" ),
281
313
[
@@ -299,7 +331,7 @@ def test_by_validating(
299
331
samples_chunk_size = samples_chunk_size ,
300
332
worker_processes = worker_processes ,
301
333
)
302
- plink . validate (path , out )
334
+ validate (path , out )
303
335
304
336
305
337
class TestMultipleContigs :
@@ -379,11 +411,11 @@ def test_genotypes(self, ds):
379
411
nt .assert_array_equal (
380
412
call_genotype ,
381
413
[
382
- [[0 , 0 ], [1 , 0 ], [1 , 1 ], [0 , 0 ]], # chr1
383
- [[1 , 0 ], [0 , 0 ], [1 , 1 ], [1 , 0 ]], # chr1
414
+ [[0 , 0 ], [0 , 1 ], [1 , 1 ], [0 , 0 ]], # chr1
415
+ [[0 , 1 ], [0 , 0 ], [1 , 1 ], [0 , 1 ]], # chr1
384
416
[[0 , 0 ], [0 , 0 ], [0 , 0 ], [1 , 1 ]], # chr2
385
- [[1 , 1 ], [0 , 0 ], [1 , 0 ], [0 , 0 ]], # chrX
386
- [[1 , 0 ], [1 , 0 ], [1 , 0 ], [1 , 0 ]], # chrX
417
+ [[1 , 1 ], [0 , 0 ], [0 , 1 ], [0 , 0 ]], # chrX
418
+ [[0 , 1 ], [0 , 1 ], [0 , 1 ], [0 , 1 ]], # chrX
387
419
[[0 , 0 ], [1 , 1 ], [0 , 0 ], [1 , 1 ]], # chrY
388
420
],
389
421
)
0 commit comments