33import xarray .testing as xt
44import pytest
55import sgkit as sg
6+ import sgkit .io .plink
7+ import bed_reader
68import zarr
79
810from bio2zarr import plink
911
1012
1113class TestSmallExample :
14+ @pytest .fixture (scope = "class" )
15+ def bed_path (self , tmp_path_factory ):
16+ tmp_path = tmp_path_factory .mktemp ("data" )
17+ path = tmp_path / "example.bed"
18+ # 7 sites x 3 samples
19+ # These are counts of allele 1, so we have to flip them
20+ # to get the values we expect
21+ dosages = np .array (
22+ [
23+ [0 , 1 , 2 ],
24+ [1 , 0 , 2 ],
25+ [0 , 0 , 0 ],
26+ [- 127 , 0 , 0 ],
27+ [2 , 0 , 0 ],
28+ [- 127 , - 127 , - 127 ],
29+ [2 , 2 , 2 ],
30+ ],
31+ dtype = np .int8 ,
32+ )
33+ bed_reader .to_bed (path , dosages .T )
34+ return path
35+
36+ @pytest .fixture (scope = "class" )
37+ def ds (self , tmp_path_factory , bed_path ):
38+ tmp_path = tmp_path_factory .mktemp ("data" )
39+ zarr_path = tmp_path / "example.plink.zarr"
40+ plink .convert (bed_path , zarr_path )
41+ return sg .load_dataset (zarr_path )
42+
43+ def test_genotypes (self , ds ):
44+ call_genotype = ds .call_genotype .values
45+ assert call_genotype .shape == (7 , 3 , 2 )
46+ nt .assert_array_equal (
47+ call_genotype ,
48+ [
49+ [[1 , 1 ], [1 , 0 ], [0 , 0 ]],
50+ [[1 , 0 ], [1 , 1 ], [0 , 0 ]],
51+ [[1 , 1 ], [1 , 1 ], [1 , 1 ]],
52+ [[- 1 , - 1 ], [1 , 1 ], [1 , 1 ]],
53+ [[0 , 0 ], [1 , 1 ], [1 , 1 ]],
54+ [[- 1 , - 1 ], [- 1 , - 1 ], [- 1 , - 1 ]],
55+ [[0 , 0 ], [0 , 0 ], [0 , 0 ]],
56+ ],
57+ )
58+
59+
60+ class TestEqualSgkit :
61+ def test_simulated_example (self , tmp_path ):
62+ data_path = "tests/data/plink/"
63+ bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed"
64+ fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam"
65+ bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim"
66+ # print(bed_path)
67+ # print(fam_path)
68+ sg_ds = sgkit .io .plink .read_plink (
69+ bed_path = bed_path , fam_path = fam_path , bim_path = bim_path
70+ )
71+ out = tmp_path / "example.plink.zarr"
72+ plink .convert (bed_path , out )
73+ ds = sg .load_dataset (out )
74+ nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
75+
76+
77+ class TestExample :
78+ """
79+ .bim file looks like this:
80+
81+ 1 1_10 0 10 A G
82+ 1 1_20 0 20 T C
83+
84+ Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
85+ Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
86+ indicates unknown) or name
87+ Variant identifier
88+ Position in morgans or centimorgans (safe to use dummy value of '0')
89+ Base-pair coordinate (1-based; limited to 231-2)
90+ Allele 1 (corresponding to clear bits in .bed; usually minor)
91+ Allele 2 (corresponding to set bits in .bed; usually major)
92+ """
93+
94+ @pytest .fixture (scope = "class" )
95+ def ds (self , tmp_path_factory ):
96+ path = "tests/data/plink/example.bed"
97+ out = tmp_path_factory .mktemp ("data" ) / "example.plink.zarr"
98+ plink .convert (path , out )
99+ return sg .load_dataset (out )
100+
101+ def test_sample_ids (self , ds ):
102+ nt .assert_array_equal (ds .sample_id , [f"ind{ j } " for j in range (10 )])
103+
104+ def test_variant_position (self , ds ):
105+ nt .assert_array_equal (ds .variant_position , [10 , 20 ])
106+
107+ def test_variant_allele (self , ds ):
108+ nt .assert_array_equal (ds .variant_allele , [["A" , "G" ], ["T" , "C" ]])
109+
110+ def test_genotypes (self , ds ):
111+ call_genotype = ds .call_genotype .values
112+ assert call_genotype .shape == (2 , 10 , 2 )
113+ nt .assert_array_equal (
114+ call_genotype ,
115+ [
116+ [
117+ [0 , 0 ],
118+ [0 , 0 ],
119+ [0 , 0 ],
120+ [1 , 1 ],
121+ [1 , 1 ],
122+ [1 , 1 ],
123+ [1 , 1 ],
124+ [1 , 1 ],
125+ [1 , 1 ],
126+ [1 , 1 ],
127+ ],
128+ [
129+ [0 , 0 ],
130+ [0 , 0 ],
131+ [0 , 0 ],
132+ [0 , 0 ],
133+ [1 , 1 ],
134+ [1 , 1 ],
135+ [1 , 1 ],
136+ [1 , 1 ],
137+ [1 , 1 ],
138+ [1 , 1 ],
139+ ],
140+ ],
141+ )
142+
143+ def test_sgkit (self , ds ):
144+ path = "tests/data/plink/example"
145+ sg_ds = sgkit .io .plink .read_plink (path = path )
146+ # Can't compare the full dataset yet
147+ nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
148+ # https://github.com/pystatgen/sgkit/issues/1209
149+ nt .assert_array_equal (ds .variant_allele , sg_ds .variant_allele .astype (str ))
150+ nt .assert_array_equal (ds .variant_position , sg_ds .variant_position )
151+ nt .assert_array_equal (ds .sample_id , sg_ds .sample_id )
152+
153+
154+ class TestSimulatedExample :
12155 @pytest .fixture (scope = "class" )
13156 def ds (self , tmp_path_factory ):
14157 path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
15158 out = tmp_path_factory .mktemp ("data" ) / "example.plink.zarr"
16159 plink .convert (path , out )
17160 return sg .load_dataset (out )
18161
19- @pytest .mark .xfail
20- # FIXME I'm not sure these are the correct genotypes here, at least
21- # the test isn't passing and others are
22162 def test_genotypes (self , ds ):
23163 # Validate a few randomly selected individual calls
24164 # (spanning all possible states for a call)
@@ -82,13 +222,6 @@ def test_chunk_size(
82222 worker_processes = worker_processes ,
83223 )
84224 ds2 = sg .load_dataset (out )
85- # print()
86- # print(ds.call_genotype.values[2])
87- # print(ds2.call_genotype.values[2])
88-
89- # print(ds2)
90- # print(ds2.call_genotype.values)
91- # print(ds.call_genotype.values)
92225 xt .assert_equal (ds , ds2 )
93226 # TODO check array chunks
94227
@@ -101,8 +234,6 @@ def test_chunk_size(
101234 (33 , 3 ),
102235 (99 , 10 ),
103236 (3 , 10 ),
104- # This one doesn't fail as it's the same as defaults
105- # (100, 10),
106237 ],
107238)
108239@pytest .mark .parametrize ("worker_processes" , [0 ])
0 commit comments