Skip to content

Commit af4933c

Browse files
Fixup tests for plink
Cover some corner cases in the plink output
1 parent 5775fd6 commit af4933c

File tree

2 files changed

+42
-49
lines changed

2 files changed

+42
-49
lines changed

bio2zarr/plink.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pathlib
44

55
import numpy as np
6-
import zarr
76

87
from bio2zarr import constants, core, vcz
98

@@ -216,41 +215,3 @@ def convert(
216215
)
217216
vzw.finalise(show_progress)
218217
vzw.create_index()
219-
220-
221-
# FIXME do this more efficiently - currently reading the whole thing
222-
# in for convenience, and also comparing call-by-call
223-
# TODO we should remove this function from the API - it's a test function
224-
# and should be moved into the suite
225-
@core.requires_optional_dependency("bed_reader", "plink")
226-
def validate(bed_path, zarr_path):
227-
import bed_reader
228-
229-
root = zarr.open(store=zarr_path, mode="r")
230-
call_genotype = root["call_genotype"][:]
231-
232-
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1)
233-
234-
assert call_genotype.shape[0] == bed.sid_count
235-
assert call_genotype.shape[1] == bed.iid_count
236-
bed_genotypes = bed.read(dtype="int8").T
237-
assert call_genotype.shape[0] == bed_genotypes.shape[0]
238-
assert call_genotype.shape[1] == bed_genotypes.shape[1]
239-
assert call_genotype.shape[2] == 2
240-
241-
row_id = 0
242-
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
243-
# print("ROW", row_id)
244-
# print(bed_row, zarr_row)
245-
row_id += 1
246-
for bed_call, zarr_call in zip(bed_row, zarr_row):
247-
if bed_call == -127:
248-
assert list(zarr_call) == [-1, -1]
249-
elif bed_call == 0:
250-
assert list(zarr_call) == [0, 0]
251-
elif bed_call == 1:
252-
assert list(zarr_call) == [1, 0]
253-
elif bed_call == 2:
254-
assert list(zarr_call) == [1, 1]
255-
else: # pragma no cover
256-
raise AssertionError(f"Unexpected bed call {bed_call}")

tests/test_plink.py

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77
import sgkit as sg
88
import xarray.testing as xt
9+
import zarr
910

1011
from bio2zarr import plink, vcf
1112

@@ -69,8 +70,8 @@ def test_genotypes(self, ds):
6970
nt.assert_array_equal(
7071
call_genotype,
7172
[
72-
[[0, 0], [1, 0], [1, 1]],
73-
[[1, 0], [0, 0], [1, 1]],
73+
[[0, 0], [0, 1], [1, 1]],
74+
[[0, 1], [0, 0], [1, 1]],
7475
[[0, 0], [0, 0], [0, 0]],
7576
[[-1, -1], [0, 0], [0, 0]],
7677
[[1, 1], [0, 0], [0, 0]],
@@ -134,7 +135,7 @@ def test_variant_allele(self, ds):
134135
nt.assert_array_equal(ds.variant_allele, [["GG", "A"], ["C", "TTT"]])
135136

136137
def test_variant_length(self, ds):
137-
nt.assert_array_equal(ds.variant_length, [2, 3])
138+
nt.assert_array_equal(ds.variant_length, [2, 1])
138139

139140
def test_contig_id(self, ds):
140141
"""Test that contig identifiers are correctly extracted and stored."""
@@ -207,8 +208,8 @@ def test_genotypes(self, ds):
207208
)
208209
expected = np.array(
209210
[
210-
[1, 0],
211-
[1, 0],
211+
[0, 1],
212+
[0, 1],
212213
[1, 1],
213214
[1, 1],
214215
[-1, -1],
@@ -276,6 +277,37 @@ def test_chunk_size(
276277
# TODO check array chunks
277278

278279

280+
def validate(bed_path, zarr_path):
281+
root = zarr.open(store=zarr_path, mode="r")
282+
call_genotype = root["call_genotype"][:]
283+
284+
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1)
285+
286+
assert call_genotype.shape[0] == bed.sid_count
287+
assert call_genotype.shape[1] == bed.iid_count
288+
bed_genotypes = bed.read(dtype="int8").T
289+
assert call_genotype.shape[0] == bed_genotypes.shape[0]
290+
assert call_genotype.shape[1] == bed_genotypes.shape[1]
291+
assert call_genotype.shape[2] == 2
292+
293+
row_id = 0
294+
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
295+
# print("ROW", row_id)
296+
# print(bed_row, zarr_row)
297+
row_id += 1
298+
for bed_call, zarr_call in zip(bed_row, zarr_row):
299+
if bed_call == -127:
300+
assert list(zarr_call) == [-1, -1]
301+
elif bed_call == 0:
302+
assert list(zarr_call) == [0, 0]
303+
elif bed_call == 1:
304+
assert list(zarr_call) == [0, 1]
305+
elif bed_call == 2:
306+
assert list(zarr_call) == [1, 1]
307+
else: # pragma no cover
308+
raise AssertionError(f"Unexpected bed call {bed_call}")
309+
310+
279311
@pytest.mark.parametrize(
280312
("variants_chunk_size", "samples_chunk_size"),
281313
[
@@ -299,7 +331,7 @@ def test_by_validating(
299331
samples_chunk_size=samples_chunk_size,
300332
worker_processes=worker_processes,
301333
)
302-
plink.validate(path, out)
334+
validate(path, out)
303335

304336

305337
class TestMultipleContigs:
@@ -379,11 +411,11 @@ def test_genotypes(self, ds):
379411
nt.assert_array_equal(
380412
call_genotype,
381413
[
382-
[[0, 0], [1, 0], [1, 1], [0, 0]], # chr1
383-
[[1, 0], [0, 0], [1, 1], [1, 0]], # chr1
414+
[[0, 0], [0, 1], [1, 1], [0, 0]], # chr1
415+
[[0, 1], [0, 0], [1, 1], [0, 1]], # chr1
384416
[[0, 0], [0, 0], [0, 0], [1, 1]], # chr2
385-
[[1, 1], [0, 0], [1, 0], [0, 0]], # chrX
386-
[[1, 0], [1, 0], [1, 0], [1, 0]], # chrX
417+
[[1, 1], [0, 0], [0, 1], [0, 0]], # chrX
418+
[[0, 1], [0, 1], [0, 1], [0, 1]], # chrX
387419
[[0, 0], [1, 1], [0, 0], [1, 1]], # chrY
388420
],
389421
)

0 commit comments

Comments
 (0)