Skip to content

Commit cbbf504

Browse files
Switch plink hets from [1, 0] to [0, 1]
Fixup tests for plink Covert some corner cases in the plink outout
1 parent afdd452 commit cbbf504

File tree

4 files changed

+103
-110
lines changed

4 files changed

+103
-110
lines changed

bio2zarr/cli.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -562,8 +562,8 @@ def convert_plink(
562562
):
563563
"""
564564
Convert plink fileset to VCF Zarr. Results are equivalent to
565-
`plink1.9 --bfile prefix --recode vcf-iid --out tmp` then running
566-
`vcf2zarr convert tmp.vcf zarr_path`
565+
`plink1.9 --bfile prefix --keep-allele-order --recode vcf-iid --out tmp`
566+
then running `vcf2zarr convert tmp.vcf zarr_path`
567567
"""
568568
setup_logging(verbose)
569569
plink.convert(

bio2zarr/plink.py

Lines changed: 1 addition & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import pathlib
44

55
import numpy as np
6-
import zarr
76

87
from bio2zarr import constants, core, vcz
98

@@ -86,7 +85,7 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
8685
gt[:] = 0
8786
gt[bed_chunk[i] == -127] = -1
8887
gt[bed_chunk[i] == 2] = 1
89-
gt[bed_chunk[i] == 1, 0] = 1
88+
gt[bed_chunk[i] == 1, 1] = 1
9089

9190
# rlen is the length of the REF in PLINK as there's no END annotations
9291
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
@@ -216,41 +215,3 @@ def convert(
216215
)
217216
vzw.finalise(show_progress)
218217
vzw.create_index()
219-
220-
221-
# FIXME do this more efficiently - currently reading the whole thing
222-
# in for convenience, and also comparing call-by-call
223-
# TODO we should remove this function from the API - it's a test function
224-
# and should be moved into the suite
225-
@core.requires_optional_dependency("bed_reader", "plink")
226-
def validate(bed_path, zarr_path):
227-
import bed_reader
228-
229-
root = zarr.open(store=zarr_path, mode="r")
230-
call_genotype = root["call_genotype"][:]
231-
232-
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1)
233-
234-
assert call_genotype.shape[0] == bed.sid_count
235-
assert call_genotype.shape[1] == bed.iid_count
236-
bed_genotypes = bed.read(dtype="int8").T
237-
assert call_genotype.shape[0] == bed_genotypes.shape[0]
238-
assert call_genotype.shape[1] == bed_genotypes.shape[1]
239-
assert call_genotype.shape[2] == 2
240-
241-
row_id = 0
242-
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
243-
# print("ROW", row_id)
244-
# print(bed_row, zarr_row)
245-
row_id += 1
246-
for bed_call, zarr_call in zip(bed_row, zarr_row):
247-
if bed_call == -127:
248-
assert list(zarr_call) == [-1, -1]
249-
elif bed_call == 0:
250-
assert list(zarr_call) == [0, 0]
251-
elif bed_call == 1:
252-
assert list(zarr_call) == [1, 0]
253-
elif bed_call == 2:
254-
assert list(zarr_call) == [1, 1]
255-
else: # pragma no cover
256-
raise AssertionError(f"Unexpected bed call {bed_call}")

tests/data/plink/plink_sim_10s_100v_10pmiss.vcf

Lines changed: 57 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -5,103 +5,103 @@
55
##INFO=<ID=PR,Number=0,Type=Flag,Description="Provisional reference allele, may not be based on real reference genome">
66
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
77
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT 000 001 002 003 004 005 006 007 008 009
8-
1 1 1:1:G:CGCGCG CGCGCG G . . PR GT 0/1 0/0 0/1 0/1 1/1 1/1 0/0 0/0 0/1 0/0
8+
1 1 1:1:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 0/1 0/0 0/0 1/1 1/1 0/1 1/1
99
1 2 1:2:ACT:G ACT G . . PR GT 0/1 0/0 1/1 0/1 0/0 0/1 0/1 0/1 0/1 1/1
10-
1 3 1:3:ACT:G G ACT . . PR GT 0/1 ./. 0/0 0/0 0/0 1/1 0/1 0/1 0/1 0/1
11-
1 4 1:4:G:CGCGCG CGCGCG G . . PR GT 0/1 0/1 ./. 1/1 0/1 0/0 0/0 0/1 0/0 0/1
10+
1 3 1:3:ACT:G ACT G . . PR GT 0/1 ./. 1/1 1/1 1/1 0/0 0/1 0/1 0/1 0/1
11+
1 4 1:4:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 ./. 0/0 0/1 1/1 1/1 0/1 1/1 0/1
1212
1 5 1:5:G:CGCGCG G CGCGCG . . PR GT ./. 0/0 1/1 1/1 0/1 1/1 ./. 0/0 0/1 0/0
13-
1 6 1:6:ACT:G G ACT . . PR GT 1/1 0/0 1/1 1/1 0/0 0/1 0/1 0/0 0/1 0/0
13+
1 6 1:6:ACT:G ACT G . . PR GT 0/0 1/1 0/0 0/0 1/1 0/1 0/1 1/1 0/1 1/1
1414
1 7 1:7:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/1 0/1 0/0 0/1 0/1 0/1 ./. 0/0
15-
1 8 1:8:T:GTGG GTGG T . . PR GT ./. 0/0 0/1 0/0 0/0 0/0 ./. 0/0 0/1 0/0
16-
1 9 1:9:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/0 0/0
17-
1 10 1:10:A:C C A . . PR GT 0/1 ./. 0/1 0/1 0/1 0/1 ./. 0/1 0/1 0/0
18-
1 11 1:11:ACT:G G ACT . . PR GT 0/0 0/0 0/0 0/0 ./. 0/0 0/0 ./. 0/0 ./.
19-
1 12 1:12:G:CGCGCG CGCGCG G . . PR GT 0/0 0/0 ./. 1/1 0/1 0/0 0/1 ./. 1/1 0/0
20-
1 13 1:13:G:CGCGCG CGCGCG G . . PR GT 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0
21-
1 14 1:14:T:GTGG GTGG T . . PR GT 0/0 ./. 0/1 0/0 0/1 0/0 ./. 0/1 ./. ./.
22-
1 15 1:15:ACT:G G ACT . . PR GT 0/1 0/0 1/1 0/0 0/0 0/0 1/1 0/0 0/1 0/1
15+
1 8 1:8:T:GTGG T GTGG . . PR GT ./. 1/1 0/1 1/1 1/1 1/1 ./. 1/1 0/1 1/1
16+
1 9 1:9:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 ./. 1/1 1/1
17+
1 10 1:10:A:C A C . . PR GT 0/1 ./. 0/1 0/1 0/1 0/1 ./. 0/1 0/1 1/1
18+
1 11 1:11:ACT:G ACT G . . PR GT 1/1 1/1 1/1 1/1 ./. 1/1 1/1 ./. 1/1 ./.
19+
1 12 1:12:G:CGCGCG G CGCGCG . . PR GT 1/1 1/1 ./. 0/0 0/1 1/1 0/1 ./. 0/0 1/1
20+
1 13 1:13:G:CGCGCG G CGCGCG . . PR GT 1/1 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 1/1
21+
1 14 1:14:T:GTGG T GTGG . . PR GT 1/1 ./. 0/1 1/1 0/1 1/1 ./. 0/1 ./. ./.
22+
1 15 1:15:ACT:G ACT G . . PR GT 0/1 1/1 0/0 1/1 1/1 1/1 0/0 1/1 0/1 0/1
2323
1 16 1:16:A:C A C . . PR GT 0/0 0/0 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0
24-
1 17 1:17:ACT:G G ACT . . PR GT 0/0 0/0 0/0 0/1 0/1 0/0 1/1 ./. 0/0 0/0
25-
1 18 1:18:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
26-
1 19 1:19:A:C C A . . PR GT 0/1 0/1 1/1 0/1 0/1 ./. 0/0 0/1 0/0 0/1
24+
1 17 1:17:ACT:G ACT G . . PR GT 1/1 1/1 1/1 0/1 0/1 1/1 0/0 ./. 1/1 1/1
25+
1 18 1:18:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1
26+
1 19 1:19:A:C A C . . PR GT 0/1 0/1 0/0 0/1 0/1 ./. 1/1 0/1 1/1 0/1
2727
1 20 1:20:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/1 0/0
28-
1 21 1:21:T:GTGG GTGG T . . PR GT 0/0 0/1 0/0 0/0 0/0 0/0 0/1 0/1 0/0 0/0
29-
1 22 1:22:G:CGCGCG CGCGCG G . . PR GT 0/1 0/1 0/1 ./. 0/0 0/0 ./. 0/0 0/1 ./.
30-
1 23 1:23:T:GTGG GTGG T . . PR GT 0/0 0/1 0/1 ./. ./. 0/1 0/1 0/1 0/0 0/1
28+
1 21 1:21:T:GTGG T GTGG . . PR GT 1/1 0/1 1/1 1/1 1/1 1/1 0/1 0/1 1/1 1/1
29+
1 22 1:22:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 0/1 ./. 1/1 1/1 ./. 1/1 0/1 ./.
30+
1 23 1:23:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 ./. ./. 0/1 0/1 0/1 1/1 0/1
3131
1 24 1:24:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/1 0/1 0/0 0/0 0/0 0/0
3232
1 25 1:25:A:C A C . . PR GT 0/0 0/0 ./. ./. ./. ./. 0/0 0/1 0/1 0/0
33-
1 26 1:26:ACT:G G ACT . . PR GT 0/1 0/0 0/0 0/1 0/0 ./. 0/0 0/1 0/1 0/0
33+
1 26 1:26:ACT:G ACT G . . PR GT 0/1 1/1 1/1 0/1 1/1 ./. 1/1 0/1 0/1 1/1
3434
1 27 1:27:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/0 0/0 ./. 0/0 ./. 0/1 0/0 0/0
3535
1 28 1:28:ACT:G ACT G . . PR GT 0/0 1/1 0/0 0/1 0/0 ./. 0/1 0/0 0/1 1/1
36-
1 29 1:29:T:GTGG GTGG T . . PR GT 0/0 0/1 0/0 0/0 ./. ./. ./. 0/0 0/1 0/0
36+
1 29 1:29:T:GTGG T GTGG . . PR GT 1/1 0/1 1/1 1/1 ./. ./. ./. 1/1 0/1 1/1
3737
1 30 1:30:A:C A C . . PR GT 0/0 0/0 0/0 0/1 0/0 0/1 0/1 0/0 0/1 0/0
38-
1 31 1:31:T:GTGG GTGG T . . PR GT 1/1 0/0 0/0 0/0 0/0 0/1 0/0 ./. 0/1 0/0
38+
1 31 1:31:T:GTGG T GTGG . . PR GT 0/0 1/1 1/1 1/1 1/1 0/1 1/1 ./. 0/1 1/1
3939
1 32 1:32:G:CGCGCG G CGCGCG . . PR GT 0/1 0/1 0/0 0/0 ./. 0/0 1/1 ./. 0/0 0/1
40-
1 33 1:33:ACT:G G ACT . . PR GT 0/1 0/1 0/1 0/1 0/0 0/1 0/0 0/1 0/1 0/0
40+
1 33 1:33:ACT:G ACT G . . PR GT 0/1 0/1 0/1 0/1 1/1 0/1 1/1 0/1 0/1 1/1
4141
1 34 1:34:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 ./. 0/1 ./. 0/1 0/0 0/0 ./.
4242
1 35 1:35:A:C A C . . PR GT 0/0 0/1 0/0 0/0 0/0 0/0 0/0 0/0 ./. 0/1
43-
1 36 1:36:G:CGCGCG CGCGCG G . . PR GT ./. 1/1 ./. 0/0 0/0 0/0 0/0 0/0 0/1 0/0
44-
1 37 1:37:T:GTGG GTGG T . . PR GT 0/0 0/1 0/1 0/0 0/1 0/0 0/0 0/0 0/0 0/0
43+
1 36 1:36:G:CGCGCG G CGCGCG . . PR GT ./. 0/0 ./. 1/1 1/1 1/1 1/1 1/1 0/1 1/1
44+
1 37 1:37:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1
4545
1 38 1:38:A:C A C . . PR GT 0/1 0/0 0/0 ./. 1/1 1/1 ./. 0/0 0/1 0/0
4646
1 39 1:39:A:C A C . . PR GT 0/0 0/0 0/1 0/1 0/1 0/0 0/0 0/0 0/1 0/0
47-
1 40 1:40:T:GTGG GTGG T . . PR GT 0/0 0/0 1/1 0/1 ./. 0/1 0/0 0/1 0/0 0/0
47+
1 40 1:40:T:GTGG T GTGG . . PR GT 1/1 1/1 0/0 0/1 ./. 0/1 1/1 0/1 1/1 1/1
4848
1 41 1:41:A:C A C . . PR GT 0/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
49-
1 42 1:42:G:CGCGCG CGCGCG G . . PR GT 0/1 ./. 0/1 0/1 0/1 0/0 0/1 0/1 0/0 1/1
50-
1 43 1:43:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 0/0 ./. 0/0 0/0 0/0 ./. 0/0
51-
1 44 1:44:ACT:G G ACT . . PR GT 0/0 1/1 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
52-
1 45 1:45:G:CGCGCG CGCGCG G . . PR GT 1/1 0/0 0/0 0/0 0/1 0/0 0/0 ./. 0/0 0/1
53-
1 46 1:46:ACT:G G ACT . . PR GT 0/0 0/0 0/0 0/1 0/1 0/0 0/1 0/0 0/1 0/0
54-
1 47 1:47:G:CGCGCG CGCGCG G . . PR GT 0/0 0/1 ./. 0/1 0/0 0/0 0/1 0/1 0/1 1/1
49+
1 42 1:42:G:CGCGCG G CGCGCG . . PR GT 0/1 ./. 0/1 0/1 0/1 1/1 0/1 0/1 1/1 0/0
50+
1 43 1:43:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 ./. 1/1 1/1 1/1 ./. 1/1
51+
1 44 1:44:ACT:G ACT G . . PR GT 1/1 0/0 1/1 1/1 1/1 1/1 1/1 1/1 1/1 1/1
52+
1 45 1:45:G:CGCGCG G CGCGCG . . PR GT 0/0 1/1 1/1 1/1 0/1 1/1 1/1 ./. 1/1 0/1
53+
1 46 1:46:ACT:G ACT G . . PR GT 1/1 1/1 1/1 0/1 0/1 1/1 0/1 1/1 0/1 1/1
54+
1 47 1:47:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 ./. 0/1 1/1 1/1 0/1 0/1 0/1 0/0
5555
1 48 1:48:A:C A C . . PR GT 0/1 0/0 0/1 ./. 0/0 0/0 0/1 0/0 0/0 0/0
5656
1 49 1:49:A:C A C . . PR GT 0/0 0/1 0/0 0/1 0/0 0/0 0/0 0/0 ./. 0/0
5757
1 50 1:50:A:C A C . . PR GT 0/1 ./. ./. ./. 0/0 0/0 ./. ./. 0/0 0/1
5858
1 51 1:51:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 1/1 0/0 0/0 0/1 0/1 0/1 0/1 0/0
5959
1 52 1:52:A:C A C . . PR GT 0/0 0/0 0/0 0/1 0/0 0/0 0/1 ./. ./. 0/1
60-
1 53 1:53:ACT:G G ACT . . PR GT 0/0 ./. 0/1 0/0 0/0 ./. 0/0 0/1 0/1 1/1
60+
1 53 1:53:ACT:G ACT G . . PR GT 1/1 ./. 0/1 1/1 1/1 ./. 1/1 0/1 0/1 0/0
6161
1 54 1:54:A:C A C . . PR GT 0/1 ./. 0/0 0/1 ./. ./. ./. 1/1 0/0 0/0
62-
1 55 1:55:G:CGCGCG CGCGCG G . . PR GT 0/1 0/0 0/1 0/0 0/1 0/1 ./. 0/1 0/0 0/1
63-
1 56 1:56:T:GTGG GTGG T . . PR GT 0/0 0/1 ./. 0/1 0/0 0/1 0/1 0/0 1/1 ./.
62+
1 55 1:55:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 0/1 1/1 0/1 0/1 ./. 0/1 1/1 0/1
63+
1 56 1:56:T:GTGG T GTGG . . PR GT 1/1 0/1 ./. 0/1 1/1 0/1 0/1 1/1 0/0 ./.
6464
1 57 1:57:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 1/1 0/1 0/1 ./. 0/0 1/1 0/0 1/1
65-
1 58 1:58:A:C C A . . PR GT 0/0 0/0 0/1 0/0 0/0 0/1 1/1 0/1 0/0 0/1
66-
1 59 1:59:T:GTGG GTGG T . . PR GT 0/1 0/1 0/0 0/0 ./. 0/1 0/1 ./. 0/1 0/1
67-
1 60 1:60:G:CGCGCG CGCGCG G . . PR GT 0/0 0/1 0/1 0/1 1/1 0/0 1/1 0/1 0/1 0/0
68-
1 61 1:61:ACT:G G ACT . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 ./.
65+
1 58 1:58:A:C A C . . PR GT 1/1 1/1 0/1 1/1 1/1 0/1 0/0 0/1 1/1 0/1
66+
1 59 1:59:T:GTGG T GTGG . . PR GT 0/1 0/1 1/1 1/1 ./. 0/1 0/1 ./. 0/1 0/1
67+
1 60 1:60:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 0/1 0/1 0/0 1/1 0/0 0/1 0/1 1/1
68+
1 61 1:61:ACT:G ACT G . . PR GT 1/1 1/1 1/1 1/1 1/1 1/1 1/1 0/1 1/1 ./.
6969
1 62 1:62:A:C A C . . PR GT 1/1 0/0 0/0 0/0 0/0 ./. 0/0 0/1 0/1 ./.
70-
1 63 1:63:G:CGCGCG CGCGCG G . . PR GT 0/1 ./. 1/1 0/0 0/1 ./. 0/1 0/0 0/1 0/0
71-
1 64 1:64:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/0 0/0
72-
1 65 1:65:T:GTGG GTGG T . . PR GT 0/1 0/0 0/0 0/0 0/1 0/1 0/0 0/1 0/0 0/0
70+
1 63 1:63:G:CGCGCG G CGCGCG . . PR GT 0/1 ./. 0/0 1/1 0/1 ./. 0/1 1/1 0/1 1/1
71+
1 64 1:64:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1 1/1 1/1
72+
1 65 1:65:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 1/1 0/1 0/1 1/1 0/1 1/1 1/1
7373
1 66 1:66:ACT:G ACT G . . PR GT 1/1 0/0 0/1 0/0 0/0 0/1 0/0 0/1 0/1 0/0
74-
1 67 1:67:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 0/0 0/0 ./. 0/0 0/0 0/0 0/1
75-
1 68 1:68:ACT:G G ACT . . PR GT 0/1 0/0 0/0 ./. 0/0 0/1 0/0 0/1 0/0 0/1
74+
1 67 1:67:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 1/1 1/1 ./. 1/1 1/1 1/1 0/1
75+
1 68 1:68:ACT:G ACT G . . PR GT 0/1 1/1 1/1 ./. 1/1 0/1 1/1 0/1 1/1 0/1
7676
1 69 1:69:G:CGCGCG G CGCGCG . . PR GT 0/1 0/0 0/1 0/0 0/1 0/0 0/0 0/1 0/1 0/0
77-
1 70 1:70:G:CGCGCG CGCGCG G . . PR GT 0/1 0/0 0/0 0/1 1/1 0/0 0/0 0/0 1/1 0/1
78-
1 71 1:71:ACT:G G ACT . . PR GT 0/1 0/0 0/1 1/1 1/1 0/1 0/0 0/0 0/0 0/1
79-
1 72 1:72:G:CGCGCG CGCGCG G . . PR GT 0/0 0/1 0/0 0/1 ./. ./. 0/0 0/0 1/1 0/1
77+
1 70 1:70:G:CGCGCG G CGCGCG . . PR GT 0/1 1/1 1/1 0/1 0/0 1/1 1/1 1/1 0/0 0/1
78+
1 71 1:71:ACT:G ACT G . . PR GT 0/1 1/1 0/1 0/0 0/0 0/1 1/1 1/1 1/1 0/1
79+
1 72 1:72:G:CGCGCG G CGCGCG . . PR GT 1/1 0/1 1/1 0/1 ./. ./. 1/1 1/1 0/0 0/1
8080
1 73 1:73:A:C A C . . PR GT 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0
8181
1 74 1:74:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/0 0/0 1/1 1/1 0/0 0/1
82-
1 75 1:75:T:GTGG GTGG T . . PR GT 0/0 0/1 0/1 0/0 0/0 0/0 0/1 0/0 0/0 0/0
82+
1 75 1:75:T:GTGG T GTGG . . PR GT 1/1 0/1 0/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1
8383
1 76 1:76:A:C A C . . PR GT 0/0 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0 0/0
84-
1 77 1:77:ACT:G G ACT . . PR GT ./. 0/0 0/0 0/0 0/1 0/1 ./. 0/0 0/0 0/0
85-
1 78 1:78:ACT:G G ACT . . PR GT 0/0 0/1 0/0 ./. 0/0 0/0 0/0 0/0 0/0 0/0
84+
1 77 1:77:ACT:G ACT G . . PR GT ./. 1/1 1/1 1/1 0/1 0/1 ./. 1/1 1/1 1/1
85+
1 78 1:78:ACT:G ACT G . . PR GT 1/1 0/1 1/1 ./. 1/1 1/1 1/1 1/1 1/1 1/1
8686
1 79 1:79:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/1 0/0 0/1
8787
1 80 1:80:A:C A C . . PR GT 0/0 0/0 0/0 ./. 0/0 0/0 0/1 0/0 ./. 0/0
8888
1 81 1:81:A:C A C . . PR GT 0/0 0/1 ./. 0/1 ./. 0/0 0/0 1/1 0/1 0/1
89-
1 82 1:82:T:GTGG GTGG T . . PR GT 0/0 0/0 0/1 0/0 0/0 0/0 0/1 0/0 0/1 0/0
89+
1 82 1:82:T:GTGG T GTGG . . PR GT 1/1 1/1 0/1 1/1 1/1 1/1 0/1 1/1 0/1 1/1
9090
1 83 1:83:A:C A C . . PR GT 0/0 0/1 0/0 0/0 1/1 ./. 0/0 0/0 0/1 0/1
91-
1 84 1:84:ACT:G G ACT . . PR GT 0/0 ./. 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0
91+
1 84 1:84:ACT:G ACT G . . PR GT 1/1 ./. 1/1 1/1 1/1 1/1 0/1 1/1 1/1 1/1
9292
1 85 1:85:A:C A C . . PR GT 0/0 0/1 0/0 0/1 0/0 0/0 0/0 0/0 0/0 0/0
9393
1 86 1:86:G:CGCGCG G CGCGCG . . PR GT 0/0 0/0 1/1 0/1 0/0 0/1 0/1 0/1 0/1 1/1
94-
1 87 1:87:ACT:G G ACT . . PR GT 0/1 0/0 0/0 0/0 0/1 0/0 0/0 0/1 0/0 0/0
94+
1 87 1:87:ACT:G ACT G . . PR GT 0/1 1/1 1/1 1/1 0/1 1/1 1/1 0/1 1/1 1/1
9595
1 88 1:88:A:C A C . . PR GT 0/1 0/1 0/1 0/0 ./. ./. 0/0 0/1 0/0 0/0
9696
1 89 1:89:A:C A C . . PR GT 0/0 0/0 0/1 0/0 0/0 ./. 0/0 0/0 0/0 1/1
9797
1 90 1:90:T:GTGG T GTGG . . PR GT 0/1 0/1 0/0 0/1 ./. 0/1 0/1 1/1 0/0 0/1
98-
1 91 1:91:T:GTGG GTGG T . . PR GT 0/1 0/0 0/0 0/1 0/0 0/0 0/0 ./. 0/0 0/1
99-
1 92 1:92:T:GTGG GTGG T . . PR GT ./. 0/0 0/0 0/1 0/0 0/1 0/1 0/1 0/0 0/0
98+
1 91 1:91:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 0/1 1/1 1/1 1/1 ./. 1/1 0/1
99+
1 92 1:92:T:GTGG T GTGG . . PR GT ./. 1/1 1/1 0/1 1/1 0/1 0/1 0/1 1/1 1/1
100100
1 93 1:93:A:C A C . . PR GT 0/1 0/1 0/1 0/1 0/0 0/0 ./. ./. 0/0 0/0
101101
1 94 1:94:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/1 0/0 0/0 0/0 0/1 0/1
102102
1 95 1:95:A:C A C . . PR GT 0/1 0/0 0/1 0/1 0/0 0/0 0/0 0/1 0/1 0/0
103103
1 96 1:96:A:C A C . . PR GT 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 0/0 ./.
104-
1 97 1:97:T:GTGG GTGG T . . PR GT 0/0 0/0 0/0 ./. 0/0 0/0 0/0 0/0 ./. 0/0
105-
1 98 1:98:ACT:G G ACT . . PR GT 0/0 0/1 1/1 1/1 0/0 0/1 0/1 0/1 0/1 0/0
106-
1 99 1:99:T:GTGG GTGG T . . PR GT 0/1 0/0 0/0 ./. 0/0 0/0 0/1 0/0 0/1 0/0
104+
1 97 1:97:T:GTGG T GTGG . . PR GT 1/1 1/1 1/1 ./. 1/1 1/1 1/1 1/1 ./. 1/1
105+
1 98 1:98:ACT:G ACT G . . PR GT 1/1 0/1 0/0 0/0 1/1 0/1 0/1 0/1 0/1 1/1
106+
1 99 1:99:T:GTGG T GTGG . . PR GT 0/1 1/1 1/1 ./. 1/1 1/1 0/1 1/1 0/1 1/1
107107
1 100 1:100:A:C A C . . PR GT 0/1 1/1 0/0 0/1 0/1 ./. ./. 0/0 0/0 0/0

tests/test_plink.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pytest
77
import sgkit as sg
88
import xarray.testing as xt
9+
import zarr
910

1011
from bio2zarr import plink, vcf
1112

@@ -69,8 +70,8 @@ def test_genotypes(self, ds):
6970
nt.assert_array_equal(
7071
call_genotype,
7172
[
72-
[[0, 0], [1, 0], [1, 1]],
73-
[[1, 0], [0, 0], [1, 1]],
73+
[[0, 0], [0, 1], [1, 1]],
74+
[[0, 1], [0, 0], [1, 1]],
7475
[[0, 0], [0, 0], [0, 0]],
7576
[[-1, -1], [0, 0], [0, 0]],
7677
[[1, 1], [0, 0], [0, 0]],
@@ -134,7 +135,7 @@ def test_variant_allele(self, ds):
134135
nt.assert_array_equal(ds.variant_allele, [["GG", "A"], ["C", "TTT"]])
135136

136137
def test_variant_length(self, ds):
137-
nt.assert_array_equal(ds.variant_length, [2, 3])
138+
nt.assert_array_equal(ds.variant_length, [2, 1])
138139

139140
def test_contig_id(self, ds):
140141
"""Test that contig identifiers are correctly extracted and stored."""
@@ -207,8 +208,8 @@ def test_genotypes(self, ds):
207208
)
208209
expected = np.array(
209210
[
210-
[1, 0],
211-
[1, 0],
211+
[0, 1],
212+
[0, 1],
212213
[1, 1],
213214
[1, 1],
214215
[-1, -1],
@@ -276,6 +277,37 @@ def test_chunk_size(
276277
# TODO check array chunks
277278

278279

280+
def validate(bed_path, zarr_path):
281+
root = zarr.open(store=zarr_path, mode="r")
282+
call_genotype = root["call_genotype"][:]
283+
284+
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=True, num_threads=1)
285+
286+
assert call_genotype.shape[0] == bed.sid_count
287+
assert call_genotype.shape[1] == bed.iid_count
288+
bed_genotypes = bed.read(dtype="int8").T
289+
assert call_genotype.shape[0] == bed_genotypes.shape[0]
290+
assert call_genotype.shape[1] == bed_genotypes.shape[1]
291+
assert call_genotype.shape[2] == 2
292+
293+
row_id = 0
294+
for bed_row, zarr_row in zip(bed_genotypes, call_genotype):
295+
# print("ROW", row_id)
296+
# print(bed_row, zarr_row)
297+
row_id += 1
298+
for bed_call, zarr_call in zip(bed_row, zarr_row):
299+
if bed_call == -127:
300+
assert list(zarr_call) == [-1, -1]
301+
elif bed_call == 0:
302+
assert list(zarr_call) == [0, 0]
303+
elif bed_call == 1:
304+
assert list(zarr_call) == [0, 1]
305+
elif bed_call == 2:
306+
assert list(zarr_call) == [1, 1]
307+
else: # pragma no cover
308+
raise AssertionError(f"Unexpected bed call {bed_call}")
309+
310+
279311
@pytest.mark.parametrize(
280312
("variants_chunk_size", "samples_chunk_size"),
281313
[
@@ -299,7 +331,7 @@ def test_by_validating(
299331
samples_chunk_size=samples_chunk_size,
300332
worker_processes=worker_processes,
301333
)
302-
plink.validate(path, out)
334+
validate(path, out)
303335

304336

305337
class TestMultipleContigs:
@@ -379,11 +411,11 @@ def test_genotypes(self, ds):
379411
nt.assert_array_equal(
380412
call_genotype,
381413
[
382-
[[0, 0], [1, 0], [1, 1], [0, 0]], # chr1
383-
[[1, 0], [0, 0], [1, 1], [1, 0]], # chr1
414+
[[0, 0], [0, 1], [1, 1], [0, 0]], # chr1
415+
[[0, 1], [0, 0], [1, 1], [0, 1]], # chr1
384416
[[0, 0], [0, 0], [0, 0], [1, 1]], # chr2
385-
[[1, 1], [0, 0], [1, 0], [0, 0]], # chrX
386-
[[1, 0], [1, 0], [1, 0], [1, 0]], # chrX
417+
[[1, 1], [0, 0], [0, 1], [0, 0]], # chrX
418+
[[0, 1], [0, 1], [0, 1], [0, 1]], # chrX
387419
[[0, 0], [1, 1], [0, 0], [1, 1]], # chrY
388420
],
389421
)
@@ -403,7 +435,7 @@ def test_variant_length(self, ds):
403435
[
404436
"tests/data/plink/example",
405437
"tests/data/plink/example_with_fam",
406-
# "tests/data/plink/plink_sim_10s_100v_10pmiss",
438+
"tests/data/plink/plink_sim_10s_100v_10pmiss",
407439
],
408440
)
409441
def test_against_plinks_vcf_output(prefix, tmp_path):

0 commit comments

Comments
 (0)