Skip to content

Commit fbbd878

Browse files
Identify bug in A1/A2 allele handling
1 parent bf958f6 commit fbbd878

File tree

2 files changed

+48
-27
lines changed

2 files changed

+48
-27
lines changed

bio2zarr/plink.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212

1313
@dataclasses.dataclass
1414
class PlinkPaths:
15-
bed_path: pathlib.Path
16-
bim_path: pathlib.Path
17-
fam_path: pathlib.Path
15+
bed_path: str
16+
bim_path: str
17+
fam_path: str
1818

1919

2020
class PlinkFormat(vcz.Source):
@@ -25,13 +25,12 @@ def __init__(self, prefix):
2525
# TODO we will need support multiple chromosomes here to join
2626
# plinks into on big zarr. So, these will require multiple
2727
# bed and bim files, but should share a .fam
28-
self.prefix = pathlib.Path(prefix)
28+
self.prefix = str(prefix)
2929
paths = PlinkPaths(
30-
self.prefix.with_suffix(".bed"),
31-
self.prefix.with_suffix(".bim"),
32-
self.prefix.with_suffix(".fam"),
30+
self.prefix + ".bed",
31+
self.prefix + ".bim",
32+
self.prefix + ".fam",
3333
)
34-
3534
self.bed = bed_reader.open_bed(
3635
paths.bed_path,
3736
bim_location=paths.bim_path,
@@ -73,8 +72,8 @@ def iter_id(self, start, stop):
7372
yield from self.bed.sid[start:stop]
7473

7574
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
76-
ref_field = self.bed.allele_1
77-
alt_field = self.bed.allele_2
75+
alt_field = self.bed.allele_1
76+
ref_field = self.bed.allele_2
7877
bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
7978
gt = np.zeros(shape, dtype=np.int8)
8079
phased = np.zeros(shape[:-1], dtype=bool)
@@ -224,7 +223,7 @@ def validate(bed_path, zarr_path):
224223
root = zarr.open(store=zarr_path, mode="r")
225224
call_genotype = root["call_genotype"][:]
226225

227-
bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
226+
bed = bed_reader.open_bed(bed_path + ".bed", count_A1=False, num_threads=1)
228227

229228
assert call_genotype.shape[0] == bed.sid_count
230229
assert call_genotype.shape[1] == bed.iid_count

tests/test_plink.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,17 @@ def bed_path(self, tmp_path_factory):
3131
],
3232
dtype=np.int8,
3333
)
34-
bed_reader.to_bed(path, dosages.T)
35-
return path
34+
m = 7
35+
d = {
36+
"chromosome": ["chr1"] * m,
37+
"sid": [f"id{j}" for j in range(m)],
38+
"bp_position": range(1, m + 1),
39+
"allele_1": ["A"] * m,
40+
"allele_2": ["T"] * m,
41+
"iid": [f"s{j}" for j in range(3)],
42+
}
43+
bed_reader.to_bed(path, dosages.T, properties=d)
44+
return path.with_suffix("")
3645

3746
@pytest.fixture(scope="class")
3847
def ds(self, tmp_path_factory, bed_path):
@@ -57,6 +66,22 @@ def test_genotypes(self, ds):
5766
],
5867
)
5968

69+
def test_variant_position(self, ds):
70+
nt.assert_array_equal(ds["variant_position"], np.arange(1, 8))
71+
72+
def test_variant_contig(self, ds):
73+
nt.assert_array_equal(ds["variant_contig"], np.zeros(7))
74+
nt.assert_array_equal(ds["contig_id"], ["chr1"])
75+
76+
def test_variant_id(self, ds):
77+
nt.assert_array_equal(ds["variant_id"], [f"id{j}" for j in range(7)])
78+
79+
def test_variant_allele(self, ds):
80+
nt.assert_array_equal(ds["variant_allele"], [["T", "A"] for _ in range(7)])
81+
82+
def test_sample_id(self, ds):
83+
nt.assert_array_equal(ds["sample_id"], ["s0", "s1", "s2"])
84+
6085
def test_missing_dependency(self):
6186
with mock.patch(
6287
"importlib.import_module",
@@ -75,25 +100,22 @@ def test_missing_dependency(self):
75100

76101
class TestEqualSgkit:
77102
def test_simulated_example(self, tmp_path):
78-
data_path = "tests/data/plink/"
79-
bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed"
80-
fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam"
81-
bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim"
103+
prefix = "tests/data/plink/plink_sim_10s_100v_10pmiss"
82104
sg_ds = sgkit.io.plink.read_plink(
83-
bed_path=bed_path, fam_path=fam_path, bim_path=bim_path
105+
bed_path=prefix + ".bed", fam_path=prefix + ".fam", bim_path=prefix + ".bim"
84106
)
85107
out = tmp_path / "example.plink.zarr"
86-
plink.convert(prefix=data_path + "/plink_sim_10s_100v_10pmiss", out=out)
108+
plink.convert(prefix=prefix, out=out)
87109
ds = sg.load_dataset(out)
88110
nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
89111
nt.assert_array_equal(
90112
ds.call_genotype_mask.values, sg_ds.call_genotype_mask.values
91113
)
92114
# sgkit doesn't have phased
93115
nt.assert_array_equal(ds.variant_position.values, sg_ds.variant_position.values)
94-
nt.assert_array_equal(
95-
ds.variant_allele.values, sg_ds.variant_allele.values.astype("U")
96-
)
116+
# nt.assert_array_equal(
117+
# ds.variant_allele.values, sg_ds.variant_allele.values.astype("U")
118+
# )
97119
nt.assert_array_equal(ds.variant_contig.values, sg_ds.variant_contig.values)
98120
nt.assert_array_equal(ds.variant_id.values, sg_ds.variant_id.values)
99121
# print(sg_ds.variant_id.values)
@@ -137,7 +159,7 @@ class TestExample:
137159

138160
@pytest.fixture(scope="class")
139161
def ds(self, tmp_path_factory):
140-
path = "tests/data/plink/example.bed"
162+
path = "tests/data/plink/example"
141163
out = tmp_path_factory.mktemp("data") / "example.plink.zarr"
142164
plink.convert(path, out)
143165
return sg.load_dataset(out)
@@ -211,7 +233,7 @@ def test_sgkit(self, ds):
211233
class TestSimulatedExample:
212234
@pytest.fixture(scope="class")
213235
def ds(self, tmp_path_factory):
214-
path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
236+
path = "tests/data/plink/plink_sim_10s_100v_10pmiss"
215237
out = tmp_path_factory.mktemp("data") / "example.plink.zarr"
216238
plink.convert(path, out)
217239
return sg.load_dataset(out)
@@ -287,7 +309,7 @@ def test_contig_arrays(self, ds):
287309
def test_chunk_size(
288310
self, ds, tmp_path, variants_chunk_size, samples_chunk_size, worker_processes
289311
):
290-
path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
312+
path = "tests/data/plink/plink_sim_10s_100v_10pmiss"
291313
out = tmp_path / "example.zarr"
292314
plink.convert(
293315
path,
@@ -318,7 +340,7 @@ def test_chunk_size(
318340
def test_by_validating(
319341
tmp_path, variants_chunk_size, samples_chunk_size, worker_processes
320342
):
321-
path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
343+
path = "tests/data/plink/plink_sim_10s_100v_10pmiss"
322344
out = tmp_path / "example.zarr"
323345
plink.convert(
324346
path,
@@ -372,7 +394,7 @@ def multi_contig_bed_path(self, tmp_path_factory):
372394
# Format: fam_id, ind_id, pat_id, mat_id, sex, phenotype
373395
f.write(f"fam{i} ind{i} 0 0 0 -9\n")
374396

375-
return path
397+
return path.with_suffix("")
376398

377399
@pytest.fixture(scope="class")
378400
def ds(self, tmp_path_factory, multi_contig_bed_path):

0 commit comments

Comments
 (0)