Skip to content

Commit a568103

Browse files
committed
Remove bed_reader
1 parent b87be4c commit a568103

File tree

2 files changed

+154
-48
lines changed

2 files changed

+154
-48
lines changed

bio2zarr/plink.py

Lines changed: 154 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import dataclasses
22
import logging
3+
import os
34
import pathlib
5+
import warnings
46

57
import numpy as np
68

@@ -16,87 +18,209 @@ class PlinkPaths:
1618
fam_path: str
1719

1820

21+
@dataclasses.dataclass
22+
class FamData:
23+
sid: np.ndarray
24+
sid_count: int
25+
26+
27+
@dataclasses.dataclass
28+
class BimData:
29+
chromosome: np.ndarray
30+
vid: np.ndarray
31+
bp_position: np.ndarray
32+
allele_1: np.ndarray
33+
allele_2: np.ndarray
34+
vid_count: int
35+
36+
1937
class PlinkFormat(vcz.Source):
20-
@core.requires_optional_dependency("bed_reader", "plink")
2138
def __init__(self, prefix):
22-
import bed_reader
23-
2439
# TODO we will need support multiple chromosomes here to join
2540
# plinks into on big zarr. So, these will require multiple
2641
# bed and bim files, but should share a .fam
2742
self.prefix = str(prefix)
28-
paths = PlinkPaths(
43+
self.paths = PlinkPaths(
2944
self.prefix + ".bed",
3045
self.prefix + ".bim",
3146
self.prefix + ".fam",
3247
)
33-
self.bed = bed_reader.open_bed(
34-
paths.bed_path,
35-
bim_location=paths.bim_path,
36-
fam_location=paths.fam_path,
37-
num_threads=1,
38-
count_A1=True,
48+
49+
# Read sample information from .fam file
50+
samples = []
51+
with open(self.paths.fam_path) as f:
52+
for line in f:
53+
fields = line.strip().split()
54+
if len(fields) >= 2: # At minimum, we need FID and IID
55+
samples.append(fields[1])
56+
self.fam = FamData(sid=np.array(samples), sid_count=len(samples))
57+
self.n_samples = len(samples)
58+
59+
# Read variant information from .bim file
60+
chromosomes = []
61+
vids = []
62+
positions = []
63+
allele1 = []
64+
allele2 = []
65+
66+
with open(self.paths.bim_path) as f:
67+
for line in f:
68+
fields = line.strip().split()
69+
if len(fields) >= 6:
70+
chrom, vid, _, pos, a1, a2 = (
71+
fields[0],
72+
fields[1],
73+
fields[2],
74+
fields[3],
75+
fields[4],
76+
fields[5],
77+
)
78+
chromosomes.append(chrom)
79+
vids.append(vid)
80+
positions.append(int(pos))
81+
allele1.append(a1)
82+
allele2.append(a2)
83+
84+
self.bim = BimData(
85+
chromosome=np.array(chromosomes),
86+
vid=np.array(vids),
87+
bp_position=np.array(positions),
88+
allele_1=np.array(allele1),
89+
allele_2=np.array(allele2),
90+
vid_count=len(vids),
3991
)
92+
self.n_variants = len(vids)
93+
94+
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95+
self.bytes_per_snp = (self.n_samples + 3) // 4
96+
97+
# Verify BED file has correct magic bytes
98+
with open(self.paths.bed_path, "rb") as f:
99+
magic = f.read(3)
100+
assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
101+
102+
expected_size = self.n_variants * self.bytes_per_snp + 3 # +3 for magic bytes
103+
actual_size = os.path.getsize(self.paths.bed_path)
104+
if actual_size < expected_size:
105+
raise ValueError(
106+
f"BED file is truncated: expected at least {expected_size} bytes, "
107+
f"but only found {actual_size} bytes. "
108+
f"Check that .bed, .bim, and .fam files match."
109+
)
110+
elif actual_size > expected_size:
111+
# Warn if there's extra data (might indicate file mismatch)
112+
warnings.warn(
113+
f"BED file contains {actual_size} bytes but only expected "
114+
f"{expected_size}. "
115+
f"Using first {expected_size} bytes only.",
116+
stacklevel=1,
117+
)
118+
119+
# Initialize the lookup table with shape (256, 4, 2)
120+
# 256 possible byte values, 4 samples per byte, 2 alleles per sample
121+
lookup = np.zeros((256, 4, 2), dtype=np.int8)
122+
123+
# For each possible byte value (0-255)
124+
for byte in range(256):
125+
# For each of the 4 samples encoded in this byte
126+
for sample in range(4):
127+
# Extract the 2 bits for this sample
128+
bits = (byte >> (sample * 2)) & 0b11
129+
# Convert PLINK's bit encoding to genotype values
130+
if bits == 0b00:
131+
lookup[byte, sample] = [1, 1]
132+
elif bits == 0b01:
133+
lookup[byte, sample] = [-1, -1]
134+
elif bits == 0b10:
135+
lookup[byte, sample] = [0, 1]
136+
elif bits == 0b11:
137+
lookup[byte, sample] = [0, 0]
138+
139+
self.byte_lookup = lookup
40140

41141
@property
42142
def path(self):
43143
return self.prefix
44144

45145
@property
46146
def num_records(self):
47-
return self.bed.sid_count
147+
return self.bim.vid_count
48148

49149
@property
50150
def samples(self):
51-
return [vcz.Sample(id=sample) for sample in self.bed.iid]
151+
return [vcz.Sample(id=sample) for sample in self.fam.sid]
52152

53153
@property
54154
def contigs(self):
55-
return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)]
155+
return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bim.chromosome)]
56156

57157
@property
58158
def num_samples(self):
59159
return len(self.samples)
60160

61161
def iter_contig(self, start, stop):
62162
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
63-
for chrom in self.bed.chromosome[start:stop]:
163+
for chrom in self.bim.chromosome[start:stop]:
64164
yield chrom_to_contig_index[str(chrom)]
65165

66166
def iter_field(self, field_name, shape, start, stop):
67167
assert field_name == "position" # Only position field is supported from plink
68-
yield from self.bed.bp_position[start:stop]
168+
yield from self.bim.bp_position[start:stop]
69169

70170
def iter_id(self, start, stop):
71-
yield from self.bed.sid[start:stop]
171+
yield from self.bim.vid[start:stop]
72172

73173
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
74-
alt_field = self.bed.allele_1
75-
ref_field = self.bed.allele_2
76-
bed_chunk = self.bed.read(slice(start, stop), dtype=np.int8).T
77-
gt = np.zeros(shape, dtype=np.int8)
78-
phased = np.zeros(shape[:-1], dtype=bool)
174+
alt_field = self.bim.allele_1
175+
ref_field = self.bim.allele_2
176+
177+
chunk_size = stop - start
178+
179+
# Calculate file offsets for the required data
180+
# 3 bytes for the magic number at the beginning of the file
181+
start_offset = 3 + (start * self.bytes_per_snp)
182+
bytes_to_read = chunk_size * self.bytes_per_snp
183+
184+
# Read only the needed portion of the BED file
185+
with open(self.paths.bed_path, "rb") as f:
186+
f.seek(start_offset)
187+
chunk_data = f.read(bytes_to_read)
188+
189+
data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
190+
data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp)
191+
192+
# Apply lookup table to get genotypes
193+
# Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
194+
all_genotypes = self.byte_lookup[data_matrix]
195+
196+
# Reshape to get all samples in one dimension
197+
# (chunk_size, bytes_per_snp*4, 2)
198+
samples_padded = self.bytes_per_snp * 4
199+
genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
200+
201+
gt = genotypes_reshaped[:, : self.n_samples]
202+
203+
phased = np.zeros((chunk_size, self.n_samples), dtype=bool)
204+
79205
for i, (ref, alt) in enumerate(
80206
zip(ref_field[start:stop], alt_field[start:stop])
81207
):
82208
alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
83209
alleles[0] = ref
84210
alleles[1 : 1 + len(alt)] = alt
85-
gt[:] = 0
86-
gt[bed_chunk[i] == -127] = -1
87-
gt[bed_chunk[i] == 2] = 1
88-
gt[bed_chunk[i] == 1, 1] = 1
89211

90212
# rlen is the length of the REF in PLINK as there's no END annotations
91-
yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
213+
yield vcz.VariantData(
214+
len(alleles[0]), alleles, gt[i].copy(), phased[i].copy()
215+
)
92216

93217
def generate_schema(
94218
self,
95219
variants_chunk_size=None,
96220
samples_chunk_size=None,
97221
):
98-
n = self.bed.iid_count
99-
m = self.bed.sid_count
222+
n = self.fam.sid_count
223+
m = self.bim.vid_count
100224
logging.info(f"Scanned plink with {n} samples and {m} variants")
101225
dimensions = vcz.standard_dimensions(
102226
variants_size=m,
@@ -119,7 +243,7 @@ def generate_schema(
119243
)
120244
# If we don't have SVLEN or END annotations, the rlen field is defined
121245
# as the length of the REF
122-
max_len = self.bed.allele_2.itemsize
246+
max_len = self.bim.allele_2.itemsize
123247

124248
array_specs = [
125249
vcz.ZarrArraySpec(
@@ -156,7 +280,7 @@ def generate_schema(
156280
),
157281
vcz.ZarrArraySpec(
158282
name="variant_contig",
159-
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
283+
dtype=core.min_int_dtype(0, len(np.unique(self.bim.chromosome))),
160284
dimensions=["variants"],
161285
description="Contig/chromosome index for each variant",
162286
),

tests/test_plink.py

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
from unittest import mock
2-
31
import bed_reader
42
import numpy as np
53
import numpy.testing as nt
@@ -11,22 +9,6 @@
119
from bio2zarr import plink, vcf
1210

1311

14-
def test_missing_dependency():
15-
with mock.patch(
16-
"importlib.import_module",
17-
side_effect=ImportError("No module named 'bed_reader'"),
18-
):
19-
with pytest.raises(ImportError) as exc_info:
20-
plink.convert(
21-
"UNUSED_PATH",
22-
"UNUSED_PATH",
23-
)
24-
assert (
25-
"This process requires the optional bed_reader module. "
26-
"Install it with: pip install bio2zarr[plink]" in str(exc_info.value)
27-
)
28-
29-
3012
class TestSmallExample:
3113
@pytest.fixture(scope="class")
3214
def bed_path(self, tmp_path_factory):

0 commit comments

Comments
 (0)