Skip to content

Commit cc62564

Browse files
Separate out the BedReader code
1 parent b424a15 commit cc62564

File tree

2 files changed

+78
-74
lines changed

2 files changed

+78
-74
lines changed

bio2zarr/plink.py

Lines changed: 65 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import dataclasses
22
import logging
3-
import os
43
import pathlib
5-
import warnings
64

75
import numpy as np
86
import pandas as pd
@@ -56,54 +54,21 @@ class PlinkPaths:
5654
fam_path: str
5755

5856

59-
@dataclasses.dataclass
60-
class FamData:
61-
sid: np.ndarray
62-
sid_count: int
63-
64-
65-
class PlinkFormat(vcz.Source):
66-
def __init__(self, prefix):
67-
# TODO we will need support multiple chromosomes here to join
68-
# plinks into on big zarr. So, these will require multiple
69-
# bed and bim files, but should share a .fam
70-
self.prefix = str(prefix)
71-
self.paths = PlinkPaths(
72-
self.prefix + ".bed",
73-
self.prefix + ".bim",
74-
self.prefix + ".fam",
75-
)
76-
77-
self.bim = read_bim(self.paths.bim_path)
78-
self.fam = read_fam(self.paths.fam_path)
79-
80-
self._num_records = self.bim.shape[0]
81-
self._num_samples = self.fam.shape[0]
82-
83-
# Calculate bytes per SNP: 1 byte per 4 samples, rounded up
84-
self.bytes_per_snp = (self._num_samples + 3) // 4
57+
class BedReader:
58+
def __init__(self, path, num_variants, num_samples):
59+
self.num_variants = num_variants
60+
self.num_samples = num_samples
61+
self.path = path
62+
# bytes per variant: 1 byte per 4 samples, rounded up
63+
self.bytes_per_variant = (self.num_samples + 3) // 4
8564

86-
# Verify BED file has correct magic bytes
87-
with open(self.paths.bed_path, "rb") as f:
65+
with open(self.path, "rb") as f:
8866
magic = f.read(3)
89-
assert magic == b"\x6c\x1b\x01", "Invalid BED file format"
90-
91-
expected_size = self.num_records * self.bytes_per_snp + 3 # +3 for magic bytes
92-
actual_size = os.path.getsize(self.paths.bed_path)
93-
if actual_size < expected_size:
94-
raise ValueError(
95-
f"BED file is truncated: expected at least {expected_size} bytes, "
96-
f"but only found {actual_size} bytes. "
97-
f"Check that .bed, .bim, and .fam files match."
98-
)
99-
elif actual_size > expected_size:
100-
# Warn if there's extra data (might indicate file mismatch)
101-
warnings.warn(
102-
f"BED file contains {actual_size} bytes but only expected "
103-
f"{expected_size}. "
104-
f"Using first {expected_size} bytes only.",
105-
stacklevel=1,
106-
)
67+
if magic != b"\x6c\x1b\x01":
68+
raise ValueError("Invalid BED file magic bytes")
69+
70+
# We could check the size of the bed file here, but that would
71+
# mean we can't work with streams.
10772

10873
# Initialize the lookup table with shape (256, 4, 2)
10974
# 256 possible byte values, 4 samples per byte, 2 alleles per sample
@@ -127,6 +92,56 @@ def __init__(self, prefix):
12792

12893
self.byte_lookup = lookup
12994

95+
def decode(self, start, stop):
96+
chunk_size = stop - start
97+
98+
# Calculate file offsets for the required data
99+
# 3 bytes for the magic number at the beginning of the file
100+
start_offset = 3 + (start * self.bytes_per_variant)
101+
bytes_to_read = chunk_size * self.bytes_per_variant
102+
103+
# TODO make it possible to read sequentially from the same file handle,
104+
# seeking only when necessary.
105+
with open(self.path, "rb") as f:
106+
f.seek(start_offset)
107+
chunk_data = f.read(bytes_to_read)
108+
109+
data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
110+
data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_variant)
111+
112+
# Apply lookup table to get genotypes
113+
# Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
114+
all_genotypes = self.byte_lookup[data_matrix]
115+
116+
# Reshape to get all samples in one dimension
117+
# (chunk_size, bytes_per_variant*4, 2)
118+
samples_padded = self.bytes_per_variant * 4
119+
genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
120+
121+
return genotypes_reshaped[:, : self.num_samples]
122+
123+
124+
class PlinkFormat(vcz.Source):
125+
def __init__(self, prefix):
126+
# TODO we will need support multiple chromosomes here to join
127+
# plinks into on big zarr. So, these will require multiple
128+
# bed and bim files, but should share a .fam
129+
self.prefix = str(prefix)
130+
self.paths = PlinkPaths(
131+
self.prefix + ".bed",
132+
self.prefix + ".bim",
133+
self.prefix + ".fam",
134+
)
135+
136+
self.bim = read_bim(self.paths.bim_path)
137+
self.fam = read_fam(self.paths.fam_path)
138+
139+
self._num_records = self.bim.shape[0]
140+
self._num_samples = self.fam.shape[0]
141+
self.bed_reader = BedReader(
142+
self.paths.bed_path, self.num_records, self.num_samples
143+
)
144+
130145
@property
131146
def path(self):
132147
return self.prefix
@@ -163,33 +178,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
163178
alt_field = self.bim.allele_1.values
164179
ref_field = self.bim.allele_2.values
165180

166-
chunk_size = stop - start
167-
168-
# Calculate file offsets for the required data
169-
# 3 bytes for the magic number at the beginning of the file
170-
start_offset = 3 + (start * self.bytes_per_snp)
171-
bytes_to_read = chunk_size * self.bytes_per_snp
172-
173-
# Read only the needed portion of the BED file
174-
with open(self.paths.bed_path, "rb") as f:
175-
f.seek(start_offset)
176-
chunk_data = f.read(bytes_to_read)
177-
178-
data_bytes = np.frombuffer(chunk_data, dtype=np.uint8)
179-
data_matrix = data_bytes.reshape(chunk_size, self.bytes_per_snp)
180-
181-
# Apply lookup table to get genotypes
182-
# Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
183-
all_genotypes = self.byte_lookup[data_matrix]
184-
185-
# Reshape to get all samples in one dimension
186-
# (chunk_size, bytes_per_snp*4, 2)
187-
samples_padded = self.bytes_per_snp * 4
188-
genotypes_reshaped = all_genotypes.reshape(chunk_size, samples_padded, 2)
189-
190-
gt = genotypes_reshaped[:, : self._num_samples]
181+
gt = self.bed_reader.decode(start, stop)
191182

192-
phased = np.zeros((chunk_size, self._num_samples), dtype=bool)
183+
phased = np.zeros(gt.shape[:2], dtype=bool)
193184

194185
for i, (ref, alt) in enumerate(
195186
zip(ref_field[start:stop], alt_field[start:stop])

tests/test_plink.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,19 @@ def test_example(self):
4949
nt.assert_array_equal(df["phenotype"].values, ["-9" for j in range(10)])
5050

5151

52+
class TestBedReader:
53+
@pytest.mark.parametrize(
54+
"path",
55+
[
56+
"tests/data/plink/example.bim",
57+
"tests/data/vcf/sample.vcf.gz",
58+
],
59+
)
60+
def test_bad_file_type(self, path):
61+
with pytest.raises(ValueError, match="Invalid BED file magic bytes"):
62+
plink.BedReader(path, 1, 1)
63+
64+
5265
class TestSmallExample:
5366
@pytest.fixture(scope="class")
5467
def bed_path(self, tmp_path_factory):

0 commit comments

Comments
 (0)