1
1
import dataclasses
2
2
import logging
3
- import os
4
3
import pathlib
5
- import warnings
6
4
7
5
import numpy as np
8
6
import pandas as pd
@@ -56,54 +54,21 @@ class PlinkPaths:
56
54
fam_path : str
57
55
58
56
59
- @dataclasses .dataclass
60
- class FamData :
61
- sid : np .ndarray
62
- sid_count : int
63
-
64
-
65
- class PlinkFormat (vcz .Source ):
66
- def __init__ (self , prefix ):
67
- # TODO we will need support multiple chromosomes here to join
68
- # plinks into on big zarr. So, these will require multiple
69
- # bed and bim files, but should share a .fam
70
- self .prefix = str (prefix )
71
- self .paths = PlinkPaths (
72
- self .prefix + ".bed" ,
73
- self .prefix + ".bim" ,
74
- self .prefix + ".fam" ,
75
- )
76
-
77
- self .bim = read_bim (self .paths .bim_path )
78
- self .fam = read_fam (self .paths .fam_path )
79
-
80
- self ._num_records = self .bim .shape [0 ]
81
- self ._num_samples = self .fam .shape [0 ]
82
-
83
- # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
84
- self .bytes_per_snp = (self ._num_samples + 3 ) // 4
57
+ class BedReader :
58
+ def __init__ (self , path , num_variants , num_samples ):
59
+ self .num_variants = num_variants
60
+ self .num_samples = num_samples
61
+ self .path = path
62
+ # bytes per variant: 1 byte per 4 samples, rounded up
63
+ self .bytes_per_variant = (self .num_samples + 3 ) // 4
85
64
86
- # Verify BED file has correct magic bytes
87
- with open (self .paths .bed_path , "rb" ) as f :
65
+ with open (self .path , "rb" ) as f :
88
66
magic = f .read (3 )
89
- assert magic == b"\x6c \x1b \x01 " , "Invalid BED file format"
90
-
91
- expected_size = self .num_records * self .bytes_per_snp + 3 # +3 for magic bytes
92
- actual_size = os .path .getsize (self .paths .bed_path )
93
- if actual_size < expected_size :
94
- raise ValueError (
95
- f"BED file is truncated: expected at least { expected_size } bytes, "
96
- f"but only found { actual_size } bytes. "
97
- f"Check that .bed, .bim, and .fam files match."
98
- )
99
- elif actual_size > expected_size :
100
- # Warn if there's extra data (might indicate file mismatch)
101
- warnings .warn (
102
- f"BED file contains { actual_size } bytes but only expected "
103
- f"{ expected_size } . "
104
- f"Using first { expected_size } bytes only." ,
105
- stacklevel = 1 ,
106
- )
67
+ if magic != b"\x6c \x1b \x01 " :
68
+ raise ValueError ("Invalid BED file magic bytes" )
69
+
70
+ # We could check the size of the bed file here, but that would
71
+ # mean we can't work with streams.
107
72
108
73
# Initialize the lookup table with shape (256, 4, 2)
109
74
# 256 possible byte values, 4 samples per byte, 2 alleles per sample
@@ -127,6 +92,56 @@ def __init__(self, prefix):
127
92
128
93
self .byte_lookup = lookup
129
94
95
+ def decode (self , start , stop ):
96
+ chunk_size = stop - start
97
+
98
+ # Calculate file offsets for the required data
99
+ # 3 bytes for the magic number at the beginning of the file
100
+ start_offset = 3 + (start * self .bytes_per_variant )
101
+ bytes_to_read = chunk_size * self .bytes_per_variant
102
+
103
+ # TODO make it possible to read sequentially from the same file handle,
104
+ # seeking only when necessary.
105
+ with open (self .path , "rb" ) as f :
106
+ f .seek (start_offset )
107
+ chunk_data = f .read (bytes_to_read )
108
+
109
+ data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
110
+ data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_variant )
111
+
112
+ # Apply lookup table to get genotypes
113
+ # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
114
+ all_genotypes = self .byte_lookup [data_matrix ]
115
+
116
+ # Reshape to get all samples in one dimension
117
+ # (chunk_size, bytes_per_variant*4, 2)
118
+ samples_padded = self .bytes_per_variant * 4
119
+ genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
120
+
121
+ return genotypes_reshaped [:, : self .num_samples ]
122
+
123
+
124
+ class PlinkFormat (vcz .Source ):
125
+ def __init__ (self , prefix ):
126
+ # TODO we will need support multiple chromosomes here to join
127
+ # plinks into on big zarr. So, these will require multiple
128
+ # bed and bim files, but should share a .fam
129
+ self .prefix = str (prefix )
130
+ self .paths = PlinkPaths (
131
+ self .prefix + ".bed" ,
132
+ self .prefix + ".bim" ,
133
+ self .prefix + ".fam" ,
134
+ )
135
+
136
+ self .bim = read_bim (self .paths .bim_path )
137
+ self .fam = read_fam (self .paths .fam_path )
138
+
139
+ self ._num_records = self .bim .shape [0 ]
140
+ self ._num_samples = self .fam .shape [0 ]
141
+ self .bed_reader = BedReader (
142
+ self .paths .bed_path , self .num_records , self .num_samples
143
+ )
144
+
130
145
@property
131
146
def path (self ):
132
147
return self .prefix
@@ -163,33 +178,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
163
178
alt_field = self .bim .allele_1 .values
164
179
ref_field = self .bim .allele_2 .values
165
180
166
- chunk_size = stop - start
167
-
168
- # Calculate file offsets for the required data
169
- # 3 bytes for the magic number at the beginning of the file
170
- start_offset = 3 + (start * self .bytes_per_snp )
171
- bytes_to_read = chunk_size * self .bytes_per_snp
172
-
173
- # Read only the needed portion of the BED file
174
- with open (self .paths .bed_path , "rb" ) as f :
175
- f .seek (start_offset )
176
- chunk_data = f .read (bytes_to_read )
177
-
178
- data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
179
- data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_snp )
180
-
181
- # Apply lookup table to get genotypes
182
- # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
183
- all_genotypes = self .byte_lookup [data_matrix ]
184
-
185
- # Reshape to get all samples in one dimension
186
- # (chunk_size, bytes_per_snp*4, 2)
187
- samples_padded = self .bytes_per_snp * 4
188
- genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
189
-
190
- gt = genotypes_reshaped [:, : self ._num_samples ]
181
+ gt = self .bed_reader .decode (start , stop )
191
182
192
- phased = np .zeros (( chunk_size , self . _num_samples ) , dtype = bool )
183
+ phased = np .zeros (gt . shape [: 2 ] , dtype = bool )
193
184
194
185
for i , (ref , alt ) in enumerate (
195
186
zip (ref_field [start :stop ], alt_field [start :stop ])
0 commit comments