1
1
import dataclasses
2
2
import logging
3
+ import os
3
4
import pathlib
5
+ import warnings
4
6
5
7
import numpy as np
6
8
@@ -16,87 +18,207 @@ class PlinkPaths:
16
18
fam_path : str
17
19
18
20
21
+ @dataclasses .dataclass
22
+ class FamData :
23
+ sid : np .ndarray
24
+ sid_count : int
25
+
26
+
27
+ @dataclasses .dataclass
28
+ class BimData :
29
+ chromosome : np .ndarray
30
+ vid : np .ndarray
31
+ bp_position : np .ndarray
32
+ allele_1 : np .ndarray
33
+ allele_2 : np .ndarray
34
+ vid_count : int
35
+
36
+
19
37
class PlinkFormat (vcz .Source ):
20
- @core .requires_optional_dependency ("bed_reader" , "plink" )
21
38
def __init__ (self , prefix ):
22
- import bed_reader
23
-
24
39
# TODO we will need support multiple chromosomes here to join
25
40
# plinks into on big zarr. So, these will require multiple
26
41
# bed and bim files, but should share a .fam
27
42
self .prefix = str (prefix )
28
- paths = PlinkPaths (
43
+ self . paths = PlinkPaths (
29
44
self .prefix + ".bed" ,
30
45
self .prefix + ".bim" ,
31
46
self .prefix + ".fam" ,
32
47
)
33
- self .bed = bed_reader .open_bed (
34
- paths .bed_path ,
35
- bim_location = paths .bim_path ,
36
- fam_location = paths .fam_path ,
37
- num_threads = 1 ,
38
- count_A1 = True ,
48
+
49
+ # Read sample information from .fam file
50
+ samples = []
51
+ with open (self .paths .fam_path ) as f :
52
+ for line in f :
53
+ fields = line .strip ().split ()
54
+ if len (fields ) >= 2 : # At minimum, we need FID and IID
55
+ samples .append (fields [1 ])
56
+ self .fam = FamData (sid = np .array (samples ), sid_count = len (samples ))
57
+ self .n_samples = len (samples )
58
+
59
+ # Read variant information from .bim file
60
+ chromosomes = []
61
+ vids = []
62
+ positions = []
63
+ allele1 = []
64
+ allele2 = []
65
+
66
+ with open (self .paths .bim_path ) as f :
67
+ for line in f :
68
+ fields = line .strip ().split ()
69
+ if len (fields ) >= 6 :
70
+ chrom , vid , _ , pos , a1 , a2 = (
71
+ fields [0 ],
72
+ fields [1 ],
73
+ fields [2 ],
74
+ fields [3 ],
75
+ fields [4 ],
76
+ fields [5 ],
77
+ )
78
+ chromosomes .append (chrom )
79
+ vids .append (vid )
80
+ positions .append (int (pos ))
81
+ allele1 .append (a1 )
82
+ allele2 .append (a2 )
83
+
84
+ self .bim = BimData (
85
+ chromosome = np .array (chromosomes ),
86
+ vid = np .array (vids ),
87
+ bp_position = np .array (positions ),
88
+ allele_1 = np .array (allele1 ),
89
+ allele_2 = np .array (allele2 ),
90
+ vid_count = len (vids ),
39
91
)
92
+ self .n_variants = len (vids )
93
+
94
+ # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
95
+ self .bytes_per_snp = (self .n_samples + 3 ) // 4
96
+
97
+ # Verify BED file has correct magic bytes
98
+ with open (self .paths .bed_path , "rb" ) as f :
99
+ magic = f .read (3 )
100
+ assert magic == b"\x6c \x1b \x01 " , "Invalid BED file format"
101
+
102
+ expected_size = self .n_variants * self .bytes_per_snp + 3 # +3 for magic bytes
103
+ actual_size = os .path .getsize (self .paths .bed_path )
104
+ if actual_size < expected_size :
105
+ raise ValueError (
106
+ f"BED file is truncated: expected at least { expected_size } bytes, "
107
+ f"but only found { actual_size } bytes. "
108
+ f"Check that .bed, .bim, and .fam files match."
109
+ )
110
+ elif actual_size > expected_size :
111
+ # Warn if there's extra data (might indicate file mismatch)
112
+ warnings .warn (
113
+ f"BED file contains { actual_size } bytes but only expected "
114
+ f"{ expected_size } . "
115
+ f"Using first { expected_size } bytes only." ,
116
+ stacklevel = 1 ,
117
+ )
118
+
119
+ # Initialize the lookup table with shape (256, 4, 2)
120
+ # 256 possible byte values, 4 samples per byte, 2 alleles per sample
121
+ lookup = np .zeros ((256 , 4 , 2 ), dtype = np .int8 )
122
+
123
+ # For each possible byte value (0-255)
124
+ for byte in range (256 ):
125
+ # For each of the 4 samples encoded in this byte
126
+ for sample in range (4 ):
127
+ # Extract the 2 bits for this sample
128
+ bits = (byte >> (sample * 2 )) & 0b11
129
+ # Convert PLINK's bit encoding to genotype values
130
+ if bits == 0b00 :
131
+ lookup [byte , sample ] = [1 , 1 ]
132
+ elif bits == 0b01 :
133
+ lookup [byte , sample ] = [- 1 , - 1 ]
134
+ elif bits == 0b10 :
135
+ lookup [byte , sample ] = [0 , 1 ]
136
+ elif bits == 0b11 :
137
+ lookup [byte , sample ] = [0 , 0 ]
138
+
139
+ self .byte_lookup = lookup
40
140
41
141
@property
42
142
def path (self ):
43
143
return self .prefix
44
144
45
145
@property
46
146
def num_records (self ):
47
- return self .bed . sid_count
147
+ return self .bim . vid_count
48
148
49
149
@property
50
150
def samples (self ):
51
- return [vcz .Sample (id = sample ) for sample in self .bed . iid ]
151
+ return [vcz .Sample (id = sample ) for sample in self .fam . sid ]
52
152
53
153
@property
54
154
def contigs (self ):
55
- return [vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bed .chromosome )]
155
+ return [vcz .Contig (id = str (chrom )) for chrom in np .unique (self .bim .chromosome )]
56
156
57
157
@property
58
158
def num_samples (self ):
59
159
return len (self .samples )
60
160
61
161
def iter_contig (self , start , stop ):
62
162
chrom_to_contig_index = {contig .id : i for i , contig in enumerate (self .contigs )}
63
- for chrom in self .bed .chromosome [start :stop ]:
163
+ for chrom in self .bim .chromosome [start :stop ]:
64
164
yield chrom_to_contig_index [str (chrom )]
65
165
66
166
def iter_field (self , field_name , shape , start , stop ):
67
167
assert field_name == "position" # Only position field is supported from plink
68
- yield from self .bed .bp_position [start :stop ]
168
+ yield from self .bim .bp_position [start :stop ]
69
169
70
170
def iter_id (self , start , stop ):
71
- yield from self .bed . sid [start :stop ]
171
+ yield from self .bim . vid [start :stop ]
72
172
73
173
def iter_alleles_and_genotypes (self , start , stop , shape , num_alleles ):
74
- alt_field = self .bed .allele_1
75
- ref_field = self .bed .allele_2
76
- bed_chunk = self .bed .read (slice (start , stop ), dtype = np .int8 ).T
77
- gt = np .zeros (shape , dtype = np .int8 )
78
- phased = np .zeros (shape [:- 1 ], dtype = bool )
174
+ alt_field = self .bim .allele_1
175
+ ref_field = self .bim .allele_2
176
+
177
+ chunk_size = stop - start
178
+
179
+ # Calculate file offsets for the required data
180
+ # 3 bytes for the magic number at the beginning of the file
181
+ start_offset = 3 + (start * self .bytes_per_snp )
182
+ bytes_to_read = chunk_size * self .bytes_per_snp
183
+
184
+ # Read only the needed portion of the BED file
185
+ with open (self .paths .bed_path , "rb" ) as f :
186
+ f .seek (start_offset )
187
+ chunk_data = f .read (bytes_to_read )
188
+
189
+ data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
190
+ data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_snp )
191
+
192
+ # Apply lookup table to get genotypes
193
+ # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
194
+ all_genotypes = self .byte_lookup [data_matrix ]
195
+
196
+ # Reshape to get all samples in one dimension
197
+ # (chunk_size, bytes_per_snp*4, 2)
198
+ samples_padded = self .bytes_per_snp * 4
199
+ genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
200
+
201
+ gt = genotypes_reshaped [:, : self .n_samples ]
202
+
203
+ phased = np .zeros ((chunk_size , self .n_samples ), dtype = bool )
204
+
79
205
for i , (ref , alt ) in enumerate (
80
206
zip (ref_field [start :stop ], alt_field [start :stop ])
81
207
):
82
208
alleles = np .full (num_alleles , constants .STR_FILL , dtype = "O" )
83
209
alleles [0 ] = ref
84
210
alleles [1 : 1 + len (alt )] = alt
85
- gt [:] = 0
86
- gt [bed_chunk [i ] == - 127 ] = - 1
87
- gt [bed_chunk [i ] == 2 ] = 1
88
- gt [bed_chunk [i ] == 1 , 1 ] = 1
89
211
90
212
# rlen is the length of the REF in PLINK as there's no END annotations
91
- yield vcz .VariantData (len (alleles [0 ]), alleles , gt , phased )
213
+ yield vcz .VariantData (len (alleles [0 ]), alleles , gt [ i ] , phased [ i ] )
92
214
93
215
def generate_schema (
94
216
self ,
95
217
variants_chunk_size = None ,
96
218
samples_chunk_size = None ,
97
219
):
98
- n = self .bed . iid_count
99
- m = self .bed . sid_count
220
+ n = self .fam . sid_count
221
+ m = self .bim . vid_count
100
222
logging .info (f"Scanned plink with { n } samples and { m } variants" )
101
223
dimensions = vcz .standard_dimensions (
102
224
variants_size = m ,
@@ -119,7 +241,7 @@ def generate_schema(
119
241
)
120
242
# If we don't have SVLEN or END annotations, the rlen field is defined
121
243
# as the length of the REF
122
- max_len = self .bed .allele_2 .itemsize
244
+ max_len = self .bim .allele_2 .itemsize
123
245
124
246
array_specs = [
125
247
vcz .ZarrArraySpec (
@@ -156,7 +278,7 @@ def generate_schema(
156
278
),
157
279
vcz .ZarrArraySpec (
158
280
name = "variant_contig" ,
159
- dtype = core .min_int_dtype (0 , len (np .unique (self .bed .chromosome ))),
281
+ dtype = core .min_int_dtype (0 , len (np .unique (self .bim .chromosome ))),
160
282
dimensions = ["variants" ],
161
283
description = "Contig/chromosome index for each variant" ,
162
284
),
0 commit comments