11import dataclasses
22import logging
3- import os
43import pathlib
5- import warnings
64
75import numpy as np
86import pandas as pd
@@ -56,54 +54,21 @@ class PlinkPaths:
5654 fam_path : str
5755
5856
59- @dataclasses .dataclass
60- class FamData :
61- sid : np .ndarray
62- sid_count : int
63-
64-
65- class PlinkFormat (vcz .Source ):
66- def __init__ (self , prefix ):
67- # TODO we will need support multiple chromosomes here to join
68- # plinks into on big zarr. So, these will require multiple
69- # bed and bim files, but should share a .fam
70- self .prefix = str (prefix )
71- self .paths = PlinkPaths (
72- self .prefix + ".bed" ,
73- self .prefix + ".bim" ,
74- self .prefix + ".fam" ,
75- )
76-
77- self .bim = read_bim (self .paths .bim_path )
78- self .fam = read_fam (self .paths .fam_path )
79-
80- self ._num_records = self .bim .shape [0 ]
81- self ._num_samples = self .fam .shape [0 ]
82-
83- # Calculate bytes per SNP: 1 byte per 4 samples, rounded up
84- self .bytes_per_snp = (self ._num_samples + 3 ) // 4
57+ class BedReader :
58+ def __init__ (self , path , num_variants , num_samples ):
59+ self .num_variants = num_variants
60+ self .num_samples = num_samples
61+ self .path = path
62+ # bytes per variant: 1 byte per 4 samples, rounded up
63+ self .bytes_per_variant = (self .num_samples + 3 ) // 4
8564
86- # Verify BED file has correct magic bytes
87- with open (self .paths .bed_path , "rb" ) as f :
65+ with open (self .path , "rb" ) as f :
8866 magic = f .read (3 )
89- assert magic == b"\x6c \x1b \x01 " , "Invalid BED file format"
90-
91- expected_size = self .num_records * self .bytes_per_snp + 3 # +3 for magic bytes
92- actual_size = os .path .getsize (self .paths .bed_path )
93- if actual_size < expected_size :
94- raise ValueError (
95- f"BED file is truncated: expected at least { expected_size } bytes, "
96- f"but only found { actual_size } bytes. "
97- f"Check that .bed, .bim, and .fam files match."
98- )
99- elif actual_size > expected_size :
100- # Warn if there's extra data (might indicate file mismatch)
101- warnings .warn (
102- f"BED file contains { actual_size } bytes but only expected "
103- f"{ expected_size } . "
104- f"Using first { expected_size } bytes only." ,
105- stacklevel = 1 ,
106- )
67+ if magic != b"\x6c \x1b \x01 " :
68+ raise ValueError ("Invalid BED file magic bytes" )
69+
70+ # We could check the size of the bed file here, but that would
71+ # mean we can't work with streams.
10772
10873 # Initialize the lookup table with shape (256, 4, 2)
10974 # 256 possible byte values, 4 samples per byte, 2 alleles per sample
@@ -127,6 +92,56 @@ def __init__(self, prefix):
12792
12893 self .byte_lookup = lookup
12994
95+ def decode (self , start , stop ):
96+ chunk_size = stop - start
97+
98+ # Calculate file offsets for the required data
99+ # 3 bytes for the magic number at the beginning of the file
100+ start_offset = 3 + (start * self .bytes_per_variant )
101+ bytes_to_read = chunk_size * self .bytes_per_variant
102+
103+ # TODO make it possible to read sequentially from the same file handle,
104+ # seeking only when necessary.
105+ with open (self .path , "rb" ) as f :
106+ f .seek (start_offset )
107+ chunk_data = f .read (bytes_to_read )
108+
109+ data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
110+ data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_variant )
111+
112+ # Apply lookup table to get genotypes
113+ # Shape becomes: (chunk_size, bytes_per_variant, 4, 2)
114+ all_genotypes = self .byte_lookup [data_matrix ]
115+
116+ # Reshape to get all samples in one dimension
117+ # (chunk_size, bytes_per_variant*4, 2)
118+ samples_padded = self .bytes_per_variant * 4
119+ genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
120+
121+ return genotypes_reshaped [:, : self .num_samples ]
122+
123+
124+ class PlinkFormat (vcz .Source ):
125+ def __init__ (self , prefix ):
126+ # TODO we will need support multiple chromosomes here to join
127+ # plinks into on big zarr. So, these will require multiple
128+ # bed and bim files, but should share a .fam
129+ self .prefix = str (prefix )
130+ self .paths = PlinkPaths (
131+ self .prefix + ".bed" ,
132+ self .prefix + ".bim" ,
133+ self .prefix + ".fam" ,
134+ )
135+
136+ self .bim = read_bim (self .paths .bim_path )
137+ self .fam = read_fam (self .paths .fam_path )
138+
139+ self ._num_records = self .bim .shape [0 ]
140+ self ._num_samples = self .fam .shape [0 ]
141+ self .bed_reader = BedReader (
142+ self .paths .bed_path , self .num_records , self .num_samples
143+ )
144+
130145 @property
131146 def path (self ):
132147 return self .prefix
@@ -163,33 +178,9 @@ def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
163178 alt_field = self .bim .allele_1 .values
164179 ref_field = self .bim .allele_2 .values
165180
166- chunk_size = stop - start
167-
168- # Calculate file offsets for the required data
169- # 3 bytes for the magic number at the beginning of the file
170- start_offset = 3 + (start * self .bytes_per_snp )
171- bytes_to_read = chunk_size * self .bytes_per_snp
172-
173- # Read only the needed portion of the BED file
174- with open (self .paths .bed_path , "rb" ) as f :
175- f .seek (start_offset )
176- chunk_data = f .read (bytes_to_read )
177-
178- data_bytes = np .frombuffer (chunk_data , dtype = np .uint8 )
179- data_matrix = data_bytes .reshape (chunk_size , self .bytes_per_snp )
180-
181- # Apply lookup table to get genotypes
182- # Shape becomes: (chunk_size, bytes_per_snp, 4, 2)
183- all_genotypes = self .byte_lookup [data_matrix ]
184-
185- # Reshape to get all samples in one dimension
186- # (chunk_size, bytes_per_snp*4, 2)
187- samples_padded = self .bytes_per_snp * 4
188- genotypes_reshaped = all_genotypes .reshape (chunk_size , samples_padded , 2 )
189-
190- gt = genotypes_reshaped [:, : self ._num_samples ]
181+ gt = self .bed_reader .decode (start , stop )
191182
192- phased = np .zeros (( chunk_size , self . _num_samples ) , dtype = bool )
183+ phased = np .zeros (gt . shape [: 2 ] , dtype = bool )
193184
194185 for i , (ref , alt ) in enumerate (
195186 zip (ref_field [start :stop ], alt_field [start :stop ])
0 commit comments