@@ -96,6 +96,18 @@ def __init__(self, path, num_variants, num_samples):
96
96
97
97
self .byte_lookup = lookup
98
98
99
def iter_decode(self, start, stop, buffer_size=None):
    """
    Lazily yield the decoded entries for variants in ``[start, stop)``.

    The window is read in consecutive chunks through ``self.decode`` so
    that only roughly ``buffer_size`` bytes of encoded data are requested
    per read. When ``buffer_size`` is None it defaults to 10 MiB. At least
    one variant is always read per chunk, however small the budget is.
    """
    byte_budget = 10 * 1024 * 1024 if buffer_size is None else buffer_size
    # How many whole variants fit in the budget; never fewer than one so
    # the range step below stays positive.
    chunk_len = max(1, int(byte_budget / self.bytes_per_variant))
    for chunk_start in range(start, stop, chunk_len):
        # The final chunk is clipped so we never read past ``stop``.
        chunk_stop = min(chunk_start + chunk_len, stop)
        yield from self.decode(chunk_start, chunk_stop)
110
+
99
111
def decode (self , start , stop ):
100
112
chunk_size = stop - start
101
113
@@ -108,6 +120,7 @@ def decode(self, start, stop):
108
120
f"Reading { chunk_size } variants ({ bytes_to_read } bytes) "
109
121
f"from { self .path } "
110
122
)
123
+
111
124
# TODO make it possible to read sequentially from the same file handle,
112
125
# seeking only when necessary.
113
126
with open (self .path , "rb" ) as f :
@@ -181,19 +194,16 @@ def iter_id(self, start, stop):
181
194
yield from self .bim .variant_id [start :stop ]
182
195
183
196
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
    """
    Yield one ``vcz.VariantData`` per variant in ``[start, stop)``.

    Alleles come from the BIM table (``allele_2`` is used as REF and
    ``allele_1`` as ALT) and genotypes are streamed from the BED reader
    via ``iter_decode`` rather than decoded in one go. ``shape`` is
    accepted for interface compatibility but is not used here.
    """
    bim = self.bim
    alt_values = bim.allele_1.values[start:stop]
    ref_values = bim.allele_2.values[start:stop]
    genotype_rows = self.bed_reader.iter_decode(start, stop)
    for alt_allele, ref_allele, genotypes in zip(
        alt_values, ref_values, genotype_rows
    ):
        allele_row = np.full(num_alleles, constants.STR_FILL, dtype="O")
        allele_row[0] = ref_allele
        allele_row[1 : 1 + len(alt_allele)] = alt_allele
        # Every call is reported as unphased: one False per entry along
        # the first axis of this variant's genotypes (presumably one per
        # sample -- confirm against the decoder's output shape).
        unphased = np.zeros(genotypes.shape[0], dtype=bool)
        # rlen is the length of the REF in PLINK as there's no END annotations
        rlen = len(allele_row[0])
        yield vcz.VariantData(rlen, allele_row, genotypes, unphased)
197
207
198
208
def generate_schema (
199
209
self ,
0 commit comments