@@ -96,6 +96,18 @@ def __init__(self, path, num_variants, num_samples):
9696
9797 self .byte_lookup = lookup
9898
def iter_decode(self, start, stop, buffer_size=None):
    """
    Iterate over the variants in the window ``[start, stop)``, decoding
    them in chunks of approximately ``buffer_size`` bytes.

    The window is split into consecutive sub-ranges, each of which is
    passed to ``self.decode``; the items of every decoded chunk are then
    yielded one at a time, in order.

    :param start: Index of the first variant to decode.
    :param stop: Index one past the last variant to decode. Nothing is
        yielded when ``stop <= start``.
    :param buffer_size: Approximate number of bytes to decode per chunk
        (default=10MiB). At least one variant is always decoded per
        chunk, however small the buffer is.
    :return: A generator over the items of the decoded chunks
        (presumably one genotype array per variant -- depends on what
        ``self.decode`` returns).
    """
    if buffer_size is None:
        buffer_size = 10 * 1024 * 1024
    # Floor division keeps the arithmetic exact for integer sizes (no
    # round-trip through a float); int() still accepts a float buffer_size.
    variants_per_read = max(1, int(buffer_size // self.bytes_per_variant))
    for chunk_start in range(start, stop, variants_per_read):
        # Clamp the final chunk so we never decode past the window.
        chunk_stop = min(chunk_start + variants_per_read, stop)
        yield from self.decode(chunk_start, chunk_stop)
110+
99111 def decode (self , start , stop ):
100112 chunk_size = stop - start
101113
@@ -108,6 +120,7 @@ def decode(self, start, stop):
108120 f"Reading { chunk_size } variants ({ bytes_to_read } bytes) "
109121 f"from { self .path } "
110122 )
123+
111124 # TODO make it possible to read sequentially from the same file handle,
112125 # seeking only when necessary.
113126 with open (self .path , "rb" ) as f :
@@ -181,19 +194,16 @@ def iter_id(self, start, stop):
181194 yield from self .bim .variant_id [start :stop ]
182195
def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
    """
    Yield one ``vcz.VariantData`` for each variant in ``[start, stop)``.

    Genotypes are streamed from ``self.bed_reader.iter_decode`` rather
    than decoded for the whole window up front. Each yielded record
    carries the REF length, an object array of ``num_alleles`` alleles
    (padded with ``constants.STR_FILL``), the variant's genotypes and an
    all-False ``phased`` array.

    :param start: Index of the first variant.
    :param stop: Index one past the last variant.
    :param shape: Not used by this implementation.
    :param num_alleles: Length of the alleles array built per variant.
    """
    window = slice(start, stop)
    alt_alleles = self.bim.allele_1.values[window]
    ref_alleles = self.bim.allele_2.values[window]
    genotype_rows = self.bed_reader.iter_decode(start, stop)
    for alt, ref, genotypes in zip(alt_alleles, ref_alleles, genotype_rows):
        alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
        alleles[0] = ref
        # NOTE(review): the slice width is len(alt); presumably alt is a
        # single allele string and num_alleles is 2 -- confirm.
        alleles[1 : 1 + len(alt)] = alt
        # The phased array is all False, one entry per row of genotypes.
        phased = np.zeros(genotypes.shape[0], dtype=bool)
        # rlen is the length of the REF in PLINK as there's no END annotations
        rlen = len(alleles[0])
        yield vcz.VariantData(rlen, alleles, genotypes, phased)
197207
198208 def generate_schema (
199209 self ,
0 commit comments