Add buffered iteration over genotypes in plink

jeromekelleher · jeromekelleher · commit b626dda889d0 · 2025-05-23T14:13:13.000Z
Previously, the code tried to read an entire partition into memory at once.
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -96,6 +96,18 @@ def __init__(self, path, num_variants, num_samples):
 
         self.byte_lookup = lookup
 
+    def iter_decode(self, start, stop, buffer_size=None):
+        """
+        Iterate over the variants in the specified window
+        with the specified approximate buffer size in bytes (default=10MiB).
+        """
+        if buffer_size is None:
+            buffer_size = 10 * 1024 * 1024
+        variants_per_read = max(1, int(buffer_size / self.bytes_per_variant))
+        for off in range(start, stop, variants_per_read):
+            genotypes = self.decode(off, min(off + variants_per_read, stop))
+            yield from genotypes
+
     def decode(self, start, stop):
         chunk_size = stop - start
 
@@ -108,6 +120,7 @@ def decode(self, start, stop):
             f"Reading {chunk_size} variants ({bytes_to_read} bytes) "
             f"from {self.path}"
         )
+
         # TODO make it possible to read sequentially from the same file handle,
         # seeking only when necessary.
         with open(self.path, "rb") as f:
@@ -181,19 +194,16 @@ def iter_id(self, start, stop):
         yield from self.bim.variant_id[start:stop]
 
     def iter_alleles_and_genotypes(self, start, stop, shape, num_alleles):
-        alt_field = self.bim.allele_1.values
-        ref_field = self.bim.allele_2.values
-        gt = self.bed_reader.decode(start, stop)
-        phased = np.zeros(gt.shape[:2], dtype=bool)
-        for i, (ref, alt) in enumerate(
-            zip(ref_field[start:stop], alt_field[start:stop])
-        ):
+        alt_iter = self.bim.allele_1.values[start:stop]
+        ref_iter = self.bim.allele_2.values[start:stop]
+        gt_iter = self.bed_reader.iter_decode(start, stop)
+        for alt, ref, gt in zip(alt_iter, ref_iter, gt_iter):
             alleles = np.full(num_alleles, constants.STR_FILL, dtype="O")
             alleles[0] = ref
             alleles[1 : 1 + len(alt)] = alt
-
+            phased = np.zeros(gt.shape[0], dtype=bool)
             # rlen is the length of the REF in PLINK as there's no END annotations
-            yield vcz.VariantData(len(alleles[0]), alleles, gt[i], phased[i])
+            yield vcz.VariantData(len(alleles[0]), alleles, gt, phased)
 
     def generate_schema(
         self,
diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -842,6 +842,7 @@ def encode_alleles_and_genotypes_partition(self, partition_index):
                 partition_index, "call_genotype_phased"
             )
             shape = gt.buff.shape[1:]
+
         for variant_data in self.source.iter_alleles_and_genotypes(
             partition.start, partition.stop, shape, alleles.array.shape[1]
         ):
diff --git a/tests/test_plink.py b/tests/test_plink.py
@@ -93,7 +93,6 @@ def test_generated_bed_files(self, tmp_path, num_variants, num_samples):
         data = np.arange(num_variants * num_samples, dtype=int) % 4
         data[data == 3] = -127
         data = data.reshape((num_variants, num_samples))
-
         bed_reader.to_bed(bed_file, data.T, num_threads=1)
 
         bytes_per_variant = (num_samples + 3) // 4
@@ -108,6 +107,31 @@ def test_generated_bed_files(self, tmp_path, num_variants, num_samples):
             for k in range(num_samples):
                 assert br_map[data[j, k]] == tuple(g[j, k])
 
+    @pytest.mark.parametrize(
+        ("num_variants", "num_samples"),
+        [
+            (1, 1),
+            (30, 3),
+            (300, 1000),
+        ],
+    )
+    @pytest.mark.parametrize("buffer_size", [0, 1, 3, 100, 100_000, None])
+    def test_iter_decode(self, tmp_path, buffer_size, num_variants, num_samples):
+        bed_file = tmp_path / "a_file.bed"
+        # Generate a regular pattern of all possible values
+        data = np.arange(num_variants * num_samples, dtype=int) % 4
+        data[data == 3] = -127
+        data = data.reshape((num_variants, num_samples))
+        bed_reader.to_bed(bed_file, data.T, num_threads=1)
+
+        reader = plink.BedReader(bed_file, num_variants, num_samples)
+        G = reader.decode(0, num_variants)
+        assert G.shape == (num_variants, num_samples, 2)
+
+        reader = plink.BedReader(bed_file, num_variants, num_samples)
+        G2 = list(reader.iter_decode(0, num_variants, buffer_size=buffer_size))
+        nt.assert_array_equal(G, G2)
+
 
 class TestSmallExample:
     @pytest.fixture(scope="class")

Original file line number	Diff line number	Diff line change
`@@ -842,6 +842,7 @@ def encode_alleles_and_genotypes_partition(self, partition_index):`
`842`	`842`	`partition_index, "call_genotype_phased"`
`843`	`843`	`)`
`844`	`844`	`shape = gt.buff.shape[1:]`
	`845`	`+`
`845`	`846`	`for variant_data in self.source.iter_alleles_and_genotypes(`
`846`	`847`	`partition.start, partition.stop, shape, alleles.array.shape[1]`
`847`	`848`	`):`