Add generic retrieval API

jeromekelleher · jeromekelleher · commit 79dbcd242470 · 2025-04-04T10:24:05.000+01:00
diff --git a/vcztools/cli.py b/vcztools/cli.py
@@ -4,6 +4,7 @@
 import sys
 from functools import wraps
 
+import zarr
 import click
 
 from . import plink, vcf_writer
@@ -256,8 +257,10 @@ def view(
 
 @click.command
 @click.argument("path", type=click.Path())
+@include
+@exclude
 @click.option("--out", default="plink")
-def view_plink1(path, out):
+def view_plink1(path, include, exclude, out):
     """
     Generate a plink1 binary fileset compatible with plink1.9 --vcf.
     This command is equivalent to running ``vcztools view [filtering options]
diff --git a/vcztools/plink.py b/vcztools/plink.py
@@ -6,7 +6,7 @@
 import pandas as pd
 import zarr
 
-from vcztools import _vcztools
+from . import _vcztools, retrieval
 
 
 def encode_genotypes(genotypes, a12_allele=None):
@@ -62,6 +62,7 @@ def generate_bim(root, a12_allele):
 class Writer:
     def __init__(self, vcz_path, bed_path, fam_path, bim_path):
         self.root = zarr.open(vcz_path, mode="r")
+
         self.bim_path = bim_path
         self.fam_path = fam_path
         self.bed_path = bed_path
@@ -109,28 +110,22 @@ def _compute_alleles(self, G, alleles):
         return a12_allele
 
     def _write_genotypes(self):
+        ci = retrieval.variant_chunk_iter(
+            self.root, fields=["call_genotype", "variant_allele"]
+        )
         call_genotype = self.root["call_genotype"]
-        variant_allele = self.root["variant_allele"]
         a12_allele = zarr.zeros(
             (call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int
         )
         with open(self.bed_path, "wb") as bed_file:
             bed_file.write(bytes([0x6C, 0x1B, 0x01]))
-            for v_chunk in range(call_genotype.cdata_shape[0]):
-                # before = time.perf_counter()
-                G = call_genotype.blocks[v_chunk]
-                # duration = time.perf_counter() - before
 
-                # before = time.perf_counter()
-                a12 = self._compute_alleles(G, variant_allele.blocks[v_chunk])
-                # duration = time.perf_counter() - before
-
-                # before = time.perf_counter()
+            for j, chunk in enumerate(ci):
+                G = chunk["call_genotype"]
+                a12 = self._compute_alleles(G, chunk["variant_allele"])
                 buff = encode_genotypes(G, a12)
-                # duration = time.perf_counter() - before
-
                 bed_file.write(buff)
-                a12_allele.blocks[v_chunk] = a12
+                a12_allele.blocks[j] = a12
         return a12_allele[:]
 
     def run(self):
diff --git a/vcztools/retrieval.py b/vcztools/retrieval.py
@@ -0,0 +1,32 @@
+import collections.abc
+
+
+class VariantChunkReader(collections.abc.Sequence):
+    """
+    Retrieve data from a Zarr store and return chunk-by-chunk in the
+    variants dimension.
+    """
+
+    def __init__(self, root, *, fields=None):
+        self.root = root
+        if fields is None:
+            fields = [
+                key
+                for key in root.keys()
+                if key.startswith("variant_") or key.startswith("call_")
+            ]
+        self.arrays = {key: self.root[key] for key in fields}
+        # TODO validate the arrays have the correct shapes etc.
+        self.num_chunks = next(iter(self.arrays.values())).cdata_shape[0]
+
+    def __len__(self):
+        return self.num_chunks
+
+    def __getitem__(self, chunk):
+        return {key: array.blocks[chunk] for key, array in self.arrays.items()}
+
+
+def variant_chunk_iter(root, fields=None, variant_select=None):
+    chunk_reader = VariantChunkReader(root, fields=fields)
+    for chunk in range(len(chunk_reader)):
+        yield chunk_reader[chunk]