Skip to content

Commit 79dbcd2

Browse files
Add generic retrieval API
1 parent d7918ae commit 79dbcd2

File tree

3 files changed

+45
-15
lines changed

3 files changed

+45
-15
lines changed

vcztools/cli.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import sys
55
from functools import wraps
66

7+
import zarr
78
import click
89

910
from . import plink, vcf_writer
@@ -256,8 +257,10 @@ def view(
256257

257258
@click.command
258259
@click.argument("path", type=click.Path())
260+
@include
261+
@exclude
259262
@click.option("--out", default="plink")
260-
def view_plink1(path, out):
263+
def view_plink1(path, include, exclude, out):
261264
"""
262265
Generate a plink1 binary fileset compatible with plink1.9 --vcf.
263266
This command is equivalent to running ``vcztools view [filtering options]

vcztools/plink.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pandas as pd
77
import zarr
88

9-
from vcztools import _vcztools
9+
from . import _vcztools, retrieval
1010

1111

1212
def encode_genotypes(genotypes, a12_allele=None):
@@ -62,6 +62,7 @@ def generate_bim(root, a12_allele):
6262
class Writer:
6363
def __init__(self, vcz_path, bed_path, fam_path, bim_path):
6464
self.root = zarr.open(vcz_path, mode="r")
65+
6566
self.bim_path = bim_path
6667
self.fam_path = fam_path
6768
self.bed_path = bed_path
@@ -109,28 +110,22 @@ def _compute_alleles(self, G, alleles):
109110
return a12_allele
110111

111112
def _write_genotypes(self):
113+
ci = retrieval.variant_chunk_iter(
114+
self.root, fields=["call_genotype", "variant_allele"]
115+
)
112116
call_genotype = self.root["call_genotype"]
113-
variant_allele = self.root["variant_allele"]
114117
a12_allele = zarr.zeros(
115118
(call_genotype.shape[0], 2), chunks=call_genotype.chunks[0], dtype=int
116119
)
117120
with open(self.bed_path, "wb") as bed_file:
118121
bed_file.write(bytes([0x6C, 0x1B, 0x01]))
119-
for v_chunk in range(call_genotype.cdata_shape[0]):
120-
# before = time.perf_counter()
121-
G = call_genotype.blocks[v_chunk]
122-
# duration = time.perf_counter() - before
123122

124-
# before = time.perf_counter()
125-
a12 = self._compute_alleles(G, variant_allele.blocks[v_chunk])
126-
# duration = time.perf_counter() - before
127-
128-
# before = time.perf_counter()
123+
for j, chunk in enumerate(ci):
124+
G = chunk["call_genotype"]
125+
a12 = self._compute_alleles(G, chunk["variant_allele"])
129126
buff = encode_genotypes(G, a12)
130-
# duration = time.perf_counter() - before
131-
132127
bed_file.write(buff)
133-
a12_allele.blocks[v_chunk] = a12
128+
a12_allele.blocks[j] = a12
134129
return a12_allele[:]
135130

136131
def run(self):

vcztools/retrieval.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import collections.abc
2+
3+
4+
class VariantChunkReader(collections.abc.Sequence):
5+
"""
6+
Retrieve data from a Zarr store and return chunk-by-chunk in the
7+
variants dimension.
8+
"""
9+
10+
def __init__(self, root, *, fields=None):
11+
self.root = root
12+
if fields is None:
13+
fields = [
14+
key
15+
for key in root.keys()
16+
if key.startswith("variant_") or key.startswith("call_")
17+
]
18+
self.arrays = {key: self.root[key] for key in fields}
19+
# TODO validate the arrays have the correct shapes setc
20+
self.num_chunks = next(iter(self.arrays.values())).cdata_shape[0]
21+
22+
def __len__(self):
23+
return self.num_chunks
24+
25+
def __getitem__(self, chunk):
26+
return {key: array.blocks[chunk] for key, array in self.arrays.items()}
27+
28+
29+
def variant_chunk_iter(root, fields=None, variant_select=None):
30+
chunk_reader = VariantChunkReader(root, fields=fields)
31+
for chunk in range(len(chunk_reader)):
32+
yield chunk_reader[chunk]

0 commit comments

Comments
 (0)