Skip to content

Commit d8f72c9

Browse files
committed
Add first_dim_iter utility to workaround slow array iteration in zarr-python v3
1 parent b866285 commit d8f72c9

File tree

2 files changed

+15
-3
lines changed

2 files changed

+15
-3
lines changed

bio2zarr/vcf2zarr/verification.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import tqdm
55
import zarr
66

7+
from bio2zarr.zarr_utils import first_dim_iter
8+
79
from .. import constants
810

911

@@ -152,7 +154,7 @@ def verify(vcf_path, zarr_path, show_progress=False):
152154
vid = root["variant_id"][:]
153155
call_genotype = None
154156
if "call_genotype" in root and root["call_genotype"].size > 0:
155-
call_genotype = iter(root["call_genotype"])
157+
call_genotype = first_dim_iter(root["call_genotype"])
156158

157159
vcf = cyvcf2.VCF(vcf_path)
158160
format_headers = {}
@@ -170,12 +172,16 @@ def verify(vcf_path, zarr_path, show_progress=False):
170172
vcf_name = colname.split("_", 1)[1]
171173
vcf_type = format_headers[vcf_name]["Type"]
172174
vcf_number = format_headers[vcf_name]["Number"]
173-
format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname])
175+
format_fields[vcf_name] = (
176+
vcf_type,
177+
vcf_number,
178+
first_dim_iter(root[colname]),
179+
)
174180
if colname.startswith("variant"):
175181
name = colname.split("_", 1)[1]
176182
if name.isupper():
177183
vcf_type = info_headers[name]["Type"]
178-
info_fields[name] = vcf_type, iter(root[colname])
184+
info_fields[name] = vcf_type, first_dim_iter(root[colname])
179185

180186
first_pos = next(vcf).POS
181187
start_index = np.searchsorted(pos, first_pos)

bio2zarr/zarr_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,9 @@ def zarr_v3() -> bool:
1111
ZARR_FORMAT_KWARGS = dict(zarr_format=2)
1212
else:
1313
ZARR_FORMAT_KWARGS = dict()
14+
15+
16+
# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
17+
def first_dim_iter(z):
    """Iterate over the entries of ``z`` along its first dimension, block by block.

    Added as a workaround for slow array iteration in zarr-python v3: rather
    than using the array's own iterator, each block along the first dimension
    is fetched once through ``z.blocks`` and its entries are yielded in order.

    Parameters
    ----------
    z
        Object exposing ``cdata_shape`` (a sequence whose first entry is used
        as the number of blocks along the first dimension) and ``blocks``
        (indexable by block number, returning an iterable).
        NOTE(review): presumably a ``zarr.Array`` -- confirm against callers.

    Yields
    ------
    The items of ``z.blocks[0]``, then ``z.blocks[1]``, and so on.

    Notes
    -----
    This is a generator, so ``z`` is not touched until the first item is
    requested.
    """
    # Only the first entry of cdata_shape is consulted; indexing ``blocks``
    # with a single integer selects one block along the first dimension.
    for chunk in range(z.cdata_shape[0]):
        yield from z.blocks[chunk]

0 commit comments

Comments
 (0)