Skip to content

Commit b1d7ef2

Browse files
tomwhite authored and jeromekelleher committed
Backward and forward-compatible changes to support zarr-python v3
1 parent 0ef5623 commit b1d7ef2

File tree

4 files changed

+48
-30
lines changed

4 files changed

+48
-30
lines changed

bio2zarr/plink.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import numpy as np
77
import zarr
88

9+
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
10+
911
from . import core
1012

1113
logger = logging.getLogger(__name__)
@@ -17,8 +19,7 @@ def encode_genotypes_slice(bed_path, zarr_path, start, stop):
1719
# the correct approach is, but it is important to note that the
1820
# 0th allele is *not* necessarily the REF for these datasets.
1921
bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
20-
store = zarr.DirectoryStore(zarr_path)
21-
root = zarr.group(store=store)
22+
root = zarr.open(store=zarr_path, mode="a", **ZARR_FORMAT_KWARGS)
2223
gt = core.BufferedArray(root["call_genotype"], start)
2324
gt_mask = core.BufferedArray(root["call_genotype_mask"], start)
2425
gt_phased = core.BufferedArray(root["call_genotype_phased"], start)
@@ -73,8 +74,7 @@ def convert(
7374
if variants_chunk_size is None:
7475
variants_chunk_size = 10_000
7576

76-
store = zarr.DirectoryStore(zarr_path)
77-
root = zarr.group(store=store, overwrite=True)
77+
root = zarr.open_group(store=zarr_path, mode="w", **ZARR_FORMAT_KWARGS)
7878

7979
ploidy = 2
8080
shape = [m, n]
@@ -88,7 +88,8 @@ def convert(
8888

8989
a = root.array(
9090
"sample_id",
91-
bed.iid,
91+
data=bed.iid,
92+
shape=bed.iid.shape,
9293
dtype="str",
9394
compressor=default_compressor,
9495
chunks=(samples_chunk_size,),
@@ -100,7 +101,8 @@ def convert(
100101
# fetching repeatedly from bim file
101102
a = root.array(
102103
"variant_position",
103-
bed.bp_position,
104+
data=bed.bp_position,
105+
shape=bed.bp_position.shape,
104106
dtype=np.int32,
105107
compressor=default_compressor,
106108
chunks=(variants_chunk_size,),
@@ -111,7 +113,8 @@ def convert(
111113
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
112114
a = root.array(
113115
"variant_allele",
114-
alleles,
116+
data=alleles,
117+
shape=alleles.shape,
115118
dtype="str",
116119
compressor=default_compressor,
117120
chunks=(variants_chunk_size,),
@@ -121,31 +124,34 @@ def convert(
121124

122125
# TODO remove this?
123126
a = root.empty(
124-
"call_genotype_phased",
127+
name="call_genotype_phased",
125128
dtype="bool",
126129
shape=list(shape),
127130
chunks=list(chunks),
128131
compressor=default_compressor,
132+
**ZARR_FORMAT_KWARGS,
129133
)
130134
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
131135

132136
shape += [ploidy]
133137
dimensions += ["ploidy"]
134138
a = root.empty(
135-
"call_genotype",
139+
name="call_genotype",
136140
dtype="i1",
137141
shape=list(shape),
138142
chunks=list(chunks),
139143
compressor=default_compressor,
144+
**ZARR_FORMAT_KWARGS,
140145
)
141146
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
142147

143148
a = root.empty(
144-
"call_genotype_mask",
149+
name="call_genotype_mask",
145150
dtype="bool",
146151
shape=list(shape),
147152
chunks=list(chunks),
148153
compressor=default_compressor,
154+
**ZARR_FORMAT_KWARGS,
149155
)
150156
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
151157

@@ -154,7 +160,7 @@ def convert(
154160
num_slices = max(1, worker_processes * 4)
155161
slices = core.chunk_aligned_slices(a, num_slices)
156162

157-
total_chunks = sum(a.nchunks for a in root.values())
163+
total_chunks = sum(a.nchunks for _, a in root.arrays())
158164

159165
progress_config = core.ProgressConfig(
160166
total=total_chunks, title="Convert", units="chunks", show=show_progress
@@ -171,8 +177,7 @@ def convert(
171177
# FIXME do this more efficiently - currently reading the whole thing
172178
# in for convenience, and also comparing call-by-call
173179
def validate(bed_path, zarr_path):
174-
store = zarr.DirectoryStore(zarr_path)
175-
root = zarr.group(store=store)
180+
root = zarr.open(store=zarr_path, mode="r")
176181
call_genotype = root["call_genotype"][:]
177182

178183
bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)

bio2zarr/vcf2zarr/vcz.py

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import numpy as np
1414
import zarr
1515

16+
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
17+
1618
from .. import constants, core, provenance
1719
from . import icf
1820

@@ -532,8 +534,7 @@ def init(
532534
)
533535

534536
self.path.mkdir()
535-
store = zarr.DirectoryStore(self.path)
536-
root = zarr.group(store=store)
537+
root = zarr.open(store=self.path, mode="a", **ZARR_FORMAT_KWARGS)
537538
root.attrs.update(
538539
{
539540
"vcf_zarr_version": "0.2",
@@ -549,8 +550,7 @@ def init(
549550
self.wip_path.mkdir()
550551
self.arrays_path.mkdir()
551552
self.partitions_path.mkdir()
552-
store = zarr.DirectoryStore(self.arrays_path)
553-
root = zarr.group(store=store)
553+
root = zarr.open(store=self.arrays_path, mode="a", **ZARR_FORMAT_KWARGS)
554554

555555
total_chunks = 0
556556
for field in self.schema.fields:
@@ -574,7 +574,8 @@ def encode_samples(self, root):
574574
raise ValueError("Subsetting or reordering samples not supported currently")
575575
array = root.array(
576576
"sample_id",
577-
[sample.id for sample in self.schema.samples],
577+
data=[sample.id for sample in self.schema.samples],
578+
shape=len(self.schema.samples),
578579
dtype="str",
579580
compressor=DEFAULT_ZARR_COMPRESSOR,
580581
chunks=(self.schema.samples_chunk_size,),
@@ -585,15 +586,17 @@ def encode_samples(self, root):
585586
def encode_contig_id(self, root):
586587
array = root.array(
587588
"contig_id",
588-
[contig.id for contig in self.schema.contigs],
589+
data=[contig.id for contig in self.schema.contigs],
590+
shape=len(self.schema.contigs),
589591
dtype="str",
590592
compressor=DEFAULT_ZARR_COMPRESSOR,
591593
)
592594
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
593595
if all(contig.length is not None for contig in self.schema.contigs):
594596
array = root.array(
595597
"contig_length",
596-
[contig.length for contig in self.schema.contigs],
598+
data=[contig.length for contig in self.schema.contigs],
599+
shape=len(self.schema.contigs),
597600
dtype=np.int64,
598601
compressor=DEFAULT_ZARR_COMPRESSOR,
599602
)
@@ -604,7 +607,8 @@ def encode_filter_id(self, root):
604607
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
605608
array = root.array(
606609
"filter_id",
607-
[filt.id for filt in self.schema.filters],
610+
data=[filt.id for filt in self.schema.filters],
611+
shape=len(self.schema.filters),
608612
dtype="str",
609613
compressor=DEFAULT_ZARR_COMPRESSOR,
610614
)
@@ -618,14 +622,15 @@ def init_array(self, root, array_spec, variants_dim_size):
618622
# Truncate the variants dimension if max_variant_chunks was specified
619623
shape[0] = variants_dim_size
620624
a = root.empty(
621-
array_spec.name,
625+
name=array_spec.name,
622626
shape=shape,
623627
chunks=array_spec.chunks,
624628
dtype=array_spec.dtype,
625629
compressor=numcodecs.get_codec(array_spec.compressor),
626630
filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
627631
object_codec=object_codec,
628632
dimension_separator=self.metadata.dimension_separator,
633+
**ZARR_FORMAT_KWARGS,
629634
)
630635
a.attrs.update(
631636
{
@@ -690,9 +695,7 @@ def init_partition_array(self, partition_index, name):
690695
# Overwrite any existing WIP files
691696
wip_path = self.wip_partition_array_path(partition_index, name)
692697
shutil.copytree(src, wip_path, dirs_exist_ok=True)
693-
store = zarr.DirectoryStore(self.wip_partition_path(partition_index))
694-
wip_root = zarr.group(store=store)
695-
array = wip_root[name]
698+
array = zarr.open_array(store=wip_path, mode="a")
696699
logger.debug(f"Opened empty array {array.name} <{array.dtype}> @ {wip_path}")
697700
return array
698701

@@ -909,8 +912,7 @@ def finalise(self, show_progress=False):
909912
def create_index(self):
910913
"""Create an index to support efficient region queries."""
911914

912-
store = zarr.DirectoryStore(self.path)
913-
root = zarr.open_group(store=store, mode="r+")
915+
root = zarr.open_group(store=self.path, mode="r+")
914916

915917
contig = root["variant_contig"]
916918
pos = root["variant_position"]

bio2zarr/vcf2zarr/verification.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,7 @@ def assert_format_val_equal(vcf_val, zarr_val, vcf_type, vcf_number):
145145

146146

147147
def verify(vcf_path, zarr_path, show_progress=False):
148-
store = zarr.DirectoryStore(zarr_path)
149-
150-
root = zarr.group(store=store)
148+
root = zarr.open(store=zarr_path, mode="r")
151149
pos = root["variant_position"][:]
152150
allele = root["variant_allele"][:]
153151
chrom = root["contig_id"][:][root["variant_contig"][:]]

bio2zarr/zarr_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
import zarr
2+
from packaging.version import Version
3+
4+
5+
def zarr_v3() -> bool:
    """Return True when the installed zarr-python has major version 3 or later."""
    installed = Version(zarr.__version__)
    return installed.major >= 3
7+
8+
9+
# Use zarr format v2 even when running with zarr-python v3; under older
# zarr-python no extra keyword arguments are supplied.
ZARR_FORMAT_KWARGS = {"zarr_format": 2} if zarr_v3() else {}

0 commit comments

Comments
 (0)