Skip to content

Commit ecfd46b

Browse files
Run on zarr-python v3 with zarr-format v2
1 parent 9c282bf commit ecfd46b

File tree

5 files changed

+53
-18
lines changed

5 files changed

+53
-18
lines changed

.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,3 +132,25 @@ jobs:
132132
# We just run the CLI tests here because it doesn't require other upstream
133133
# packages like sgkit (which are tangled up with the numpy 2 dependency)
134134
python -m pytest tests/test_cli.py
135+
136+
test-zarr-version:
137+
name: Test Zarr versions
138+
runs-on: ubuntu-latest
139+
strategy:
140+
matrix:
141+
zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"]
142+
steps:
143+
- uses: actions/checkout@v4
144+
- uses: actions/setup-python@v5
145+
with:
146+
python-version: '3.11'
147+
- name: Install dependencies
148+
run: |
149+
python -m pip install --upgrade pip
150+
python -m pip install '.[dev]'
151+
- name: Install ${{ matrix.zarr }}
152+
run: |
153+
python -m pip install --pre '${{ matrix.zarr }}'
154+
- name: Run tests
155+
run: |
156+
python -m pytest

bio2zarr/plink.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,7 @@ def convert(
8686
# we're not using the best Blosc settings for genotypes here.
8787
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
8888

89-
a = root.array(
89+
a = root.create_dataset(
9090
"sample_id",
9191
data=bed.iid,
9292
shape=bed.iid.shape,
@@ -99,7 +99,7 @@ def convert(
9999

100100
# TODO encode these in slices - but read them in one go to avoid
101101
# fetching repeatedly from bim file
102-
a = root.array(
102+
a = root.create_dataset(
103103
"variant_position",
104104
data=bed.bp_position,
105105
shape=bed.bp_position.shape,
@@ -111,13 +111,13 @@ def convert(
111111
logger.debug("encoded variant_position")
112112

113113
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
114-
a = root.array(
114+
a = root.create_dataset(
115115
"variant_allele",
116116
data=alleles,
117117
shape=alleles.shape,
118118
dtype="str",
119119
compressor=default_compressor,
120-
chunks=(variants_chunk_size,),
120+
chunks=(variants_chunk_size, alleles.shape[1]),
121121
)
122122
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
123123
logger.debug("encoded variant_allele")

bio2zarr/vcf2zarr/vcz.py

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
import numpy as np
1414
import zarr
1515

16-
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
16+
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
1717

1818
from .. import constants, core, provenance
1919
from . import icf
@@ -571,7 +571,7 @@ def init(
571571
def encode_samples(self, root):
572572
if self.schema.samples != self.icf.metadata.samples:
573573
raise ValueError("Subsetting or reordering samples not supported currently")
574-
array = root.array(
574+
array = root.create_dataset(
575575
"sample_id",
576576
data=[sample.id for sample in self.schema.samples],
577577
shape=len(self.schema.samples),
@@ -583,7 +583,7 @@ def encode_samples(self, root):
583583
logger.debug("Samples done")
584584

585585
def encode_contig_id(self, root):
586-
array = root.array(
586+
array = root.create_dataset(
587587
"contig_id",
588588
data=[contig.id for contig in self.schema.contigs],
589589
shape=len(self.schema.contigs),
@@ -592,7 +592,7 @@ def encode_contig_id(self, root):
592592
)
593593
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
594594
if all(contig.length is not None for contig in self.schema.contigs):
595-
array = root.array(
595+
array = root.create_dataset(
596596
"contig_length",
597597
data=[contig.length for contig in self.schema.contigs],
598598
shape=len(self.schema.contigs),
@@ -604,7 +604,7 @@ def encode_contig_id(self, root):
604604
def encode_filter_id(self, root):
605605
# TODO need a way to store description also
606606
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
607-
array = root.array(
607+
array = root.create_dataset(
608608
"filter_id",
609609
data=[filt.id for filt in self.schema.filters],
610610
shape=len(self.schema.filters),
@@ -614,9 +614,17 @@ def encode_filter_id(self, root):
614614
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
615615

616616
def init_array(self, root, array_spec, variants_dim_size):
617-
object_codec = None
617+
kwargs = dict(ZARR_FORMAT_KWARGS)
618+
filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
618619
if array_spec.dtype == "O":
619-
object_codec = numcodecs.VLenUTF8()
620+
if zarr_v3():
621+
filters = [*list(filters), numcodecs.VLenUTF8()]
622+
else:
623+
kwargs["object_codec"] = numcodecs.VLenUTF8()
624+
625+
if not zarr_v3():
626+
kwargs["dimension_separator"] = self.metadata.dimension_separator
627+
620628
shape = list(array_spec.shape)
621629
# Truncate the variants dimension if max_variant_chunks was specified
622630
shape[0] = variants_dim_size
@@ -626,10 +634,8 @@ def init_array(self, root, array_spec, variants_dim_size):
626634
chunks=array_spec.chunks,
627635
dtype=array_spec.dtype,
628636
compressor=numcodecs.get_codec(array_spec.compressor),
629-
filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
630-
object_codec=object_codec,
631-
dimension_separator=self.metadata.dimension_separator,
632-
**ZARR_FORMAT_KWARGS,
637+
filters=filters,
638+
**kwargs,
633639
)
634640
a.attrs.update(
635641
{
@@ -945,13 +951,16 @@ def create_index(self):
945951
c_start_idx = c_end_idx + 1
946952

947953
index = np.array(index, dtype=np.int32)
948-
array = root.array(
954+
kwargs = {}
955+
if not zarr_v3():
956+
kwargs["dimension_separator"] = self.metadata.dimension_separator
957+
array = root.create_dataset(
949958
"region_index",
950959
data=index,
951960
shape=index.shape,
952961
dtype=index.dtype,
953962
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
954-
dimension_separator=self.metadata.dimension_separator,
963+
**kwargs,
955964
)
956965
array.attrs["_ARRAY_DIMENSIONS"] = [
957966
"region_index_values",

bio2zarr/vcf2zarr/verification.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
7777
if vcf_type in ("String", "Character"):
7878
split = list(vcf_val.split(","))
7979
k = len(split)
80-
if isinstance(zarr_val, str):
80+
if isinstance(zarr_val, str) or zarr_val.ndim == 0:
8181
assert k == 1
8282
# Scalar
8383
assert vcf_val == zarr_val

tests/test_vcz.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
from bio2zarr import core, vcf2zarr
1010
from bio2zarr.vcf2zarr import icf as icf_mod
1111
from bio2zarr.vcf2zarr import vcz as vcz_mod
12+
from bio2zarr.zarr_utils import zarr_v3
1213

1314

1415
@pytest.fixture(scope="module")
@@ -117,6 +118,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version):
117118
vcz_mod.VcfZarrWriterMetadata.fromdict(d)
118119

119120

121+
@pytest.mark.skipif(
122+
zarr_v3(), reason="Zarr-python v3 does not support dimension_separator"
123+
)
120124
class TestEncodeDimensionSeparator:
121125
@pytest.mark.parametrize("dimension_separator", [None, "/"])
122126
def test_directories(self, tmp_path, icf_path, dimension_separator):

0 commit comments

Comments
 (0)