Skip to content

Commit b866285

Browse files
jeromekelleher authored and tomwhite committed
Run on zarr-python v3 with zarr-format v2
1 parent 883a37e commit b866285

File tree

5 files changed

+53
-18
lines changed

5 files changed

+53
-18
lines changed

.github/workflows/ci.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,3 +132,25 @@ jobs:
132132
# We just run the CLI tests here because it doesn't require other upstream
133133
# packages like sgkit (which are tangled up with the numpy 2 dependency)
134134
python -m pytest tests/test_cli.py
135+
136+
test-zarr-version:
137+
name: Test Zarr versions
138+
runs-on: ubuntu-latest
139+
strategy:
140+
matrix:
141+
zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"]
142+
steps:
143+
- uses: actions/checkout@v4
144+
- uses: actions/setup-python@v5
145+
with:
146+
python-version: '3.11'
147+
- name: Install dependencies
148+
run: |
149+
python -m pip install --upgrade pip
150+
python -m pip install '.[dev]'
151+
- name: Install ${{ matrix.zarr }}
152+
run: |
153+
python -m pip install --pre '${{ matrix.zarr }}'
154+
- name: Run tests
155+
run: |
156+
python -m pytest

bio2zarr/plink.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def convert(
8686
# we're not using the best Blosc settings for genotypes here.
8787
default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
8888

89-
a = root.array(
89+
a = root.create_dataset(
9090
"sample_id",
9191
data=bed.iid,
9292
shape=bed.iid.shape,
@@ -99,7 +99,7 @@ def convert(
9999

100100
# TODO encode these in slices - but read them in one go to avoid
101101
# fetching repeatedly from bim file
102-
a = root.array(
102+
a = root.create_dataset(
103103
"variant_position",
104104
data=bed.bp_position,
105105
shape=bed.bp_position.shape,
@@ -111,13 +111,13 @@ def convert(
111111
logger.debug("encoded variant_position")
112112

113113
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
114-
a = root.array(
114+
a = root.create_dataset(
115115
"variant_allele",
116116
data=alleles,
117117
shape=alleles.shape,
118118
dtype="str",
119119
compressor=default_compressor,
120-
chunks=(variants_chunk_size,),
120+
chunks=(variants_chunk_size, alleles.shape[1]),
121121
)
122122
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
123123
logger.debug("encoded variant_allele")

bio2zarr/vcf2zarr/vcz.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
import numpy as np
1414
import zarr
1515

16-
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
16+
from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
1717

1818
from .. import constants, core, provenance
1919
from . import icf
@@ -572,7 +572,7 @@ def init(
572572
def encode_samples(self, root):
573573
if self.schema.samples != self.icf.metadata.samples:
574574
raise ValueError("Subsetting or reordering samples not supported currently")
575-
array = root.array(
575+
array = root.create_dataset(
576576
"sample_id",
577577
data=[sample.id for sample in self.schema.samples],
578578
shape=len(self.schema.samples),
@@ -584,7 +584,7 @@ def encode_samples(self, root):
584584
logger.debug("Samples done")
585585

586586
def encode_contig_id(self, root):
587-
array = root.array(
587+
array = root.create_dataset(
588588
"contig_id",
589589
data=[contig.id for contig in self.schema.contigs],
590590
shape=len(self.schema.contigs),
@@ -593,7 +593,7 @@ def encode_contig_id(self, root):
593593
)
594594
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
595595
if all(contig.length is not None for contig in self.schema.contigs):
596-
array = root.array(
596+
array = root.create_dataset(
597597
"contig_length",
598598
data=[contig.length for contig in self.schema.contigs],
599599
shape=len(self.schema.contigs),
@@ -605,7 +605,7 @@ def encode_contig_id(self, root):
605605
def encode_filter_id(self, root):
606606
# TODO need a way to store description also
607607
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
608-
array = root.array(
608+
array = root.create_dataset(
609609
"filter_id",
610610
data=[filt.id for filt in self.schema.filters],
611611
shape=len(self.schema.filters),
@@ -615,9 +615,17 @@ def encode_filter_id(self, root):
615615
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
616616

617617
def init_array(self, root, array_spec, variants_dim_size):
618-
object_codec = None
618+
kwargs = dict(ZARR_FORMAT_KWARGS)
619+
filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
619620
if array_spec.dtype == "O":
620-
object_codec = numcodecs.VLenUTF8()
621+
if zarr_v3():
622+
filters = [*list(filters), numcodecs.VLenUTF8()]
623+
else:
624+
kwargs["object_codec"] = numcodecs.VLenUTF8()
625+
626+
if not zarr_v3():
627+
kwargs["dimension_separator"] = self.metadata.dimension_separator
628+
621629
shape = list(array_spec.shape)
622630
# Truncate the variants dimension if max_variant_chunks was specified
623631
shape[0] = variants_dim_size
@@ -627,10 +635,8 @@ def init_array(self, root, array_spec, variants_dim_size):
627635
chunks=array_spec.chunks,
628636
dtype=array_spec.dtype,
629637
compressor=numcodecs.get_codec(array_spec.compressor),
630-
filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
631-
object_codec=object_codec,
632-
dimension_separator=self.metadata.dimension_separator,
633-
**ZARR_FORMAT_KWARGS,
638+
filters=filters,
639+
**kwargs,
634640
)
635641
a.attrs.update(
636642
{
@@ -946,13 +952,16 @@ def create_index(self):
946952
c_start_idx = c_end_idx + 1
947953

948954
index = np.array(index, dtype=np.int32)
949-
array = root.array(
955+
kwargs = {}
956+
if not zarr_v3():
957+
kwargs["dimension_separator"] = self.metadata.dimension_separator
958+
array = root.create_dataset(
950959
"region_index",
951960
data=index,
952961
shape=index.shape,
953962
dtype=index.dtype,
954963
compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
955-
dimension_separator=self.metadata.dimension_separator,
964+
**kwargs,
956965
)
957966
array.attrs["_ARRAY_DIMENSIONS"] = [
958967
"region_index_values",

bio2zarr/vcf2zarr/verification.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
7777
if vcf_type in ("String", "Character"):
7878
split = list(vcf_val.split(","))
7979
k = len(split)
80-
if isinstance(zarr_val, str):
80+
if isinstance(zarr_val, str) or zarr_val.ndim == 0:
8181
assert k == 1
8282
# Scalar
8383
assert vcf_val == zarr_val

tests/test_vcz.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from bio2zarr import core, vcf2zarr
1010
from bio2zarr.vcf2zarr import icf as icf_mod
1111
from bio2zarr.vcf2zarr import vcz as vcz_mod
12+
from bio2zarr.zarr_utils import zarr_v3
1213

1314

1415
@pytest.fixture(scope="module")
@@ -112,6 +113,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version):
112113
vcz_mod.VcfZarrWriterMetadata.fromdict(d)
113114

114115

116+
@pytest.mark.skipif(
117+
zarr_v3(), reason="Zarr-python v3 does not support dimension_separator"
118+
)
115119
class TestEncodeDimensionSeparator:
116120
@pytest.mark.parametrize("dimension_separator", [None, "/"])
117121
def test_directories(self, tmp_path, icf_path, dimension_separator):

0 commit comments

Comments
 (0)