Run on zarr-python v3 with zarr-format v2

jeromekelleher · jeromekelleher · commit ecfd46b8f6a6 · 2025-01-14T09:07:37.000Z
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -132,3 +132,25 @@ jobs:
           # We just run the CLI tests here because it doesn't require other upstream
           # packages like sgkit (which are tangled up with the numpy 2 dependency)
           python -m pytest tests/test_cli.py
+
+  test-zarr-version:  # Runs the full test suite once per Zarr install target in the matrix.
+    name: Test Zarr versions
+    runs-on: ubuntu-latest
+    strategy:  # One entry pins a zarr 2.x release; the other installs zarr-python from its git repository.
+      matrix:
+        zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install '.[dev]'
+      - name: Install ${{ matrix.zarr }}
+        run: |
+          python -m pip install --pre '${{ matrix.zarr }}'
+      - name: Run tests
+        run: |
+          python -m pytest
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -86,7 +86,7 @@ def convert(
     # we're not using the best Blosc settings for genotypes here.
     default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
 
-    a = root.array(
+    a = root.create_dataset(
         "sample_id",
         data=bed.iid,
         shape=bed.iid.shape,
@@ -99,7 +99,7 @@ def convert(
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
-    a = root.array(
+    a = root.create_dataset(
         "variant_position",
         data=bed.bp_position,
         shape=bed.bp_position.shape,
@@ -111,13 +111,13 @@ def convert(
     logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.array(
+    a = root.create_dataset(
         "variant_allele",
         data=alleles,
         shape=alleles.shape,
         dtype="str",
         compressor=default_compressor,
-        chunks=(variants_chunk_size,),
+        chunks=(variants_chunk_size, alleles.shape[1]),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug("encoded variant_allele")
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
@@ -13,7 +13,7 @@
 import numpy as np
 import zarr
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
 
 from .. import constants, core, provenance
 from . import icf
@@ -571,7 +571,7 @@ def init(
     def encode_samples(self, root):
         if self.schema.samples != self.icf.metadata.samples:
             raise ValueError("Subsetting or reordering samples not supported currently")
-        array = root.array(
+        array = root.create_dataset(
             "sample_id",
             data=[sample.id for sample in self.schema.samples],
             shape=len(self.schema.samples),
@@ -583,7 +583,7 @@ def encode_samples(self, root):
         logger.debug("Samples done")
 
     def encode_contig_id(self, root):
-        array = root.array(
+        array = root.create_dataset(
             "contig_id",
             data=[contig.id for contig in self.schema.contigs],
             shape=len(self.schema.contigs),
@@ -592,7 +592,7 @@ def encode_contig_id(self, root):
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if all(contig.length is not None for contig in self.schema.contigs):
-            array = root.array(
+            array = root.create_dataset(
                 "contig_length",
                 data=[contig.length for contig in self.schema.contigs],
                 shape=len(self.schema.contigs),
@@ -604,7 +604,7 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
-        array = root.array(
+        array = root.create_dataset(
             "filter_id",
             data=[filt.id for filt in self.schema.filters],
             shape=len(self.schema.filters),
@@ -614,9 +614,17 @@ def encode_filter_id(self, root):
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
     def init_array(self, root, array_spec, variants_dim_size):
-        object_codec = None
+        kwargs = dict(ZARR_FORMAT_KWARGS)
+        filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
         if array_spec.dtype == "O":
-            object_codec = numcodecs.VLenUTF8()
+            if zarr_v3():
+                filters = [*list(filters), numcodecs.VLenUTF8()]
+            else:
+                kwargs["object_codec"] = numcodecs.VLenUTF8()
+
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
+
         shape = list(array_spec.shape)
         # Truncate the variants dimension is max_variant_chunks was specified
         shape[0] = variants_dim_size
@@ -626,10 +634,8 @@ def init_array(self, root, array_spec, variants_dim_size):
             chunks=array_spec.chunks,
             dtype=array_spec.dtype,
             compressor=numcodecs.get_codec(array_spec.compressor),
-            filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
-            object_codec=object_codec,
-            dimension_separator=self.metadata.dimension_separator,
-            **ZARR_FORMAT_KWARGS,
+            filters=filters,
+            **kwargs,
         )
         a.attrs.update(
             {
@@ -945,13 +951,16 @@ def create_index(self):
                 c_start_idx = c_end_idx + 1
 
         index = np.array(index, dtype=np.int32)
-        array = root.array(
+        kwargs = {}
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
+        array = root.create_dataset(
             "region_index",
             data=index,
             shape=index.shape,
             dtype=index.dtype,
             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-            dimension_separator=self.metadata.dimension_separator,
+            **kwargs,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = [
             "region_index_values",
diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py
@@ -77,7 +77,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
     if vcf_type in ("String", "Character"):
         split = list(vcf_val.split(","))
         k = len(split)
-        if isinstance(zarr_val, str):
+        if isinstance(zarr_val, str) or zarr_val.ndim == 0:
             assert k == 1
             # Scalar
             assert vcf_val == zarr_val
diff --git a/tests/test_vcz.py b/tests/test_vcz.py
@@ -9,6 +9,7 @@
 from bio2zarr import core, vcf2zarr
 from bio2zarr.vcf2zarr import icf as icf_mod
 from bio2zarr.vcf2zarr import vcz as vcz_mod
+from bio2zarr.zarr_utils import zarr_v3
 
 
 @pytest.fixture(scope="module")
@@ -117,6 +118,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version):
             vcz_mod.VcfZarrWriterMetadata.fromdict(d)
 
 
+@pytest.mark.skipif(
+    zarr_v3(), reason="Zarr-python v3 does not support dimension_separator"
+)
 class TestEncodeDimensionSeparator:
     @pytest.mark.parametrize("dimension_separator", [None, "/"])
     def test_directories(self, tmp_path, icf_path, dimension_separator):