sgkit-dev · jeromekelleher · Jan 14, 2025 · Oct 23, 2024 · Jan 7, 2025 · Jan 8, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -132,3 +132,25 @@ jobs:
           # We just run the CLI tests here because it doesn't require other upstream
           # packages like sgkit (which are tangled up with the numpy 2 dependency)
           python -m pytest tests/test_cli.py
+
+  test-zarr-version:  # runs the test suite once per zarr version spec in the matrix
+    name: Test Zarr versions
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        zarr: ["==2.18.3", ">=3"]  # pip specifiers, appended to "zarr" at install
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install '.[dev]'
+      - name: Install zarr${{ matrix.zarr }}  # after '.[dev]'; presumably replaces its zarr
+        run: |
+          python -m pip install 'zarr${{ matrix.zarr }}'
+      - name: Run tests  # the -k filter below deselects test_double_encode_partition
+        run: |
+          python -m pytest -k "not test_double_encode_partition"
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -117,7 +117,7 @@ def convert(
         shape=alleles.shape,
         dtype="str",
         compressor=default_compressor,
-        chunks=(variants_chunk_size,),
+        chunks=(variants_chunk_size, alleles.shape[1]),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug("encoded variant_allele")

diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
@@ -13,7 +13,7 @@
 import numpy as np
 import zarr
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
 
 from .. import constants, core, provenance
 from . import icf
@@ -615,9 +615,17 @@ def encode_filter_id(self, root):
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
     def init_array(self, root, array_spec, variants_dim_size):
-        object_codec = None
+        kwargs = dict(ZARR_FORMAT_KWARGS)
+        filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
         if array_spec.dtype == "O":
-            object_codec = numcodecs.VLenUTF8()
+            if zarr_v3():
+                filters = [*list(filters), numcodecs.VLenUTF8()]
+            else:
+                kwargs["object_codec"] = numcodecs.VLenUTF8()
+
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
+
         shape = list(array_spec.shape)
         # Truncate the variants dimension is max_variant_chunks was specified
         shape[0] = variants_dim_size
@@ -627,10 +635,8 @@ def init_array(self, root, array_spec, variants_dim_size):
             chunks=array_spec.chunks,
             dtype=array_spec.dtype,
             compressor=numcodecs.get_codec(array_spec.compressor),
-            filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
-            object_codec=object_codec,
-            dimension_separator=self.metadata.dimension_separator,
-            **ZARR_FORMAT_KWARGS,
+            filters=filters,
+            **kwargs,
         )
         a.attrs.update(
             {
@@ -946,13 +952,16 @@ def create_index(self):
                 c_start_idx = c_end_idx + 1
 
         index = np.array(index, dtype=np.int32)
+        kwargs = {}
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
         array = root.array(
             "region_index",
             data=index,
             shape=index.shape,
             dtype=index.dtype,
             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-            dimension_separator=self.metadata.dimension_separator,
+            **kwargs,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = [
             "region_index_values",

diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py
@@ -4,6 +4,8 @@
 import tqdm
 import zarr
 
+from bio2zarr.zarr_utils import first_dim_iter
+
 from .. import constants
 
 
@@ -77,7 +79,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
     if vcf_type in ("String", "Character"):
         split = list(vcf_val.split(","))
         k = len(split)
-        if isinstance(zarr_val, str):
+        if isinstance(zarr_val, str) or zarr_val.ndim == 0:
             assert k == 1
             # Scalar
             assert vcf_val == zarr_val
@@ -152,7 +154,7 @@ def verify(vcf_path, zarr_path, show_progress=False):
     vid = root["variant_id"][:]
     call_genotype = None
     if "call_genotype" in root and root["call_genotype"].size > 0:
-        call_genotype = iter(root["call_genotype"])
+        call_genotype = first_dim_iter(root["call_genotype"])
 
     vcf = cyvcf2.VCF(vcf_path)
     format_headers = {}
@@ -170,12 +172,16 @@ def verify(vcf_path, zarr_path, show_progress=False):
             vcf_name = colname.split("_", 1)[1]
             vcf_type = format_headers[vcf_name]["Type"]
             vcf_number = format_headers[vcf_name]["Number"]
-            format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname])
+            format_fields[vcf_name] = (
+                vcf_type,
+                vcf_number,
+                first_dim_iter(root[colname]),
+            )
         if colname.startswith("variant"):
             name = colname.split("_", 1)[1]
             if name.isupper():
                 vcf_type = info_headers[name]["Type"]
-                info_fields[name] = vcf_type, iter(root[colname])
+                info_fields[name] = vcf_type, first_dim_iter(root[colname])
 
     first_pos = next(vcf).POS
     start_index = np.searchsorted(pos, first_pos)

diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py
@@ -11,3 +11,9 @@ def zarr_v3() -> bool:
     ZARR_FORMAT_KWARGS = dict(zarr_format=2)
 else:
     ZARR_FORMAT_KWARGS = dict()
+
+
+# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
+def first_dim_iter(z):  # generator: items of z.blocks[i], for i along axis 0
+    for chunk in range(z.cdata_shape[0]):  # block indices; count from cdata_shape
+        yield from z.blocks[chunk]  # iterate the block, yielding each item in turn
diff --git a/tests/test_vcz.py b/tests/test_vcz.py
@@ -9,6 +9,7 @@
 from bio2zarr import core, vcf2zarr
 from bio2zarr.vcf2zarr import icf as icf_mod
 from bio2zarr.vcf2zarr import vcz as vcz_mod
+from bio2zarr.zarr_utils import zarr_v3
 
 
 @pytest.fixture(scope="module")
@@ -112,6 +113,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version):
             vcz_mod.VcfZarrWriterMetadata.fromdict(d)
 
 
+@pytest.mark.skipif(
+    zarr_v3(), reason="Zarr-python v3 does not support dimension_separator"
+)
 class TestEncodeDimensionSeparator:
     @pytest.mark.parametrize("dimension_separator", [None, "/"])
     def test_directories(self, tmp_path, icf_path, dimension_separator):