From b86628570875cec3b1e4f5460fa35b4d1ad553c8 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Wed, 23 Oct 2024 13:48:48 +0100
Subject: [PATCH 1/5] Run on zarr-python v3 with zarr-format v2

---
 .github/workflows/ci.yml          | 22 +++++++++++++++++++
 bio2zarr/plink.py                 |  8 +++----
 bio2zarr/vcf2zarr/vcz.py          | 35 +++++++++++++++++++------------
 bio2zarr/vcf2zarr/verification.py |  2 +-
 tests/test_vcz.py                 |  4 ++++
 5 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index deeebf5e..4c405b9d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -132,3 +132,25 @@ jobs:
           # We just run the CLI tests here because it doesn't require other upstream
           # packages like sgkit (which are tangled up with the numpy 2 dependency)
           python -m pytest tests/test_cli.py
+
+  test-zarr-version:
+    name: Test Zarr versions
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install '.[dev]'
+      - name: Install ${{ matrix.zarr }}
+        run: |
+          python -m pip install --pre '${{ matrix.zarr }}'
+      - name: Run tests
+        run: |
+          python -m pytest 
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index 88843db1..3aa9bcf0 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -86,7 +86,7 @@ def convert(
     # we're not using the best Blosc settings for genotypes here.
     default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
 
-    a = root.array(
+    a = root.create_dataset(
         "sample_id",
         data=bed.iid,
         shape=bed.iid.shape,
@@ -99,7 +99,7 @@ def convert(
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
-    a = root.array(
+    a = root.create_dataset(
         "variant_position",
         data=bed.bp_position,
         shape=bed.bp_position.shape,
@@ -111,13 +111,13 @@ def convert(
     logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.array(
+    a = root.create_dataset(
         "variant_allele",
         data=alleles,
         shape=alleles.shape,
         dtype="str",
         compressor=default_compressor,
-        chunks=(variants_chunk_size,),
+        chunks=(variants_chunk_size, alleles.shape[1]),
     )
     a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
     logger.debug("encoded variant_allele")
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
index 3094b233..e9bdbd01 100644
--- a/bio2zarr/vcf2zarr/vcz.py
+++ b/bio2zarr/vcf2zarr/vcz.py
@@ -13,7 +13,7 @@
 import numpy as np
 import zarr
 
-from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS
+from bio2zarr.zarr_utils import ZARR_FORMAT_KWARGS, zarr_v3
 
 from .. import constants, core, provenance
 from . import icf
@@ -572,7 +572,7 @@ def init(
     def encode_samples(self, root):
         if self.schema.samples != self.icf.metadata.samples:
             raise ValueError("Subsetting or reordering samples not supported currently")
-        array = root.array(
+        array = root.create_dataset(
             "sample_id",
             data=[sample.id for sample in self.schema.samples],
             shape=len(self.schema.samples),
@@ -584,7 +584,7 @@ def encode_samples(self, root):
         logger.debug("Samples done")
 
     def encode_contig_id(self, root):
-        array = root.array(
+        array = root.create_dataset(
             "contig_id",
             data=[contig.id for contig in self.schema.contigs],
             shape=len(self.schema.contigs),
@@ -593,7 +593,7 @@ def encode_contig_id(self, root):
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if all(contig.length is not None for contig in self.schema.contigs):
-            array = root.array(
+            array = root.create_dataset(
                 "contig_length",
                 data=[contig.length for contig in self.schema.contigs],
                 shape=len(self.schema.contigs),
@@ -605,7 +605,7 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
-        array = root.array(
+        array = root.create_dataset(
             "filter_id",
             data=[filt.id for filt in self.schema.filters],
             shape=len(self.schema.filters),
@@ -615,9 +615,17 @@ def encode_filter_id(self, root):
         array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]
 
     def init_array(self, root, array_spec, variants_dim_size):
-        object_codec = None
+        kwargs = dict(ZARR_FORMAT_KWARGS)
+        filters = [numcodecs.get_codec(filt) for filt in array_spec.filters]
         if array_spec.dtype == "O":
-            object_codec = numcodecs.VLenUTF8()
+            if zarr_v3():
+                filters = [*list(filters), numcodecs.VLenUTF8()]
+            else:
+                kwargs["object_codec"] = numcodecs.VLenUTF8()
+
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
+
         shape = list(array_spec.shape)
         # Truncate the variants dimension is max_variant_chunks was specified
         shape[0] = variants_dim_size
@@ -627,10 +635,8 @@ def init_array(self, root, array_spec, variants_dim_size):
             chunks=array_spec.chunks,
             dtype=array_spec.dtype,
             compressor=numcodecs.get_codec(array_spec.compressor),
-            filters=[numcodecs.get_codec(filt) for filt in array_spec.filters],
-            object_codec=object_codec,
-            dimension_separator=self.metadata.dimension_separator,
-            **ZARR_FORMAT_KWARGS,
+            filters=filters,
+            **kwargs,
         )
         a.attrs.update(
             {
@@ -946,13 +952,16 @@ def create_index(self):
                 c_start_idx = c_end_idx + 1
 
         index = np.array(index, dtype=np.int32)
-        array = root.array(
+        kwargs = {}
+        if not zarr_v3():
+            kwargs["dimension_separator"] = self.metadata.dimension_separator
+        array = root.create_dataset(
             "region_index",
             data=index,
             shape=index.shape,
             dtype=index.dtype,
             compressor=numcodecs.Blosc("zstd", clevel=9, shuffle=0),
-            dimension_separator=self.metadata.dimension_separator,
+            **kwargs,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = [
             "region_index_values",
diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py
index 7d1d91c1..c62cf84e 100644
--- a/bio2zarr/vcf2zarr/verification.py
+++ b/bio2zarr/vcf2zarr/verification.py
@@ -77,7 +77,7 @@ def assert_info_val_equal(vcf_val, zarr_val, vcf_type):
     if vcf_type in ("String", "Character"):
         split = list(vcf_val.split(","))
         k = len(split)
-        if isinstance(zarr_val, str):
+        if isinstance(zarr_val, str) or zarr_val.ndim == 0:
             assert k == 1
             # Scalar
             assert vcf_val == zarr_val
diff --git a/tests/test_vcz.py b/tests/test_vcz.py
index 6787461f..d7be275b 100644
--- a/tests/test_vcz.py
+++ b/tests/test_vcz.py
@@ -9,6 +9,7 @@
 from bio2zarr import core, vcf2zarr
 from bio2zarr.vcf2zarr import icf as icf_mod
 from bio2zarr.vcf2zarr import vcz as vcz_mod
+from bio2zarr.zarr_utils import zarr_v3
 
 
 @pytest.fixture(scope="module")
@@ -112,6 +113,9 @@ def test_encode_metadata_mismatch(self, tmpdir, icf_path, version):
             vcz_mod.VcfZarrWriterMetadata.fromdict(d)
 
 
+@pytest.mark.skipif(
+    zarr_v3(), reason="Zarr-python v3 does not support dimension_separator"
+)
 class TestEncodeDimensionSeparator:
     @pytest.mark.parametrize("dimension_separator", [None, "/"])
     def test_directories(self, tmp_path, icf_path, dimension_separator):

From d8f72c95f2437825a69edc448615f6368f95c40c Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Tue, 7 Jan 2025 12:18:41 +0000
Subject: [PATCH 2/5] Add `first_dim_iter` utility to workaround slow array
 iteration in zarr-python v3

---
 bio2zarr/vcf2zarr/verification.py | 12 +++++++++---
 bio2zarr/zarr_utils.py            |  6 ++++++
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/bio2zarr/vcf2zarr/verification.py b/bio2zarr/vcf2zarr/verification.py
index c62cf84e..d7a5291e 100644
--- a/bio2zarr/vcf2zarr/verification.py
+++ b/bio2zarr/vcf2zarr/verification.py
@@ -4,6 +4,8 @@
 import tqdm
 import zarr
 
+from bio2zarr.zarr_utils import first_dim_iter
+
 from .. import constants
 
 
@@ -152,7 +154,7 @@ def verify(vcf_path, zarr_path, show_progress=False):
     vid = root["variant_id"][:]
     call_genotype = None
     if "call_genotype" in root and root["call_genotype"].size > 0:
-        call_genotype = iter(root["call_genotype"])
+        call_genotype = first_dim_iter(root["call_genotype"])
 
     vcf = cyvcf2.VCF(vcf_path)
     format_headers = {}
@@ -170,12 +172,16 @@ def verify(vcf_path, zarr_path, show_progress=False):
             vcf_name = colname.split("_", 1)[1]
             vcf_type = format_headers[vcf_name]["Type"]
             vcf_number = format_headers[vcf_name]["Number"]
-            format_fields[vcf_name] = vcf_type, vcf_number, iter(root[colname])
+            format_fields[vcf_name] = (
+                vcf_type,
+                vcf_number,
+                first_dim_iter(root[colname]),
+            )
         if colname.startswith("variant"):
             name = colname.split("_", 1)[1]
             if name.isupper():
                 vcf_type = info_headers[name]["Type"]
-                info_fields[name] = vcf_type, iter(root[colname])
+                info_fields[name] = vcf_type, first_dim_iter(root[colname])
 
     first_pos = next(vcf).POS
     start_index = np.searchsorted(pos, first_pos)
diff --git a/bio2zarr/zarr_utils.py b/bio2zarr/zarr_utils.py
index 16d36e02..11c6b374 100644
--- a/bio2zarr/zarr_utils.py
+++ b/bio2zarr/zarr_utils.py
@@ -11,3 +11,9 @@ def zarr_v3() -> bool:
     ZARR_FORMAT_KWARGS = dict(zarr_format=2)
 else:
     ZARR_FORMAT_KWARGS = dict()
+
+
+# See discussion in https://github.com/zarr-developers/zarr-python/issues/2529
+def first_dim_iter(z):
+    for chunk in range(z.cdata_shape[0]):
+        yield from z.blocks[chunk]

From 4a6ed6b4534cacc01dfe76c9509bb3f573d4bc83 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Wed, 8 Jan 2025 13:40:39 +0000
Subject: [PATCH 3/5] Go back to `Group.array()` following
 https://github.com/zarr-developers/zarr-python/issues/2667

---
 bio2zarr/plink.py        |  6 +++---
 bio2zarr/vcf2zarr/vcz.py | 10 +++++-----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
index 3aa9bcf0..11e45b2d 100644
--- a/bio2zarr/plink.py
+++ b/bio2zarr/plink.py
@@ -86,7 +86,7 @@ def convert(
     # we're not using the best Blosc settings for genotypes here.
     default_compressor = numcodecs.Blosc(cname="zstd", clevel=7)
 
-    a = root.create_dataset(
+    a = root.array(
         "sample_id",
         data=bed.iid,
         shape=bed.iid.shape,
@@ -99,7 +99,7 @@ def convert(
 
     # TODO encode these in slices - but read them in one go to avoid
     # fetching repeatedly from bim file
-    a = root.create_dataset(
+    a = root.array(
         "variant_position",
         data=bed.bp_position,
         shape=bed.bp_position.shape,
@@ -111,7 +111,7 @@ def convert(
     logger.debug("encoded variant_position")
 
     alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
-    a = root.create_dataset(
+    a = root.array(
         "variant_allele",
         data=alleles,
         shape=alleles.shape,
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
index e9bdbd01..53b25924 100644
--- a/bio2zarr/vcf2zarr/vcz.py
+++ b/bio2zarr/vcf2zarr/vcz.py
@@ -572,7 +572,7 @@ def init(
     def encode_samples(self, root):
         if self.schema.samples != self.icf.metadata.samples:
             raise ValueError("Subsetting or reordering samples not supported currently")
-        array = root.create_dataset(
+        array = root.array(
             "sample_id",
             data=[sample.id for sample in self.schema.samples],
             shape=len(self.schema.samples),
@@ -584,7 +584,7 @@ def encode_samples(self, root):
         logger.debug("Samples done")
 
     def encode_contig_id(self, root):
-        array = root.create_dataset(
+        array = root.array(
             "contig_id",
             data=[contig.id for contig in self.schema.contigs],
             shape=len(self.schema.contigs),
@@ -593,7 +593,7 @@ def encode_contig_id(self, root):
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
         if all(contig.length is not None for contig in self.schema.contigs):
-            array = root.create_dataset(
+            array = root.array(
                 "contig_length",
                 data=[contig.length for contig in self.schema.contigs],
                 shape=len(self.schema.contigs),
@@ -605,7 +605,7 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
-        array = root.create_dataset(
+        array = root.array(
             "filter_id",
             data=[filt.id for filt in self.schema.filters],
             shape=len(self.schema.filters),
@@ -955,7 +955,7 @@ def create_index(self):
         kwargs = {}
         if not zarr_v3():
             kwargs["dimension_separator"] = self.metadata.dimension_separator
-        array = root.create_dataset(
+        array = root.array(
             "region_index",
             data=index,
             shape=index.shape,

From fa4640f804863cf29d141389f0ab7898fe9673cb Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Mon, 13 Jan 2025 11:39:05 +0000
Subject: [PATCH 4/5] Use released version of Zarr Python 3

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4c405b9d..4b581f3b 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -138,7 +138,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        zarr: ["zarr==2.18.3", "git+https://github.com/zarr-developers/zarr-python.git"]
+        zarr: ["==2.18.3", ">=3"]
     steps:
       - uses: actions/checkout@v4
       - uses: actions/setup-python@v5
@@ -148,9 +148,9 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           python -m pip install '.[dev]'
-      - name: Install ${{ matrix.zarr }}
+      - name: Install zarr${{ matrix.zarr }}
         run: |
-          python -m pip install --pre '${{ matrix.zarr }}'
+          python -m pip install 'zarr${{ matrix.zarr }}'
       - name: Run tests
         run: |
           python -m pytest 

From b82fdc4d64b50401cae5995ea10c2bb2d84db3b0 Mon Sep 17 00:00:00 2001
From: Tom White <tom.e.white@gmail.com>
Date: Mon, 13 Jan 2025 16:46:33 +0000
Subject: [PATCH 5/5] Temporarily exclude test_double_encode_partition for Zarr
 v3 testing

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4b581f3b..fcfea19e 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -153,4 +153,4 @@ jobs:
           python -m pip install 'zarr${{ matrix.zarr }}'
       - name: Run tests
         run: |
-          python -m pytest 
+          python -m pytest -k "not test_double_encode_partition"