Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ This release moves the `ObjectStoreRegistry` to a separate package `obspec_utils

### New Features

- Improved `ZarrParser` performance.
([#892](https://github.com/zarr-developers/VirtualiZarr/pull/892)).
By [Raphael Hagen](https://github.com/norlandrhagen).

- Added `reader_factory` parameter to `HDFParser` to allow customizing how files are read
([#844](https://github.com/zarr-developers/VirtualiZarr/pull/844)).
By [Max Jones](https://github.com/maxrjones).
Expand Down
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ hdf = [
"imagecodecs-numcodecs==2024.6.1",
]

zarr = ["arro3-core", "pyarrow"]

# kerchunk-based parsers
netcdf3 = [
"virtualizarr[remote]",
Expand All @@ -76,12 +78,14 @@ all_parsers = [
"virtualizarr[fits]",
"virtualizarr[kerchunk_parquet]",
"virtualizarr[tiff]",
"virtualizarr[zarr]"
]

# writers
icechunk = [
"icechunk>=1.1.2",
]

kerchunk = ["fastparquet"]

all_writers = [
Expand Down Expand Up @@ -201,14 +205,17 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
# Inherit from min-deps to get all the test commands, along with optional dependencies
test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]

[tool.pixi.dependencies]
pytest = "*"

# Define commands to run within the docs environment
[tool.pixi.feature.docs.tasks]
serve-docs = { cmd = "mkdocs serve" }
Expand Down
22 changes: 14 additions & 8 deletions virtualizarr/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,27 @@ def to_kerchunk(

return None
elif format == "parquet":
import pandas as pd
from kerchunk.df import refs_to_dataframe

if isinstance(filepath, Path):
url = str(filepath)
elif isinstance(filepath, str):
url = filepath

# refs_to_dataframe is responsible for writing to parquet.
# at no point does it create a full in-memory dataframe.
refs_to_dataframe(
refs,
url=url,
record_size=record_size,
categorical_threshold=categorical_threshold,
)
# The zarr-parser performance update PR #892 adds pyarrow and arro3-core as deps.
# These break the `kerchunk` refs_to_dataframe behavior.
# It seems like pyarrow makes pandas default to an ArrowStringArray
# which fastparquet cannot zero-copy encode.
# TODO: remove once fastparquet or kerchunk handle ArrowStringArray.

with pd.option_context("future.infer_string", False):
refs_to_dataframe(
refs,
url=url,
record_size=record_size,
categorical_threshold=categorical_threshold,
)
return None
else:
raise ValueError(f"Unrecognized output format: {format}")
Expand Down
56 changes: 55 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from collections.abc import (
Callable,
Expand All @@ -8,13 +10,16 @@
ValuesView,
)
from pathlib import PosixPath
from typing import Any, NewType, TypedDict, cast
from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast

import numpy as np

from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
from virtualizarr.types import ChunkKey

if TYPE_CHECKING:
import pyarrow as pa # type: ignore[import-untyped,import-not-found]

# doesn't guarantee that writers actually handle these
VALID_URI_PREFIXES = {
"s3://",
Expand Down Expand Up @@ -322,6 +327,55 @@ def from_arrays(

return obj

@classmethod
def _from_arrow(
    cls,
    *,
    paths: "pa.StringArray",
    offsets: "pa.UInt64Array",
    lengths: "pa.UInt64Array",
    shape: tuple[int, ...],
) -> "ChunkManifest":
    """
    Build a ChunkManifest directly from flat 1D PyArrow arrays.

    Converts the Arrow columns straight into the numpy arrays that
    ChunkManifest stores internally, skipping any intermediate Python
    dict representation.

    Parameters
    ----------
    paths
        Full chunk paths as a PyArrow StringArray; null entries mark missing chunks.
    offsets
        Chunk byte offsets as a PyArrow UInt64Array; null entries mark missing chunks.
    lengths
        Chunk byte lengths as a PyArrow UInt64Array; null entries mark missing chunks.
    shape
        Chunk-grid shape that the flat arrays are reshaped into.
    """
    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
    import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]

    # Nulls denote missing chunks: substitute the sentinel values the
    # internal representation expects ("" for paths, 0 for offsets/lengths).
    zero_u64 = pa.scalar(0, pa.uint64())
    filled_paths = pc.if_else(pc.is_null(paths), "", paths)
    filled_offsets = pc.if_else(pc.is_null(offsets), zero_u64, offsets)
    filled_lengths = pc.if_else(pc.is_null(lengths), zero_u64, lengths)

    # zero_copy_only=False: the null-fill above may yield arrays that
    # cannot be exposed to numpy without a copy.
    path_array = filled_paths.to_numpy(zero_copy_only=False).astype(
        np.dtypes.StringDType()
    )
    offset_array = filled_offsets.to_numpy(zero_copy_only=False)
    length_array = filled_lengths.to_numpy(zero_copy_only=False)

    return cls.from_arrays(
        paths=path_array.reshape(shape),
        offsets=offset_array.reshape(shape),
        lengths=length_array.reshape(shape),
    )

@property
def ndim_chunk_grid(self) -> int:
"""
Expand Down
Loading
Loading