Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/releases.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ This release moves the `ObjectStoreRegistry` to a separate package `obspec_utils

### New Features

- Improved `ZarrParser` performance.
([#892](https://github.com/zarr-developers/VirtualiZarr/pull/892)).
By [Raphael Hagen](https://github.com/norlandrhagen).

- Added `reader_factory` parameter to `HDFParser` to allow customizing how files are read
([#844](https://github.com/zarr-developers/VirtualiZarr/pull/844)).
By [Max Jones](https://github.com/maxrjones).
Expand Down
11 changes: 9 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ hdf = [
"imagecodecs-numcodecs==2024.6.1",
]

zarr = ["arro3-core", "pyarrow"]

# kerchunk-based parsers
netcdf3 = [
"virtualizarr[remote]",
Expand All @@ -76,12 +78,14 @@ all_parsers = [
"virtualizarr[fits]",
"virtualizarr[kerchunk_parquet]",
"virtualizarr[tiff]",
"virtualizarr[zarr]"
]

# writers
icechunk = [
"icechunk>=1.1.2",
]

kerchunk = ["fastparquet"]

all_writers = [
Expand Down Expand Up @@ -201,14 +205,17 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
# Inherit from min-deps to get all the test commands, along with optional dependencies
test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]

[tool.pixi.dependencies]
pytest = "*"

# Define commands to run within the docs environment
[tool.pixi.feature.docs.tasks]
serve-docs = { cmd = "mkdocs serve" }
Expand Down
22 changes: 14 additions & 8 deletions virtualizarr/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,21 +192,27 @@ def to_kerchunk(

return None
elif format == "parquet":
import pandas as pd
from kerchunk.df import refs_to_dataframe

if isinstance(filepath, Path):
url = str(filepath)
elif isinstance(filepath, str):
url = filepath

# refs_to_dataframe is responsible for writing to parquet.
# at no point does it create a full in-memory dataframe.
refs_to_dataframe(
refs,
url=url,
record_size=record_size,
categorical_threshold=categorical_threshold,
)
# The zarr-parser performance update PR #892 adds pyarrow and arro3-core as deps.
# These break the `kerchunk` refs_to_dataframe behavior.
# It seems like pyarrow makes pandas default to an ArrowStringArray
# which fastparquet cannot zero-copy encode.
# TODO: remove once fastparquet or kerchunk handle ArrowStringArray.

with pd.option_context("future.infer_string", False):
refs_to_dataframe(
refs,
url=url,
record_size=record_size,
categorical_threshold=categorical_threshold,
)
return None
else:
raise ValueError(f"Unrecognized output format: {format}")
Expand Down
56 changes: 55 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from collections.abc import (
Callable,
Expand All @@ -8,13 +10,16 @@
ValuesView,
)
from pathlib import PosixPath
from typing import Any, NewType, TypedDict, cast
from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast

import numpy as np

from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
from virtualizarr.types import ChunkKey

if TYPE_CHECKING:
import pyarrow as pa # type: ignore[import-untyped,import-not-found]

# doesn't guarantee that writers actually handle these
VALID_URI_PREFIXES = {
"s3://",
Expand Down Expand Up @@ -322,6 +327,55 @@ def from_arrays(

return obj

@classmethod
def _from_arrow(
    cls,
    *,
    paths: "pa.StringArray",
    offsets: "pa.UInt64Array",
    lengths: "pa.UInt64Array",
    shape: tuple[int, ...],
) -> "ChunkManifest":
    """
    Build a ChunkManifest directly from flat 1D PyArrow arrays.

    Converts the Arrow columns straight into the numpy arrays that
    ChunkManifest stores internally, skipping any intermediate Python
    dict representation.

    Parameters
    ----------
    paths
        Full chunk paths as a PyArrow StringArray; null entries mark missing chunks.
    offsets
        Chunk byte offsets as a PyArrow UInt64Array; null entries mark missing chunks.
    lengths
        Chunk byte lengths as a PyArrow UInt64Array; null entries mark missing chunks.
    shape
        Chunk-grid shape that the flat arrays are reshaped into.
    """
    import pyarrow as pa  # type: ignore[import-untyped,import-not-found]
    import pyarrow.compute as pc  # type: ignore[import-untyped,import-not-found]

    # Nulls denote missing chunks: substitute the sentinel values the
    # internal representation expects ("" for paths, 0 for offsets/lengths).
    zero_u64 = pa.scalar(0, pa.uint64())
    filled_paths = pc.if_else(pc.is_null(paths), "", paths)
    filled_offsets = pc.if_else(pc.is_null(offsets), zero_u64, offsets)
    filled_lengths = pc.if_else(pc.is_null(lengths), zero_u64, lengths)

    # zero_copy_only=False: the null-fill above may yield arrays that
    # cannot be exposed to numpy without a copy.
    path_array = filled_paths.to_numpy(zero_copy_only=False).astype(
        np.dtypes.StringDType()
    )
    offset_array = filled_offsets.to_numpy(zero_copy_only=False)
    length_array = filled_lengths.to_numpy(zero_copy_only=False)

    return cls.from_arrays(
        paths=path_array.reshape(shape),
        offsets=offset_array.reshape(shape),
        lengths=length_array.reshape(shape),
    )

@property
def ndim_chunk_grid(self) -> int:
"""
Expand Down
Loading
Loading