Skip to content
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
1fa42d9
add default compressor to config
brokkoli71 Nov 6, 2024
02053e9
modify _default_compressor to _default_filters_and_compressor
brokkoli71 Nov 6, 2024
6ac38ea
fix test_metadata_to_dict
brokkoli71 Nov 6, 2024
9507e19
wip debugging
brokkoli71 Nov 6, 2024
3727b4a
Merge branch 'master' into default-compressor
brokkoli71 Nov 13, 2024
f93ced2
format
brokkoli71 Nov 13, 2024
07590ca
fix v2 decode string dtype
brokkoli71 Nov 13, 2024
4e2a3bc
fix config default tests
brokkoli71 Nov 13, 2024
0fc7b23
format
brokkoli71 Nov 13, 2024
35849c7
Merge branch 'main' into default-compressor
brokkoli71 Nov 17, 2024
8ec16e8
Update src/zarr/codecs/_v2.py
normanrz Dec 6, 2024
d6dc146
rename v2_dtype_kind_to_default_filters_and_compressor to v2_default_…
brokkoli71 Dec 11, 2024
78ab221
merge main into default-compressor
brokkoli71 Dec 11, 2024
15577ae
recover test_v2.py
brokkoli71 Dec 11, 2024
67010ce
incorporate feedback
brokkoli71 Dec 11, 2024
f6b98c3
incorporate feedback
brokkoli71 Dec 11, 2024
fcbae8b
fix mypy
brokkoli71 Dec 11, 2024
75a858d
Merge remote-tracking branch 'origin/default-compressor' into default…
brokkoli71 Dec 11, 2024
a77fb0d
allow only one default compressor
brokkoli71 Dec 11, 2024
d11bf30
Merge remote-tracking branch 'refs/remotes/upstream/main' into defaul…
brokkoli71 Dec 14, 2024
876e67d
put `v2_default_compressor` under `array`
brokkoli71 Dec 14, 2024
12dfaf4
deprecate zarr.storage.default_compressor
brokkoli71 Dec 14, 2024
6954b60
test v3_default_codecs
brokkoli71 Dec 14, 2024
80dfc40
use v3_default_codecs
brokkoli71 Dec 14, 2024
6001e93
fix tests that expected codecs==["bytes"]
brokkoli71 Dec 14, 2024
ff76617
fix test_default_codecs
brokkoli71 Dec 14, 2024
f04e0e6
fail-fast: false
brokkoli71 Dec 14, 2024
f63bb67
fix string codecs for np1.25
brokkoli71 Dec 14, 2024
00e241e
format
brokkoli71 Dec 14, 2024
58406c8
add docstrings to create in asynchronous.py and array.py
brokkoli71 Dec 18, 2024
fc09989
add docstrings to creation in group.py
brokkoli71 Dec 18, 2024
eed4427
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
c62aff5
Apply suggestions from code review
brokkoli71 Dec 18, 2024
48c7448
apply suggestions from review
brokkoli71 Dec 18, 2024
083c4cb
correct code double backticks
brokkoli71 Dec 18, 2024
500bc7b
correct attribute links in docstring
brokkoli71 Dec 18, 2024
cdf5542
link zarr.core.config in docstrings
brokkoli71 Dec 18, 2024
43307b3
Merge branch 'main' into default-compressor
brokkoli71 Dec 18, 2024
390c435
improve docstring readability
brokkoli71 Dec 18, 2024
35e35c4
correct config docstring
brokkoli71 Dec 18, 2024
92de85c
correct config docstring
brokkoli71 Dec 18, 2024
6fd3f25
improve config docstring
brokkoli71 Dec 18, 2024
ea228ca
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
3933c05
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
9ac82d1
Merge branch 'main' into default-compressor
normanrz Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 15 additions & 4 deletions src/zarr/api/asynchronous.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@
ChunkCoords,
MemoryOrder,
ZarrFormat,
parse_dtype,
)
from zarr.core.config import config
from zarr.core.group import AsyncGroup, ConsolidatedMetadata, GroupMetadata
from zarr.core.metadata import ArrayMetadataDict, ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v2 import _default_filters_and_compressor
from zarr.errors import NodeTypeValidationError
from zarr.storage import (
StoreLike,
Expand Down Expand Up @@ -815,7 +817,12 @@ async def create(
dtype : str or dtype, optional
NumPy dtype.
compressor : Codec, optional
Primary compressor.
Primary compressor for `zarr_format=2`.
If neither `compressor` nor `filters` are provided, a default compressor will be used:
- For numeric arrays, the default is `ZstdCodec`.
- For Unicode strings, the default is `VLenUTF8Codec`.
- For bytes or objects, the default is `VLenBytesCodec`.
These defaults can be changed using the `v2_default_compressor` variable in the Zarr config.
fill_value : object
Default value to use for uninitialized portions of the array.
order : {'C', 'F'}, optional
Expand Down Expand Up @@ -885,9 +892,13 @@ async def create(
or _default_zarr_version()
)

if zarr_format == 2 and chunks is None:
chunks = shape
elif zarr_format == 3 and chunk_shape is None:
if zarr_format == 2:
if chunks is None:
chunks = shape
dtype = parse_dtype(dtype, zarr_format)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype)
elif zarr_format == 3 and chunk_shape is None: # type: ignore[redundant-expr]
if chunks is not None:
chunk_shape = chunks
chunks = None
Expand Down
18 changes: 0 additions & 18 deletions src/zarr/codecs/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any

if TYPE_CHECKING:
import numpy as np

from zarr.codecs.blosc import BloscCname, BloscCodec, BloscShuffle
from zarr.codecs.bytes import BytesCodec, Endian
from zarr.codecs.crc32c_ import Crc32cCodec
Expand All @@ -13,7 +8,6 @@
from zarr.codecs.transpose import TransposeCodec
from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec
from zarr.codecs.zstd import ZstdCodec
from zarr.core.metadata.v3 import DataType

__all__ = [
"BloscCname",
Expand All @@ -30,15 +24,3 @@
"VLenUTF8Codec",
"ZstdCodec",
]


def _get_default_array_bytes_codec(
    np_dtype: np.dtype[Any],
) -> BytesCodec | VLenUTF8Codec | VLenBytesCodec:
    """Choose the default array-to-bytes codec for a NumPy dtype.

    The dtype is first translated with ``DataType.from_numpy``. String data
    gets a ``VLenUTF8Codec``, bytes data gets a ``VLenBytesCodec``, and every
    other data type gets a plain ``BytesCodec``.

    Parameters
    ----------
    np_dtype : np.dtype
        The NumPy dtype of the array being created.

    Returns
    -------
    BytesCodec | VLenUTF8Codec | VLenBytesCodec
        A newly constructed codec instance with default settings.
    """
    data_type = DataType.from_numpy(np_dtype)

    # Variable-length data types need their dedicated codecs.
    if data_type == DataType.string:
        return VLenUTF8Codec()
    if data_type == DataType.bytes:
        return VLenBytesCodec()

    # Everything else is serialized with the generic bytes codec.
    return BytesCodec()
13 changes: 12 additions & 1 deletion src/zarr/codecs/_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import TYPE_CHECKING

import numcodecs
import numpy as np
from numcodecs.compat import ensure_bytes, ensure_ndarray_like

from zarr.abc.codec import ArrayBytesCodec
Expand Down Expand Up @@ -46,7 +47,17 @@ async def _decode_single(
# special case object dtype, because incorrect handling can lead to
# segfaults and other bad things happening
if chunk_spec.dtype != object:
chunk = chunk.view(chunk_spec.dtype)
try:
chunk = chunk.view(chunk_spec.dtype)
except TypeError:
# this will happen if the dtype of the chunk
# does not match the dtype of the array spec, e.g. if
# the dtype of the chunk_spec is a string dtype, but the chunk
# is an object array. In this case, we need to convert the object
# array to the correct dtype.

chunk = np.array(chunk).astype(chunk_spec.dtype)

elif chunk.dtype != object:
# If we end up here, someone must have hacked around with the filters.
# We cannot deal with object arrays unless there is an object
Expand Down
41 changes: 26 additions & 15 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@

from zarr._compat import _deprecate_positional_args
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Codec
from zarr.core._info import ArrayInfo
from zarr.core.attributes import Attributes
Expand Down Expand Up @@ -77,7 +76,8 @@
ArrayV3MetadataDict,
T_ArrayMetadata,
)
from zarr.core.metadata.v3 import parse_node_type_array
from zarr.core.metadata.v2 import _default_filters_and_compressor
from zarr.core.metadata.v3 import DataType, parse_node_type_array
from zarr.core.sync import sync
from zarr.errors import MetadataValidationError
from zarr.registry import get_pipeline_class
Expand Down Expand Up @@ -493,14 +493,6 @@ async def create(
order=order,
)
elif zarr_format == 2:
if dtype is str or dtype == "str":
# another special case: zarr v2 added the vlen-utf8 codec
vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"}
if filters and not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [vlen_codec]
else:
filters = [vlen_codec]

if codecs is not None:
raise ValueError(
"codecs cannot be used for arrays with version 2. Use filters and compressor instead."
Expand Down Expand Up @@ -563,11 +555,7 @@ async def _create_v3(
await ensure_no_existing_node(store_path, zarr_format=3)

shape = parse_shapelike(shape)
codecs = (
list(codecs)
if codecs is not None
else [_get_default_array_bytes_codec(np.dtype(dtype))]
)
codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype))

if chunk_key_encoding is None:
chunk_key_encoding = ("default", "/")
Expand Down Expand Up @@ -625,6 +613,14 @@ async def _create_v2(
if dimension_separator is None:
dimension_separator = "."

dtype = parse_dtype(dtype, 2)
if not filters and not compressor:
filters, compressor = _default_filters_and_compressor(dtype)
if np.issubdtype(dtype, np.str_):
filters = filters or []
if not any(x["id"] == "vlen-utf8" for x in filters):
filters = list(filters) + [{"id": "vlen-utf8"}]

metadata = ArrayV2Metadata(
shape=shape,
dtype=np.dtype(dtype),
Expand Down Expand Up @@ -3317,3 +3313,18 @@ def _build_parents(
)

return parents


def _get_default_codecs(
    np_dtype: np.dtype[Any],
) -> list[dict[str, JSON]]:
    """Build the default codec list for a NumPy dtype.

    The codec names are read from the ``array.v3_default_codecs`` config
    entry, which is keyed by ``"string"``, ``"bytes"`` and ``"numeric"``.
    The dtype is translated with ``DataType.from_numpy``; anything that is
    neither ``DataType.string`` nor ``DataType.bytes`` uses the
    ``"numeric"`` entry.

    Parameters
    ----------
    np_dtype : np.dtype
        The NumPy dtype of the array being created.

    Returns
    -------
    list[dict[str, JSON]]
        One ``{"name": ..., "configuration": {}}`` dict per configured
        codec name, in configured order.
    """
    codecs_by_category = config.get("array.v3_default_codecs")
    data_type = DataType.from_numpy(np_dtype)

    if data_type == DataType.string:
        category = "string"
    else:
        category = "bytes" if data_type == DataType.bytes else "numeric"

    codec_names = codecs_by_category[category]
    # Each codec is emitted with an empty configuration.
    return [{"name": name, "configuration": {}} for name in codec_names]
20 changes: 16 additions & 4 deletions src/zarr/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,30 @@ def reset(self) -> None:
# The config module is responsible for managing the configuration of zarr and is based on the Donfig python library.
# For selecting custom implementations of codecs, pipelines, buffers and ndbuffers, first register the implementations
# in the registry and then select them in the config.
# e.g. an implementation of the bytes codec in a class "NewBytesCodec", requires the value of codecs.bytes.name to be
# "NewBytesCodec".
# e.g. an implementation of the bytes codec in a class "your.module.NewBytesCodec", requires the value of codecs.bytes
# to be "your.module.NewBytesCodec".
# Donfig can be configured programmatically, by environment variables, or from YAML files in standard locations
# e.g. export ZARR_CODECS__BYTES__NAME="NewBytesCodec"
# e.g. export ZARR_CODECS__BYTES="your.module.NewBytesCodec"
# (for more information see github.com/pytroll/donfig)
# Default values below point to the standard implementations of zarr-python
config = Config(
"zarr",
defaults=[
{
"default_zarr_version": 3,
"array": {"order": "C"},
"array": {
"order": "C",
"v2_default_compressor": {
"numeric": "zstd",
"string": "vlen-utf8",
"bytes": "vlen-bytes",
},
"v3_default_codecs": {
"numeric": ["bytes", "zstd"],
"string": ["vlen-utf8"],
"bytes": ["vlen-bytes"],
},
},
"async": {"concurrency": 10, "timeout": None},
"threading": {"max_workers": None},
"json_indent": 2,
Expand Down
23 changes: 22 additions & 1 deletion src/zarr/core/metadata/v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from collections.abc import Iterable
from enum import Enum
from functools import cached_property
from typing import TYPE_CHECKING, TypedDict, cast
from typing import TYPE_CHECKING, Any, TypedDict, cast

from zarr.abc.metadata import Metadata

Expand Down Expand Up @@ -71,6 +71,7 @@ def __init__(
shape_parsed = parse_shapelike(shape)
dtype_parsed = parse_dtype(dtype)
chunks_parsed = parse_shapelike(chunks)

compressor_parsed = parse_compressor(compressor)
order_parsed = parse_indexing_order(order)
dimension_separator_parsed = parse_separator(dimension_separator)
Expand Down Expand Up @@ -326,3 +327,23 @@ def _default_fill_value(dtype: np.dtype[Any]) -> Any:
return ""
else:
return dtype.type(0)


def _default_filters_and_compressor(
    dtype: np.dtype[Any],
) -> tuple[list[dict[str, JSON]], dict[str, JSON] | None]:
    """Return the default ``(filters, compressor)`` pair for a dtype.

    The codec id is looked up in the ``array.v2_default_compressor`` config
    entry under one of three keys chosen from ``dtype.kind``:

    - ``"numeric"`` for kinds in ``"biufcmM"``
    - ``"string"`` for kind ``"U"``
    - ``"bytes"`` for kinds in ``"OSV"``

    See https://numpy.org/doc/2.1/reference/generated/numpy.dtype.kind.html
    for the meaning of the kind characters.

    Returns
    -------
    tuple
        A one-element filter list ``[{"id": <configured codec id>}]`` and
        ``None`` for the compressor.

    Raises
    ------
    ValueError
        If ``dtype.kind`` does not fall into any of the groups above.
    """
    configured_ids = config.get("array.v2_default_compressor")
    kind = dtype.kind

    # Checked in order; the first group containing the kind wins.
    kind_groups = (
        ("biufcmM", "numeric"),
        ("U", "string"),
        ("OSV", "bytes"),
    )
    for kinds, config_key in kind_groups:
        if kind in kinds:
            # The configured codec is returned as the only filter; the
            # compressor slot is left empty.
            return [{"id": configured_ids[config_key]}], None

    raise ValueError(f"Unsupported dtype kind {dtype.kind}")
5 changes: 5 additions & 0 deletions src/zarr/core/metadata/v3.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
)
from zarr.core.config import config
from zarr.core.metadata.common import parse_attributes
from zarr.core.strings import _NUMPY_SUPPORTS_VLEN_STRING
from zarr.core.strings import _STRING_DTYPE as STRING_NP_DTYPE
from zarr.errors import MetadataValidationError, NodeTypeValidationError
from zarr.registry import get_codec_class
Expand Down Expand Up @@ -606,6 +607,10 @@ def from_numpy(cls, dtype: np.dtype[Any]) -> DataType:
return DataType.string
elif dtype.kind == "S":
return DataType.bytes
elif not _NUMPY_SUPPORTS_VLEN_STRING and dtype.kind == "O":
# numpy < 2.0 does not support vlen string dtype
# so we fall back on object array of strings
return DataType.string
dtype_to_data_type = {
"|b1": "bool",
"bool": "bool",
Expand Down
22 changes: 22 additions & 0 deletions src/zarr/storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
import sys
import warnings
from types import ModuleType
from typing import Any

from zarr.storage.common import StoreLike, StorePath, make_store_path
from zarr.storage.local import LocalStore
from zarr.storage.logging import LoggingStore
Expand All @@ -17,3 +22,20 @@
"ZipStore",
"make_store_path",
]


class VerboseModule(ModuleType):
    """Module type that flags assignments to the deprecated ``default_compressor``.

    Assigning ``default_compressor`` on a module of this type emits a
    ``DeprecationWarning`` and does not store the value. Every other
    attribute assignment behaves like a normal module attribute assignment.
    """

    def __setattr__(self, attr: str, value: Any) -> None:
        """Set a module attribute, warning instead for ``default_compressor``.

        Parameters
        ----------
        attr : str
            Name of the attribute being assigned.
        value : Any
            Value being assigned. Ignored when ``attr`` is
            ``"default_compressor"``.
        """
        if attr == "default_compressor":
            warnings.warn(
                "setting zarr.storage.default_compressor is deprecated, use "
                "zarr.config to configure array.v2_default_compressor "
                "e.g. config.set({'codecs.zstd':'your.module.Zstd', 'array.v2_default_compressor.numeric': 'zstd'})",
                DeprecationWarning,
                # stacklevel=2 attributes the warning to the code performing
                # the assignment rather than to this method, so the reported
                # location is the line the user needs to change.
                stacklevel=2,
            )
        else:
            super().__setattr__(attr, value)


sys.modules[__name__].__class__ = VerboseModule
22 changes: 15 additions & 7 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
from itertools import accumulate
from typing import Any, Literal

import numcodecs
import numpy as np
import pytest
from numcodecs import Zstd

import zarr.api.asynchronous
from zarr import Array, AsyncArray, Group
from zarr.codecs import BytesCodec, VLenBytesCodec
from zarr.codecs import BytesCodec, VLenBytesCodec, ZstdCodec
from zarr.core._info import ArrayInfo
from zarr.core.array import chunks_initialized
from zarr.core.buffer import default_buffer_prototype
Expand Down Expand Up @@ -374,7 +376,7 @@ async def test_chunks_initialized() -> None:


def test_nbytes_stored() -> None:
arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4")
arr = zarr.create(shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()])
result = arr.nbytes_stored()
assert result == 366 # the size of the metadata document. This is a fragile test.
arr[:50] = 1
Expand All @@ -386,7 +388,9 @@ def test_nbytes_stored() -> None:


async def test_nbytes_stored_async() -> None:
arr = await zarr.api.asynchronous.create(shape=(100,), chunks=(10,), dtype="i4")
arr = await zarr.api.asynchronous.create(
shape=(100,), chunks=(10,), dtype="i4", codecs=[BytesCodec()]
)
result = await arr.nbytes_stored()
assert result == 366 # the size of the metadata document. This is a fragile test.
await arr.setitem(slice(50), 1)
Expand Down Expand Up @@ -456,6 +460,7 @@ def test_info_v2(self) -> None:
_read_only=False,
_store_type="MemoryStore",
_count_bytes=128,
_filters=(numcodecs.Zstd(),),
)
assert result == expected

Expand All @@ -470,13 +475,13 @@ def test_info_v3(self) -> None:
_order="C",
_read_only=False,
_store_type="MemoryStore",
_codecs=[BytesCodec()],
_codecs=[BytesCodec(), ZstdCodec()],
_count_bytes=128,
)
assert result == expected

def test_info_complete(self) -> None:
arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3)
arr = zarr.create(shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()])
result = arr.info_complete()
expected = ArrayInfo(
_zarr_format=3,
Expand Down Expand Up @@ -511,6 +516,7 @@ async def test_info_v2_async(self) -> None:
_order="C",
_read_only=False,
_store_type="MemoryStore",
_filters=(Zstd(level=0),),
_count_bytes=128,
)
assert result == expected
Expand All @@ -526,13 +532,15 @@ async def test_info_v3_async(self) -> None:
_order="C",
_read_only=False,
_store_type="MemoryStore",
_codecs=[BytesCodec()],
_codecs=[BytesCodec(), ZstdCodec()],
_count_bytes=128,
)
assert result == expected

async def test_info_complete_async(self) -> None:
arr = await zarr.api.asynchronous.create(shape=(4, 4), chunks=(2, 2), zarr_format=3)
arr = await zarr.api.asynchronous.create(
shape=(4, 4), chunks=(2, 2), zarr_format=3, codecs=[BytesCodec()]
)
result = await arr.info_complete()
expected = ArrayInfo(
_zarr_format=3,
Expand Down
Loading
Loading