Merge branch 'main' into api-doc-struct

jhamman · web-flow · commit 877f5215865d · 2025-01-05T20:28:13.000-08:00
diff --git a/docs/user-guide/arrays.rst b/docs/user-guide/arrays.rst
@@ -574,8 +574,41 @@ Any combination of integer and slice can be used for block indexing::
 Sharding
 --------
 
-Coming soon.
-
+Using small chunk shapes in very large arrays can lead to a very large number of chunks.
+This can become a performance issue for file systems and object storage.
+With Zarr format 3, a new sharding feature has been added to address this issue.
+
+With sharding, multiple chunks can be stored in a single storage object (e.g. a file).
+Within a shard, chunks are compressed and serialized separately.
+This allows individual chunks to be read independently.
+However, when writing data, a full shard must be written in one go for optimal
+performance and to avoid concurrency issues.
+That means that shards are the units of writing and chunks are the units of reading.
+Users need to configure the chunk and shard shapes accordingly.
+
+Sharded arrays can be created by providing the ``shards`` parameter to :func:`zarr.create_array`.
+
+  >>> a = zarr.create_array('data/example-20.zarr', shape=(10000, 10000), shards=(1000, 1000), chunks=(100, 100), dtype='uint8')
+  >>> a[:] = (np.arange(10000 * 10000) % 256).astype('uint8').reshape(10000, 10000)
+  >>> a.info_complete()
+  Type               : Array
+  Zarr format        : 3
+  Data type          : DataType.uint8
+  Shape              : (10000, 10000)
+  Shard shape        : (1000, 1000)
+  Chunk shape        : (100, 100)
+  Order              : C
+  Read-only          : False
+  Store type         : LocalStore
+  Codecs             : [{'chunk_shape': (100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+  No. bytes          : 100000000 (95.4M)
+  No. bytes stored   : 3981060
+  Storage ratio      : 25.1
+  Chunks Initialized : 100
+
+In this example a shard shape of (1000, 1000) and a chunk shape of (100, 100) are used.
+This means that 10*10 chunks are stored in each shard, and there are 10*10 shards in total.
+Without the ``shards`` argument, there would be 10,000 chunks stored as individual files.
 
 Missing features in 3.0
 -----------------------
diff --git a/docs/user-guide/performance.rst b/docs/user-guide/performance.rst
@@ -62,6 +62,45 @@ will be one single chunk for the array::
    >>> z5.chunks
    (10000, 10000)
 
+
+Sharding
+~~~~~~~~
+
+If you have large arrays but need small chunks to efficiently access the data, you can
+use sharding. Sharding provides a mechanism to store multiple chunks in a single
+storage object or file. This can be useful because traditional file systems and object
+storage systems may have performance issues storing and accessing many files.
+Additionally, small files can be inefficient to store if they are smaller than the
+block size of the file system.
+
+Picking a good combination of chunk shape and shard shape is important for performance.
+The chunk shape determines what unit of your data can be read independently, while the
+shard shape determines what unit of your data can be written efficiently.
+
+For example, suppose you have a 100 GB array and need to read small chunks of 1 MB.
+Without sharding, each chunk would be one file, resulting in 100,000 files. That can
+already cause performance issues on some file systems.
+With sharding, you could use a shard size of 1 GB. This would result in 1000 chunks per
+file and 100 files in total, which seems manageable for most storage systems.
+You would still be able to read each 1 MB chunk independently, but you would need to
+write your data in 1 GB increments.
+
+To use sharding, you need to specify the ``shards`` parameter when creating the array.
+
+   >>> z6 = zarr.create_array(store={}, shape=(10000, 10000, 1000), shards=(1000, 1000, 1000), chunks=(100, 100, 100), dtype='uint8')
+   >>> z6.info
+   Type               : Array
+   Zarr format        : 3
+   Data type          : DataType.uint8
+   Shape              : (10000, 10000, 1000)
+   Shard shape        : (1000, 1000, 1000)
+   Chunk shape        : (100, 100, 100)
+   Order              : C
+   Read-only          : False
+   Store type         : MemoryStore
+   Codecs             : [{'chunk_shape': (100, 100, 100), 'codecs': ({'endian': <Endian.little: 'little'>}, {'level': 0, 'checksum': False}), 'index_codecs': ({'endian': <Endian.little: 'little'>}, {}), 'index_location': <ShardingCodecIndexLocation.end: 'end'>}]
+   No. bytes          : 100000000000 (93.1G)
+
 .. _user-guide-chunks-order:
 
 Chunk memory layout
diff --git a/docs/user-guide/v3_migration.rst b/docs/user-guide/v3_migration.rst
@@ -156,11 +156,11 @@ Dependencies
 When installing using ``pip``:
 
 - The new ``remote`` dependency group can be used to install a supported version of
-   ``fsspec``, required for remote data access.
+  ``fsspec``, required for remote data access.
 - The new ``gpu`` dependency group can be used to install a supported version of
-   ``cuda``, required for GPU functionality.
+  ``cuda``, required for GPU functionality.
 - The ``jupyter`` optional dependency group has been removed, since v3 contains no
-   jupyter specific functionality.
+  jupyter specific functionality.
 
 Miscellaneous
 ~~~~~~~~~~~~~
diff --git a/pyproject.toml b/pyproject.toml
@@ -397,7 +397,8 @@ ignore = [
 checks = [
     "GL06",
     "GL07",
-    "GL09",
+    # Currently broken; see https://github.com/numpy/numpydoc/issues/573
+    # "GL09",
     "GL10",
     "SS02",
     "SS04",
diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py
@@ -508,6 +508,10 @@ async def save_group(
 async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None = None) -> Any:
     """Provide a rich display of the hierarchy.
 
+    .. deprecated:: 3.0.0
+        `zarr.tree()` is deprecated and will be removed in a future release.
+        Use `group.tree()` instead.
+
     Parameters
     ----------
     grp : Group
@@ -521,10 +525,6 @@ async def tree(grp: AsyncGroup, expand: bool | None = None, level: int | None =
     -------
     TreeRepr
         A pretty-printable object displaying the hierarchy.
-
-    .. deprecated:: 3.0.0
-        `zarr.tree()` is deprecated and will be removed in a future release.
-        Use `group.tree()` instead.
     """
     return await grp.tree(expand=expand, level=level)
 
diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py
@@ -334,6 +334,10 @@ def save_group(
 def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> Any:
     """Provide a rich display of the hierarchy.
 
+    .. deprecated:: 3.0.0
+        `zarr.tree()` is deprecated and will be removed in a future release.
+        Use `group.tree()` instead.
+
     Parameters
     ----------
     grp : Group
@@ -347,10 +351,6 @@ def tree(grp: Group, expand: bool | None = None, level: int | None = None) -> An
     -------
     TreeRepr
         A pretty-printable object displaying the hierarchy.
-
-    .. deprecated:: 3.0.0
-        `zarr.tree()` is deprecated and will be removed in a future release.
-        Use `group.tree()` instead.
     """
     return sync(async_api.tree(grp._async_group, expand=expand, level=level))
 
diff --git a/src/zarr/core/_info.py b/src/zarr/core/_info.py
@@ -80,6 +80,7 @@ class ArrayInfo:
     _zarr_format: ZarrFormat
     _data_type: np.dtype[Any] | DataType
     _shape: tuple[int, ...]
+    _shard_shape: tuple[int, ...] | None = None
     _chunk_shape: tuple[int, ...] | None = None
     _order: Literal["C", "F"]
     _read_only: bool
@@ -96,7 +97,13 @@ def __repr__(self) -> str:
         Type               : {_type}
         Zarr format        : {_zarr_format}
         Data type          : {_data_type}
-        Shape              : {_shape}
+        Shape              : {_shape}""")
+
+        if self._shard_shape is not None:
+            template += textwrap.dedent("""
+        Shard shape        : {_shard_shape}""")
+
+        template += textwrap.dedent("""
         Chunk shape        : {_chunk_shape}
         Order              : {_order}
         Read-only          : {_read_only}
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
@@ -432,6 +432,9 @@ async def create(
     ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
         """Method to create a new asynchronous array instance.
 
+        .. deprecated:: 3.0.0
+            Deprecated in favor of :func:`zarr.api.asynchronous.create_array`.
+
         Parameters
         ----------
         store : StoreLike
@@ -509,9 +512,6 @@ async def create(
         -------
         AsyncArray
             The created asynchronous array instance.
-
-        .. deprecated:: 3.0.0
-            Deprecated in favor of :func:`zarr.api.asynchronous.create_array`.
         """
         return await cls._create(
             store,
@@ -1573,14 +1573,8 @@ def _info(
         else:
             kwargs["_codecs"] = self.metadata.codecs
             kwargs["_data_type"] = self.metadata.data_type
-            # just regular?
-            chunk_grid = self.metadata.chunk_grid
-            if isinstance(chunk_grid, RegularChunkGrid):
-                kwargs["_chunk_shape"] = chunk_grid.chunk_shape
-            else:
-                raise NotImplementedError(
-                    "'info' is not yet implemented for chunk grids of type {type(self.metadata.chunk_grid)}"
-                )
+            kwargs["_chunk_shape"] = self.chunks
+            kwargs["_shard_shape"] = self.shards
 
         return ArrayInfo(
             _zarr_format=self.metadata.zarr_format,
@@ -1637,6 +1631,9 @@ def create(
     ) -> Array:
         """Creates a new Array instance from an initialized store.
 
+        .. deprecated:: 3.0.0
+            Deprecated in favor of :func:`zarr.create_array`.
+
         Parameters
         ----------
         store : StoreLike
@@ -1704,9 +1701,6 @@ def create(
         -------
         Array
             Array created from the store.
-
-        .. deprecated:: 3.0.0
-            Deprecated in favor of :func:`zarr.create_array`.
         """
         return cls._create(
             store,
diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py
@@ -332,12 +332,21 @@ async def write_batch(
         drop_axes: tuple[int, ...] = (),
     ) -> None:
         if self.supports_partial_encode:
-            await self.encode_partial_batch(
-                [
-                    (byte_setter, value[out_selection], chunk_selection, chunk_spec)
-                    for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
-                ],
-            )
+            # Pass scalar values as is
+            if len(value.shape) == 0:
+                await self.encode_partial_batch(
+                    [
+                        (byte_setter, value, chunk_selection, chunk_spec)
+                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                    ],
+                )
+            else:
+                await self.encode_partial_batch(
+                    [
+                        (byte_setter, value[out_selection], chunk_selection, chunk_spec)
+                        for byte_setter, chunk_spec, chunk_selection, out_selection in batch_info
+                    ],
+                )
 
         else:
             # Read existing bytes if not total slice
diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py
@@ -1148,6 +1148,9 @@ async def create_dataset(
     ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
         """Create an array.
 
+        .. deprecated:: 3.0.0
+            The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead.
+
         Arrays are known as "datasets" in HDF5 terminology. For compatibility
         with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.require_dataset` method.
 
@@ -1161,11 +1164,17 @@ async def create_dataset(
         Returns
         -------
         a : AsyncArray
-
-        .. deprecated:: 3.0.0
-            The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.create_array` instead.
         """
-        return await self.create_array(name, shape=shape, **kwargs)
+        data = kwargs.pop("data", None)
+        # create_dataset in zarr 2.x requires shape but not dtype if data is
+        # provided. Allow this configuration by inferring dtype from data if
+        # necessary and passing it to create_array
+        if "dtype" not in kwargs and data is not None:
+            kwargs["dtype"] = data.dtype
+        array = await self.create_array(name, shape=shape, **kwargs)
+        if data is not None:
+            await array.setitem(slice(None), data)
+        return array
 
     @deprecated("Use AsyncGroup.require_array instead.")
     async def require_dataset(
@@ -1179,6 +1188,9 @@ async def require_dataset(
     ) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]:
         """Obtain an array, creating if it doesn't exist.
 
+        .. deprecated:: 3.0.0
+            The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.require_array` instead.
+
         Arrays are known as "datasets" in HDF5 terminology. For compatibility
         with h5py, Zarr groups also implement the :func:`zarr.AsyncGroup.create_dataset` method.
 
@@ -1199,9 +1211,6 @@ async def require_dataset(
         Returns
         -------
         a : AsyncArray
-
-        .. deprecated:: 3.0.0
-            The h5py compatibility methods will be removed in 3.1.0. Use `AsyncGroup.require_dataset` instead.
         """
         return await self.require_array(name, shape=shape, dtype=dtype, exact=exact, **kwargs)
 
@@ -2393,6 +2402,10 @@ def create_array(
     def create_dataset(self, name: str, **kwargs: Any) -> Array:
         """Create an array.
 
+        .. deprecated:: 3.0.0
+            The h5py compatibility methods will be removed in 3.1.0. Use `Group.create_array` instead.
+
+
         Arrays are known as "datasets" in HDF5 terminology. For compatibility
         with h5py, Zarr groups also implement the :func:`zarr.Group.require_dataset` method.
 
@@ -2406,16 +2419,16 @@ def create_dataset(self, name: str, **kwargs: Any) -> Array:
         Returns
         -------
         a : Array
-
-        .. deprecated:: 3.0.0
-            The h5py compatibility methods will be removed in 3.1.0. Use `Group.create_array` instead.
         """
         return Array(self._sync(self._async_group.create_dataset(name, **kwargs)))
 
     @deprecated("Use Group.require_array instead.")
     def require_dataset(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Array:
         """Obtain an array, creating if it doesn't exist.
 
+        .. deprecated:: 3.0.0
+            The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead.
+
         Arrays are known as "datasets" in HDF5 terminology. For compatibility
         with h5py, Zarr groups also implement the :func:`zarr.Group.create_dataset` method.
 
@@ -2431,9 +2444,6 @@ def require_dataset(self, name: str, *, shape: ShapeLike, **kwargs: Any) -> Arra
         Returns
         -------
         a : Array
-
-        .. deprecated:: 3.0.0
-            The h5py compatibility methods will be removed in 3.1.0. Use `Group.require_array` instead.
         """
         return Array(self._sync(self._async_group.require_array(name, shape=shape, **kwargs)))
 
@@ -2660,6 +2670,9 @@ def array(
     ) -> Array:
         """Create an array within this group.
 
+        .. deprecated:: 3.0.0
+            Use `Group.create_array` instead.
+
         This method lightly wraps :func:`zarr.core.array.create_array`.
 
         Parameters
diff --git a/tests/test_array.py b/tests/test_array.py
diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py
diff --git a/tests/test_group.py b/tests/test_group.py