|
13 | 13 |
|
14 | 14 | from zarr._compat import _deprecate_positional_args |
15 | 15 | from zarr.abc.store import Store, set_or_delete |
16 | | -from zarr.codecs import _get_default_array_bytes_codec |
17 | 16 | from zarr.codecs._v2 import V2Codec |
18 | 17 | from zarr.core._info import ArrayInfo |
19 | 18 | from zarr.core.attributes import Attributes |
|
78 | 77 | ArrayV3MetadataDict, |
79 | 78 | T_ArrayMetadata, |
80 | 79 | ) |
81 | | -from zarr.core.metadata.v3 import parse_node_type_array |
| 80 | +from zarr.core.metadata.v2 import _default_filters_and_compressor |
| 81 | +from zarr.core.metadata.v3 import DataType, parse_node_type_array |
82 | 82 | from zarr.core.sync import sync |
83 | 83 | from zarr.errors import MetadataValidationError |
84 | 84 | from zarr.registry import get_pipeline_class |
@@ -409,27 +409,53 @@ async def create( |
409 | 409 | attributes : dict[str, JSON], optional |
410 | 410 | The attributes of the array (default is None). |
411 | 411 | chunk_shape : ChunkCoords, optional |
412 | | - The shape of the array's chunks (default is None). |
| 412 | + The shape of the array's chunks. |
| 413 | + V3 only. V2 arrays should use ``chunks`` instead. |
| 414 | + If not specified, defaults are guessed based on the shape and dtype. |
413 | 415 | chunk_key_encoding : ChunkKeyEncoding, optional |
414 | | - The chunk key encoding (default is None). |
415 | | - codecs : Iterable[Codec | dict[str, JSON]], optional |
416 | | - The codecs used to encode the data (default is None). |
| 416 | + A specification of how the chunk keys are represented in storage. |
| 417 | + V3 only. V2 arrays should use `dimension_separator` instead. |
| 418 | + Default is ``("default", "/")``. |
| 419 | + codecs : Sequence of Codecs or dicts, optional |
| 420 | + An iterable of Codec or dict serializations of Codecs. The elements of |
| 421 | + this collection specify the transformation from array values to stored bytes. |
| 422 | + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. |
| 423 | +
|
| 424 | + If no codecs are provided, default codecs will be used: |
| 425 | +
|
| 426 | + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. |
| 427 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 428 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 429 | +
|
| 430 | + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. |
417 | 431 | dimension_names : Iterable[str], optional |
418 | 432 | The names of the dimensions (default is None). |
| 433 | + V3 only. V2 arrays should not use this parameter. |
419 | 434 | chunks : ShapeLike, optional |
420 | | - The shape of the array's chunks (default is None). |
421 | | - V2 only. V3 arrays should not have 'chunks' parameter. |
| 435 | + The shape of the array's chunks. |
| 436 | + V2 only. V3 arrays should use ``chunk_shape`` instead. |
| 437 | + If not specified, defaults are guessed based on the shape and dtype. |
422 | 438 | dimension_separator : Literal[".", "/"], optional |
423 | | - The dimension separator (default is None). |
424 | | - V2 only. V3 arrays cannot have a dimension separator. |
| 439 | + The dimension separator (default is "."). |
| 440 | + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. |
425 | 441 | order : Literal["C", "F"], optional |
426 | | - The order of the array (default is None). |
| 442 | + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). |
427 | 443 | filters : list[dict[str, JSON]], optional |
428 | | - The filters used to compress the data (default is None). |
429 | | - V2 only. V3 arrays should not have 'filters' parameter. |
| 444 | + Sequence of filters to use to encode chunk data prior to compression. |
| 445 | + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` |
| 446 | + nor ``filters`` are provided, a default compressor will be used. (see |
| 447 | + ``compressor`` for details) |
430 | 448 | compressor : dict[str, JSON], optional |
431 | 449 | The compressor used to compress the data. |
432 | | - V2 only. V3 arrays should not have 'compressor' parameter. |
| 450 | + V2 only. V3 arrays should use ``codecs`` instead. |
| 451 | +
|
| 452 | + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: |
| 453 | +
|
| 454 | + - For numeric arrays, the default is ``ZstdCodec``. |
| 455 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 456 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 457 | +
|
| 458 | + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. |
433 | 459 | overwrite : bool, optional |
434 | 460 | Whether to raise an error if the store already exists (default is False). |
435 | 461 | data : npt.ArrayLike, optional |
@@ -494,14 +520,6 @@ async def create( |
494 | 520 | order=order, |
495 | 521 | ) |
496 | 522 | elif zarr_format == 2: |
497 | | - if dtype is str or dtype == "str": |
498 | | - # another special case: zarr v2 added the vlen-utf8 codec |
499 | | - vlen_codec: dict[str, JSON] = {"id": "vlen-utf8"} |
500 | | - if filters and not any(x["id"] == "vlen-utf8" for x in filters): |
501 | | - filters = list(filters) + [vlen_codec] |
502 | | - else: |
503 | | - filters = [vlen_codec] |
504 | | - |
505 | 523 | if codecs is not None: |
506 | 524 | raise ValueError( |
507 | 525 | "codecs cannot be used for arrays with version 2. Use filters and compressor instead." |
@@ -564,11 +582,7 @@ async def _create_v3( |
564 | 582 | await ensure_no_existing_node(store_path, zarr_format=3) |
565 | 583 |
|
566 | 584 | shape = parse_shapelike(shape) |
567 | | - codecs = ( |
568 | | - list(codecs) |
569 | | - if codecs is not None |
570 | | - else [_get_default_array_bytes_codec(np.dtype(dtype))] |
571 | | - ) |
| 585 | + codecs = list(codecs) if codecs is not None else _get_default_codecs(np.dtype(dtype)) |
572 | 586 |
|
573 | 587 | if chunk_key_encoding is None: |
574 | 588 | chunk_key_encoding = ("default", "/") |
@@ -634,6 +648,14 @@ async def _create_v2( |
634 | 648 | if dimension_separator is None: |
635 | 649 | dimension_separator = "." |
636 | 650 |
|
| 651 | + dtype = parse_dtype(dtype, zarr_format=2) |
| 652 | + if not filters and not compressor: |
| 653 | + filters, compressor = _default_filters_and_compressor(dtype) |
| 654 | + if np.issubdtype(dtype, np.str_): |
| 655 | + filters = filters or [] |
| 656 | + if not any(x["id"] == "vlen-utf8" for x in filters): |
| 657 | + filters = list(filters) + [{"id": "vlen-utf8"}] |
| 658 | + |
637 | 659 | metadata = ArrayV2Metadata( |
638 | 660 | shape=shape, |
639 | 661 | dtype=np.dtype(dtype), |
@@ -1493,23 +1515,53 @@ def create( |
1493 | 1515 | dtype : npt.DTypeLike |
1494 | 1516 | The data type of the array. |
1495 | 1517 | chunk_shape : ChunkCoords, optional |
1496 | | - The shape of the Array's chunks (default is None). |
| 1518 | + The shape of the Array's chunks. |
| 1519 | + V3 only. V2 arrays should use ``chunks`` instead. |
| 1520 | + If not specified, defaults are guessed based on the shape and dtype. |
1497 | 1521 | chunk_key_encoding : ChunkKeyEncoding, optional |
1498 | | - The chunk key encoding (default is None). |
1499 | | - codecs : Iterable[Codec | dict[str, JSON]], optional |
1500 | | - The codecs used to encode the data (default is None). |
| 1522 | + A specification of how the chunk keys are represented in storage. |
| 1523 | + V3 only. V2 arrays should use `dimension_separator` instead. |
| 1524 | + Default is ``("default", "/")``. |
| 1525 | + codecs : Sequence of Codecs or dicts, optional |
| 1526 | + An iterable of Codec or dict serializations of Codecs. The elements of |
| 1527 | + this collection specify the transformation from array values to stored bytes. |
| 1528 | + V3 only. V2 arrays should use ``filters`` and ``compressor`` instead. |
| 1529 | +
|
| 1530 | + If no codecs are provided, default codecs will be used: |
| 1531 | +
|
| 1532 | + - For numeric arrays, the default is ``BytesCodec`` and ``ZstdCodec``. |
| 1533 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 1534 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 1535 | +
|
| 1536 | + These defaults can be changed by modifying the value of ``array.v3_default_codecs`` in :mod:`zarr.core.config`. |
1501 | 1537 | dimension_names : Iterable[str], optional |
1502 | 1538 | The names of the dimensions (default is None). |
| 1539 | + V3 only. V2 arrays should not use this parameter. |
1503 | 1540 | chunks : ChunkCoords, optional |
1504 | | - The shape of the Array's chunks (default is None). |
| 1541 | + The shape of the array's chunks. |
| 1542 | + V2 only. V3 arrays should use ``chunk_shape`` instead. |
| 1543 | + If not specified, defaults are guessed based on the shape and dtype. |
1505 | 1544 | dimension_separator : Literal[".", "/"], optional |
1506 | | - The dimension separator (default is None). |
| 1545 | + The dimension separator (default is "."). |
| 1546 | + V2 only. V3 arrays should use ``chunk_key_encoding`` instead. |
1507 | 1547 | order : Literal["C", "F"], optional |
1508 | | - The order of the array (default is None). |
| 1548 | + The order of the array (default is specified by ``array.order`` in :mod:`zarr.core.config`). |
1509 | 1549 | filters : list[dict[str, JSON]], optional |
1510 | | - The filters used to compress the data (default is None). |
| 1550 | + Sequence of filters to use to encode chunk data prior to compression. |
| 1551 | + V2 only. V3 arrays should use ``codecs`` instead. If neither ``compressor`` |
| 1552 | + nor ``filters`` are provided, a default compressor will be used. (see |
| 1553 | + ``compressor`` for details) |
1511 | 1554 | compressor : dict[str, JSON], optional |
1512 | | - The compressor used to compress the data (default is None). |
| 1555 | + Primary compressor to compress chunk data. |
| 1556 | + V2 only. V3 arrays should use ``codecs`` instead. |
| 1557 | +
|
| 1558 | + If neither ``compressor`` nor ``filters`` are provided, a default compressor will be used: |
| 1559 | +
|
| 1560 | + - For numeric arrays, the default is ``ZstdCodec``. |
| 1561 | + - For Unicode strings, the default is ``VLenUTF8Codec``. |
| 1562 | + - For bytes or objects, the default is ``VLenBytesCodec``. |
| 1563 | +
|
| 1564 | + These defaults can be changed by modifying the value of ``array.v2_default_compressor`` in :mod:`zarr.core.config`. |
1513 | 1565 | overwrite : bool, optional |
1514 | 1566 | Whether to raise an error if the store already exists (default is False). |
1515 | 1567 |
|
@@ -3342,3 +3394,18 @@ def _build_parents( |
3342 | 3394 | ) |
3343 | 3395 |
|
3344 | 3396 | return parents |
| 3397 | + |
| 3398 | + |
def _get_default_codecs(
    np_dtype: np.dtype[Any],
) -> list[dict[str, JSON]]:
    """Return the default v3 codec specifications for ``np_dtype``.

    The codec names are read from the ``array.v3_default_codecs`` config
    entry, which is indexed by one of three keys: ``"string"``, ``"bytes"``
    or ``"numeric"``. The key is selected from the ``DataType`` that
    ``DataType.from_numpy`` produces for ``np_dtype``; anything that is
    neither ``DataType.string`` nor ``DataType.bytes`` uses ``"numeric"``.

    Parameters
    ----------
    np_dtype : np.dtype[Any]
        The NumPy dtype of the array being created.

    Returns
    -------
    list[dict[str, JSON]]
        One ``{"name": <codec name>, "configuration": {}}`` dict per
        configured codec name, in configured order.
    """
    codecs_by_kind = config.get("array.v3_default_codecs")
    v3_dtype = DataType.from_numpy(np_dtype)

    # Only strings and bytes have dedicated entries; the rest share "numeric".
    kind = (
        "string"
        if v3_dtype == DataType.string
        else "bytes"
        if v3_dtype == DataType.bytes
        else "numeric"
    )

    codec_specs: list[dict[str, JSON]] = []
    for codec_name in codecs_by_kind[kind]:
        # A fresh, empty configuration dict is created for each codec.
        codec_specs.append({"name": codec_name, "configuration": {}})
    return codec_specs
0 commit comments