diff --git a/Changelog.rst b/Changelog.rst
index aaa1b31908..c8baf9913b 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -1,18 +1,21 @@
-version NEXTVERSION
--------------------
+Version NEXTVERSION
+-------------------
 
 **2025-??-??**
 
+* Read Zarr datasets with `cf.read`
+  (https://github.com/NCAS-CMS/cf-python/issues/863)
 * Update CF aggregation keywords
   (https://github.com/NCAS-CMS/cf-python/issues/868)
 * New keyword parameter to `cf.DimensionCoordinate.create_bounds`:
   ``inplace`` (https://github.com/NCAS-CMS/cf-python/issues/855)
 * Set new minimum version of `dask`: ``2025.5.1``
   (https://github.com/NCAS-CMS/cf-python/issues/866)
+* Changed dependency: ``cfdm>=1.12.2.0, <1.12.3.0``
 * Changed dependency: ``dask>=2025.5.1``
 
 ----
-
+
 version 3.17.0
 --------------
diff --git a/README.md b/README.md
index bd6656ecf6..9f5c75a8b6 100644
--- a/README.md
+++ b/README.md
@@ -84,7 +84,7 @@ The `cf` package uses
 [Dask](https://ncas-cms.github.io/cf-python/performance.html) for all
 of its array manipulation and can:
 
-* read field constructs from netCDF, CDL, PP and UM datasets with a
+* read field constructs from netCDF, CDL, Zarr, PP and UM datasets with a
   choice of netCDF backends,and in local, http, and s3 locations,
 * create new field constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
diff --git a/cf/__init__.py b/cf/__init__.py
index 38225cbe41..2df23b25e5 100644
--- a/cf/__init__.py
+++ b/cf/__init__.py
@@ -11,7 +11,7 @@
 The `cf` package uses `dask` for all of its array manipulation and
 can:
 
-* read field constructs from netCDF, CDL, PP and UM datasets,
+* read field constructs from netCDF, CDL, Zarr, PP and UM datasets,
 
 * read field constructs and domain constructs from netCDF, CDL, PP
   and UM datasets with a choice of netCDF backends,
 
@@ -284,6 +284,7 @@
     RaggedIndexedContiguousArray,
     SubsampledArray,
     UMArray,
+    ZarrArray,
 )
 
 from .aggregate import aggregate, climatology_cells
diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py
index 3de62364df..ece90bde2c 100644
--- a/cf/cfimplementation.py
+++ b/cf/cfimplementation.py
@@ -38,6 +38,7 @@
     RaggedIndexedArray,
     RaggedIndexedContiguousArray,
     SubsampledArray,
+    ZarrArray,
 )
 
 from .functions import CF
 
@@ -49,8 +50,8 @@ class CFImplementation(cfdm.CFDMImplementation):
 
     """
 
-    def nc_set_hdf5_chunksizes(self, data, sizes, override=False):
-        """Set the data HDF5 chunksizes.
+    def nc_set_dataset_chunksizes(self, data, sizes, override=False):
+        """Set the data dataset chunksizes.
 
         .. versionadded:: 3.16.2
 
@@ -60,21 +61,21 @@ def nc_set_hdf5_chunksizes(self, data, sizes, override=False):
             The data.
 
         sizes: sequence of `int`
-            The new HDF5 chunk sizes.
+            The new dataset chunk sizes.
 
         override: `bool`, optional
-            If True then set the HDF5 chunks sizes even if some
+            If True then set the dataset chunk sizes even if some
             have already been specified. If False, the default,
-            then only set the HDF5 chunks sizes if some none have
-            already been specified.
+            then only set the dataset chunk sizes if none have
+            already been specified.
 
         :Returns:
 
             `None`
 
         """
-        if override or not data.nc_hdf5_chunksizes():
-            data.nc_set_hdf5_chunksizes(sizes)
+        if override or not data.nc_dataset_chunksizes():
+            data.nc_set_dataset_chunksizes(sizes)
 
     def set_construct(self, parent, construct, axes=None, copy=True, **kwargs):
         """Insert a construct into a field or domain.
@@ -151,6 +152,7 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): RaggedIndexedContiguousArray=RaggedIndexedContiguousArray, SubsampledArray=SubsampledArray, TiePointIndex=TiePointIndex, + ZarrArray=ZarrArray, ) @@ -205,7 +207,9 @@ def implementation(): 'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray, 'RaggedIndexedContiguousArray': cf.data.array.raggedindexedcontiguousarray.RaggedIndexedContiguousArray, 'SubsampledArray': cf.data.array.subsampledarray.SubsampledArray, - 'TiePointIndex': cf.tiepointindex.TiePointIndex} + 'TiePointIndex': cf.tiepointindex.TiePointIndex, + 'ZarrArray': cf.data.array.zarrarray.ZarrArray, + } """ return _implementation.copy() diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 693fec0fb4..2b3d03c54f 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -12,3 +12,4 @@ from .raggedindexedcontiguousarray import RaggedIndexedContiguousArray from .subsampledarray import SubsampledArray from .umarray import UMArray +from .zarrarray import ZarrArray diff --git a/cf/data/array/zarrarray.py b/cf/data/array/zarrarray.py new file mode 100644 index 0000000000..2d3d8c784f --- /dev/null +++ b/cf/data/array/zarrarray.py @@ -0,0 +1,15 @@ +import cfdm + +from ...mixin_container import Container + +# Uncomment when we can use active storage on Zarr datasets: +# from .mixin import ActiveStorageMixin + + +class ZarrArray( + # Uncomment when we can use active storage on Zarr datasets: + # ActiveStorageMixin, + Container, + cfdm.ZarrArray, +): + """A Zarr array accessed with `zarr`.""" diff --git a/cf/data/data.py b/cf/data/data.py index 5f024f4770..7fcd08024f 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -5532,7 +5532,7 @@ def outerproduct(self, a, inplace=False, i=False): d = _inplace_enabled_define_and_cleanup(self) shape = d.shape - chunksizes0 = d.nc_hdf5_chunksizes() + chunksizes0 = d.nc_dataset_chunksizes() # Cast 'a' as a Data object so that it definitely has sensible # Units. We don't mind if the units of 'a' are incompatible @@ -5563,8 +5563,8 @@ def outerproduct(self, a, inplace=False, i=False): for a_axis in a._cyclic: d.cyclic(ndim + a._axes.index(a_axis)) - # Update the HDF5 chunking strategy - chunksizes1 = a.nc_hdf5_chunksizes() + # Update the dataset chunking strategy + chunksizes1 = a.nc_dataset_chunksizes() if chunksizes0 or chunksizes1: if isinstance(chunksizes0, tuple): if isinstance(chunksizes1, tuple): @@ -5572,10 +5572,10 @@ def outerproduct(self, a, inplace=False, i=False): else: chunksizes = chunksizes0 + a.shape - d.nc_set_hdf5_chunksizes(chunksizes) + d.nc_set_dataset_chunksizes(chunksizes) elif isinstance(chunksizes1, tuple): chunksizes = shape + chunksizes1 - d.nc_set_hdf5_chunksizes(chunksizes) + d.nc_set_dataset_chunksizes(chunksizes) d._update_deterministic(a) return d @@ -6259,7 +6259,7 @@ def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): # Clear cyclic axes, as we can't help but lose them in this # operation - d._cyclic = _empty_set + del d._cyclic return d diff --git a/cf/data/mixin/deprecations.py b/cf/data/mixin/deprecations.py index aef006622d..f729ef2764 100644 --- a/cf/data/mixin/deprecations.py +++ b/cf/data/mixin/deprecations.py @@ -437,15 +437,17 @@ def dumps(self): def HDF_chunks(self, *chunks): """Get or set HDF chunk sizes. + Deprecated at version 3.14.0 and is no longer available. Use + the methods `nc_clear_dataset_chunksizes`, + `nc_dataset_chunksizes`, and `nc_set_dataset_chunksizes` + instead. 
+ The HDF chunk sizes may be used by external code that allows `Data` objects to be written to netCDF files. - Deprecated at version 3.14.0 and is no longer available. Use - the methods `nc_clear_hdf5_chunksizes`, `nc_hdf5_chunksizes`, - and `nc_set_hdf5_chunksizes` instead. - - .. seealso:: `nc_clear_hdf5_chunksizes`, `nc_hdf5_chunksizes`, - `nc_set_hdf5_chunksizes` + .. seealso:: `nc_clear_dataset_chunksizes`, + `nc_dataset_chunksizes`, + `nc_set_dataset_chunksizes` :Parameters: @@ -506,8 +508,8 @@ def HDF_chunks(self, *chunks): _DEPRECATION_ERROR_METHOD( self, "HDF_chunks", - message="Use the methods 'nc_clear_hdf5_chunksizes', " - "'nc_hdf5_chunksizes', and 'nc_set_hdf5_chunksizes' " + message="Use the methods 'nc_clear_dataset_chunksizes', " + "'nc_dataset_chunksizes', and 'nc_set_dataset_chunksizes' " "instead.", version="3.14.0", removed_at="5.0.0", diff --git a/cf/data/utils.py b/cf/data/utils.py index 63b2a40e88..bcd8aef289 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -435,8 +435,8 @@ def collapse( d._axes = [a for i, a in enumerate(d._axes) if i not in axis] if d.size != original_size: - # Remove the out-dated HDF5 chunking strategy - d.nc_clear_hdf5_chunksizes() + # Remove the out-dated dataset chunking strategy + d.nc_clear_dataset_chunksizes() return d, weights diff --git a/cf/field.py b/cf/field.py index 9c7dcd93e6..c9eb0c19bd 100644 --- a/cf/field.py +++ b/cf/field.py @@ -13991,19 +13991,20 @@ def field( ) # pragma: no cover def HDF_chunks(self, *chunksizes): - """Deprecated at version 3.0.0. + """Get or set HDF chunk sizes. - Use methods 'Data.nc_hdf5_chunksizes', - 'Data.nc_set_hdf5_chunksizes', 'Data.nc_clear_hdf5_chunksizes' - instead. + Deprecated at version 3.0.0 and is no longer available. Use + methods `Data.nc_dataset_chunksizes`, + `Data.nc_set_dataset_chunksizes`, + `Data.nc_clear_dataset_chunksizes` instead. """ _DEPRECATION_ERROR_METHOD( self, "HDF_chunks", - "Use methods 'Data.nc_hdf5_chunksizes', " - "'Data.nc_set_hdf5_chunksizes', " - "'Data.nc_clear_hdf5_chunksizes' instead.", + "Use methods 'Data.nc_dataset_chunksizes', " + "'Data.nc_set_dataset_chunksizes', " + "'Data.nc_clear_dataset_chunksizes' instead.", version="3.0.0", removed_at="4.0.0", ) # pragma: no cover diff --git a/cf/functions.py b/cf/functions.py index 2018a5d9ff..2f05115048 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -1,12 +1,9 @@ import atexit import csv -import ctypes.util -import importlib import logging import os import platform import re -import sys import warnings from collections.abc import Iterable from itertools import product @@ -34,7 +31,7 @@ from .docstring import _docstring_substitution_definitions -# Instruction to close /proc/mem at exit. +# Instruction to close /proc/meminfo at exit. def _close_proc_meminfo(): try: _meminfo_file.close() @@ -3134,34 +3131,6 @@ def _section(x, axes=None, stop=None, chunks=False, min_step=1): return out -def _get_module_info(module, alternative_name=False, try_except=False): - """Helper function for processing modules for cf.environment.""" - if try_except: - module_name = None - try: - importlib.import_module(module) - module_name = module - except ImportError: - if ( - alternative_name - ): # where a module has a different (e.g. 
old) name - try: - importlib.import_module(alternative_name) - module_name = alternative_name - except ImportError: - pass - - if not module_name: - return ("not available", "") - else: - module_name = module - - return ( - importlib.import_module(module_name).__version__, - importlib.util.find_spec(module_name).origin, - ) - - def environment(display=True, paths=True): """Return the names and versions of the cf package and its dependencies. @@ -3188,97 +3157,82 @@ def environment(display=True, paths=True): >>> cf.environment() Platform: Linux-6.8.0-60-generic-x86_64-with-glibc2.39 + Python: 3.12.8 /home/miniconda3/bin/python + packaging: 24.2 /home/miniconda3/lib/python3.12/site-packages/packaging/__init__.py + numpy: 2.2.6 /home/miniconda3/lib/python3.12/site-packages/numpy/__init__.py + cfdm.core: 1.12.2.0 /home/miniconda3/lib/python3.12/site-packages/cfdm/cfdm/core/__init__.py + udunits2 library: libudunits2.so.0 HDF5 library: 1.14.2 netcdf library: 4.9.4-development - udunits2 library: libudunits2.so.0 - esmpy/ESMF: 8.7.0 /home/miniconda/lib/python3.12/site-packages/esmpy/__init__.py - Python: 3.12.8 /home/miniconda/bin/python - dask: 2025.5.1 /home/miniconda/lib/python3.12/site-packages/dask/__init__.py - netCDF4: 1.7.2 /home/miniconda/lib/python3.12/site-packages/netCDF4/__init__.py - h5netcdf: 1.3.0 /home/miniconda/lib/python3.12/site-packages/h5netcdf/__init__.py - h5py: 3.12.1 /home/miniconda/lib/python3.12/site-packages/h5py/__init__.py - s3fs: 2024.12.0 /home/miniconda/lib/python3.12/site-packages/s3fs/__init__.py - psutil: 6.1.1 /home/miniconda/lib/python3.12/site-packages/psutil/__init__.py - packaging: 24.2 /home/miniconda/lib/python3.12/site-packages/packaging/__init__.py - numpy: 2.2.2 /home/miniconda/lib/python3.12/site-packages/numpy/__init__.py - scipy: 1.15.2 /home/miniconda/lib/python3.12/site-packages/scipy/__init__.py - matplotlib: 3.10.0 /home/miniconda/lib/python3.12/site-packages/matplotlib/__init__.py - cftime: 1.6.4.post1 /home/miniconda/lib/python3.12/site-packages/cftime/__init__.py - cfunits: 3.3.7 /home/miniconda/lib/python3.12/site-packages/cfunits/__init__.py - cfplot: 3.3.0 /home/miniconda/lib/python3.12/site-packages/cfplot/__init__.py - cfdm: 1.12.2.0 /home/miniconda/lib/python3.12/site-packages/cfdm/__init__.py - cf: NEXTVERSION /home/miniconda/lib/python3.12/site-packages/cf/__init__.py + netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py + h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py + h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py + zarr: 3.0.8 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py + s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py + scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py + dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py + cftime: 1.6.4.post1 /home/miniconda3/lib/python3.12/site-packages/cftime/__init__.py + cfunits: 3.3.7 /home/miniconda3/lib/python3.12/site-packages/cfunits/__init__.py + cfdm: 1.12.2.0 /home/miniconda3/lib/python3.12/site-packages/cfdm/__init__.py + esmpy/ESMF: 8.7.0 /home/miniconda3/lib/python3.12/site-packages/esmpy/__init__.py + psutil: 6.1.1 /home/miniconda3/lib/python3.12/site-packages/psutil/__init__.py + matplotlib: 3.10.0 /home/miniconda3/lib/python3.12/site-packages/matplotlib/__init__.py + cfplot: 3.4.0 /home/miniconda3/lib/python3.12/site-packages/cfplot/__init__.py + cf: NEXTVERSION 
/home/miniconda3/lib/python3.12/site-packages/cf/__init__.py >>> cf.environment(paths=False) Platform: Linux-6.8.0-60-generic-x86_64-with-glibc2.39 + Python: 3.12.8 + packaging: 24.2 + numpy: 2.2.6 + cfdm.core: 1.12.2.0 + udunits2 library: libudunits2.so.0 HDF5 library: 1.14.2 netcdf library: 4.9.4-development - udunits2 library: libudunits2.so.0 - esmpy/ESMF: 8.7.0 - Python: 3.12.8 - dask: 2025.5.1 netCDF4: 1.7.2 h5netcdf: 1.3.0 h5py: 3.12.1 + zarr: 3.0.8 s3fs: 2024.12.0 - psutil: 6.1.1 - packaging: 24.2 - numpy: 2.2.2 - scipy: 1.15.2 - matplotlib: 3.10.0 + scipy: 1.15.1 + dask: 2025.5.1 cftime: 1.6.4.post1 cfunits: 3.3.7 - cfplot: 3.3.0 cfdm: 1.12.2.0 + esmpy/ESMF: 8.7.0 + psutil: 6.1.1 + matplotlib: 3.10.0 + cfplot: 3.4.0 cf: NEXTVERSION """ + # Get cfdm env + out = cfdm.environment(display=False, paths=paths) + + _get_module_info = cfdm.functions._get_module_info dependency_version_paths_mapping = { - # Platform first, then use an ordering to group libraries as follows... - "Platform": (platform.platform(), ""), - # Underlying C and Fortran based libraries first - "HDF5 library": (netCDF4.__hdf5libversion__, ""), - "netcdf library": (netCDF4.__netcdf4libversion__, ""), - "udunits2 library": (ctypes.util.find_library("udunits2"), ""), "esmpy/ESMF": ( _get_module_info("esmpy", alternative_name="ESMF", try_except=True) ), - # Now Python itself - "Python": (platform.python_version(), sys.executable), - # Then Dask (cover first from below as it's important under-the-hood) - "dask": _get_module_info("dask"), - # Then Python libraries not related to CF - "netCDF4": _get_module_info("netCDF4"), - "h5netcdf": _get_module_info("h5netcdf"), - "h5py": _get_module_info("h5py"), - "s3fs": _get_module_info("s3fs"), "psutil": _get_module_info("psutil"), - "packaging": _get_module_info("packaging"), - "numpy": _get_module_info("numpy"), - "scipy": _get_module_info("scipy"), "matplotlib": _get_module_info("matplotlib", try_except=True), - # Finally the CF related Python libraries, with the cf version last - # as it is the most relevant (cfdm penultimate for similar reason) - "cftime": _get_module_info("cftime"), - "cfunits": _get_module_info("cfunits"), "cfplot": _get_module_info("cfplot", try_except=True), - "cfdm": _get_module_info("cfdm"), "cf": (__version__, _os_path_abspath(__file__)), } string = "{0}: {1!s}" if paths: - # Include path information, else exclude, when unpacking tuple + # Include path information, else exclude, when unpacking tuple. 
string += " {2!s}" - out = [ - string.format(dep, *info) - for dep, info in dependency_version_paths_mapping.items() - ] - - out = "\n".join(out) + out.extend( + [ + string.format(dep, *info) + for dep, info in dependency_version_paths_mapping.items() + ] + ) if display: - print(out) # pragma: no cover + print("\n".join(out)) # pragma: no cover else: return out diff --git a/cf/read_write/read.py b/cf/read_write/read.py index f2402c25ad..835baf1a89 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -1,20 +1,16 @@ import logging -import os -from glob import glob -from os.path import isdir +from functools import partial from re import Pattern -from urllib.parse import urlparse import cfdm from cfdm.read_write.exceptions import DatasetTypeError -from cfdm.read_write.netcdf import NetCDFRead from ..aggregate import aggregate as cf_aggregate from ..cfimplementation import implementation from ..decorators import _manage_log_level_via_verbosity from ..domainlist import DomainList from ..fieldlist import FieldList -from ..functions import _DEPRECATION_ERROR_FUNCTION_KWARGS, flat +from ..functions import _DEPRECATION_ERROR_FUNCTION_KWARGS from ..query import Query from .um import UMRead @@ -24,18 +20,22 @@ class read(cfdm.read): """Read field or domain constructs from files. - The following file formats are supported: netCDF, CFA-netCDF, CDL, - UM fields file, and PP. + The following file formats are supported: netCDF, CDL, Zarr, PP, + and UM fields file. - Input datasets are mapped to constructs in memory which are - returned as elements of a `FieldList` or if the *domain* parameter - is True, a `DomainList`. + NetCDF and Zarr datasets may be on local disk, on an OPeNDAP + server, or in an S3 object store. - NetCDF files may be on disk, on an OPeNDAP server, or in an S3 - object store. + CDL, PP, and UM fields files must be on local disk. Any amount of files of any combination of file types may be read. + Input datasets are mapped to `Field` constructs which are returned + as elements of a `FieldList`, or if the *domain* parameter is + True, `Domain` constructs returned as elements of a + `DomainList`. The returned constructs are sorted by the netCDF + variable names of their corresponding data or domain variables. + **NetCDF unlimited dimensions** Domain axis constructs that correspond to NetCDF unlimited @@ -136,7 +136,7 @@ class read(cfdm.read): However, when two or more field or domain constructs are aggregated to form a single construct then the data arrays of some - metadata constructs (coordinates, cell measures, etc.) must be + metadata constructs (coordinates, cell measures, etc.) must be compared non-lazily to ascertain if aggregation is possible. .. seealso:: `cf.aggregate`, `cf.write`, `cf.Field`, `cf.Domain`, @@ -144,90 +144,36 @@ class read(cfdm.read): :Parameters: - files: (arbitrarily nested sequence of) `str` - A string or arbitrarily nested sequence of strings giving - the file names, directory names, or OPenDAP URLs from - which to read field constructs. Various type of expansion - are applied to the names: - - ==================== ====================================== - Expansion Description - ==================== ====================================== - Tilde An initial component of ``~`` or - ``~user`` is replaced by that *user*'s - home directory. - - Environment variable Substrings of the form ``$name`` or - ``${name}`` are replaced by the value - of environment variable *name*. 
- - Pathname A string containing UNIX file name - metacharacters as understood by the - Python `glob` module is replaced by - the list of matching file names. This - type of expansion is ignored for - OPenDAP URLs. - ==================== ====================================== - - Where more than one type of expansion is used in the same - string, they are applied in the order given in the above - table. + {{read datasets: (arbitrarily nested sequence of) `str`}} - *Parameter example:* - The file ``file.nc`` in the user's home directory could - be described by any of the following: - ``'$HOME/file.nc'``, ``'${HOME}/file.nc'``, - ``'~/file.nc'``, ``'~/tmp/../file.nc'``. - - When a directory is specified, all files in that directory - are read. Sub-directories are not read unless the - *recursive* parameter is True. If any directories contain - files that are not valid datasets then an exception will - be raised, unless the *ignore_unknown_type* parameter is - True. - - As a special case, if the `cdl_string` parameter is set to - True, the interpretation of `files` changes so that each - value is assumed to be a string of CDL input rather - than the above. + {{read recursive: `bool`, optional}} - {{read external: (sequence of) `str`, optional}} + {{read followlinks: `bool`, optional}} - {{read extra: (sequence of) `str`, optional}} + {{read cdl_string: `bool`, optional}} - {{read verbose: `int` or `str` or `None`, optional}} - - {{read warnings: `bool`, optional}} - - {{read file_type: (sequence of) `str`, optional}} + {{read dataset_type: `None` or (sequence of) `str`, optional}} Valid file types are: - ============ ============================================ - file type Description - ============ ============================================ - ``'netCDF'`` Binary netCDF-3 or netCDF-4 files - ``'CDL'`` Text CDL representations of netCDF files - ``'UM'`` UM fields files or PP files - ============ ============================================ + ============== ========================================== + *dataset_type* Description + ============== ========================================== + ``'netCDF'`` A netCDF-3 or netCDF-4 dataset + ``'CDL'`` A text CDL file of a netCDF dataset + ``'Zarr'`` A Zarr v2 (xarray) or Zarr v3 dataset + ``'UM'`` A UM fields file or PP dataset + ============== ========================================== - .. versionadded:: 3.17.0 + .. versionadded:: NEXTVERSION - cdl_string: `bool`, optional - If True and the format to read is CDL, read a string - input, or sequence of string inputs, each being interpreted - as a string of CDL rather than names of locations from - which field constructs can be read from, as standard. + {{read external: (sequence of) `str`, optional}} - By default, each string input or string element in the input - sequence is taken to be a file or directory name or an - OPenDAP URL from which to read field constructs, rather - than a string of CDL input, including when the `fmt` - parameter is set as CDL. + {{read extra: (sequence of) `str`, optional}} - Note that when `cdl_string` is True, the `fmt` parameter is - ignored as the format is assumed to be CDL, so in that case - it is not necessary to also specify ``fmt='CDL'``. + {{read verbose: `int` or `str` or `None`, optional}} + + {{read warnings: `bool`, optional}} um: `dict`, optional For Met Office (UK) PP files and Met Office (UK) fields @@ -334,24 +280,6 @@ class read(cfdm.read): select='air_temperature')`` is equivalent to ``fl = cf.read(file).select_by_identity('air_temperature')``. 
-        recursive: `bool`, optional
-            If True then recursively read sub-directories of any
-            directories specified with the *files* parameter.
-
-        followlinks: `bool`, optional
-            If True, and *recursive* is True, then also search for
-            files in sub-directories which resolve to symbolic
-            links. By default directories which resolve to symbolic
-            links are ignored. Ignored of *recursive* is False. Files
-            which are symbolic links are always followed.
-
-            Note that setting ``recursive=True, followlinks=True`` can
-            lead to infinite recursion if a symbolic link points to a
-            parent directory of itself.
-
-            This parameter replaces the deprecated *follow_symlinks*
-            parameter.
-
         {{read warn_valid: `bool`, optional}}
 
             .. versionadded:: 3.4.0
@@ -422,10 +350,13 @@ class read(cfdm.read):
             Use the *dask_chunks* parameter instead.
 
         fmt: deprecated at version 3.17.0
-            Use the *file_type* parameter instead.
+            Use the *dataset_type* parameter instead.
 
         ignore_read_error: deprecated at version 3.17.0
-            Use the *file_type* parameter instead.
+            Use the *dataset_type* parameter instead.
+
+        file_type: deprecated at version NEXTVERSION
+            Use the *dataset_type* parameter instead.
 
     :Returns:
 
@@ -478,7 +409,7 @@ class read(cfdm.read):
     @_manage_log_level_via_verbosity
     def __new__(
         cls,
-        files,
+        datasets,
         external=None,
         verbose=None,
         warnings=False,
@@ -486,7 +417,7 @@ def __new__(
         nfields=None,
         squeeze=False,
         unsqueeze=False,
-        file_type=None,
+        dataset_type=None,
         cdl_string=False,
         select=None,
         extra=None,
@@ -513,8 +444,11 @@ def __new__(
         chunks="auto",
         ignore_read_error=False,
         fmt=None,
+        file_type=None,
     ):
         """Read field or domain constructs from a dataset."""
+        kwargs = locals()
+
         if field:
             _DEPRECATION_ERROR_FUNCTION_KWARGS(
                 "cf.read",
@@ -568,7 +502,7 @@ def __new__(
             _DEPRECATION_ERROR_FUNCTION_KWARGS(
                 "cf.read",
                 {"fmt": fmt},
-                "Use keyword 'file_type' instead.",
+                "Use keyword 'dataset_type' instead.",
                 version="3.17.0",
                 removed_at="5.0.0",
             )  # pragma: no cover
 
@@ -577,25 +511,129 @@ def __new__(
             _DEPRECATION_ERROR_FUNCTION_KWARGS(
                 "cf.read",
                 {"ignore_read_error": ignore_read_error},
-                "Use keyword 'file_type' instead.",
+                "Use keyword 'dataset_type' instead.",
                 version="3.17.0",
                 removed_at="5.0.0",
             )  # pragma: no cover
 
-        info = cfdm.is_log_level_info(logger)
+        if file_type is not None:
+            _DEPRECATION_ERROR_FUNCTION_KWARGS(
+                "cf.read",
+                {"file_type": file_type},
+                "Use keyword 'dataset_type' instead.",
+                version="NEXTVERSION",
+                removed_at="5.0.0",
+            )  # pragma: no cover
+
+        return super().__new__(**kwargs)
+
+    def _finalise(self):
+        """Actions to take after all datasets have been read.
+
+        Called by `__new__`.
+
+        .. 
versionadded:: NEXTVERSION + + :Returns: + + `None` + + """ + # Whether or not there were only netCDF datasets + only_netCDF = self.unique_dataset_categories == set(("netCDF",)) + + # Whether or not there were any UM datasets + some_UM = "UM" in self.unique_dataset_categories + + # ---------------------------------------------------------------- + # Select matching constructs from netCDF datasets (before + # aggregation) + # ---------------------------------------------------------------- + select = self.select + if select and only_netCDF: + self.constructs = self.constructs.select_by_identity(*select) + + # ---------------------------------------------------------------- + # Aggregate the output fields or domains + # ---------------------------------------------------------------- + if self.aggregate and len(self.constructs) > 1: + aggregate_options = self.aggregate_options + # Set defaults specific to UM fields + if some_UM and "strict_units" not in aggregate_options: + aggregate_options["relaxed_units"] = True + + self.constructs = cf_aggregate( + self.constructs, **aggregate_options + ) + + # ---------------------------------------------------------------- + # Add standard names to non-netCDF fields (after aggregation) + # ---------------------------------------------------------------- + if not only_netCDF: + for f in self.constructs: + standard_name = f._custom.get("standard_name", None) + if standard_name is not None: + f.set_property("standard_name", standard_name, copy=False) + del f._custom["standard_name"] + + # ---------------------------------------------------------------- + # Select matching constructs from non-netCDF files (after + # setting their standard names) + # ---------------------------------------------------------------- + if select and not only_netCDF: + self.constructs = self.constructs.select_by_identity(*select) + + super()._finalise() + + def _initialise(self): + """Actions to take before any datasets have been read. - cls.netcdf = NetCDFRead(cls.implementation) - cls.um = UMRead(cls.implementation) + Called by `__new__`. + + .. 
versionadded:: NEXTVERSION + + :Returns: + + `None` + + """ + super()._initialise() + + # Initialise the list of output constructs + if self.field: + self.constructs = FieldList() + elif self.domain: + self.constructs = DomainList() + + # Recognised UM dataset formats + self.UM_dataset_types = set(("UM",)) + + # Allowed dataset formats + self.allowed_dataset_types.update(self.UM_dataset_types) + + # ------------------------------------------------------------ + # Parse the 'um' keyword parameter + # ------------------------------------------------------------ + kwargs = self.kwargs + um = kwargs["um"] + if not um: + um = {} + + self.um = um # ------------------------------------------------------------ # Parse the 'select' keyword parameter # ------------------------------------------------------------ + select = kwargs["select"] if isinstance(select, (str, Query, Pattern)): select = (select,) + self.select = select + # ------------------------------------------------------------ # Parse the 'aggregate' keyword parameter # ------------------------------------------------------------ + aggregate = kwargs["aggregate"] if isinstance(aggregate, dict): aggregate_options = aggregate.copy() aggregate = True @@ -604,257 +642,115 @@ def __new__( aggregate_options["copy"] = False - # ------------------------------------------------------------ - # Parse the 'file_type' keyword parameter - # ------------------------------------------------------------ - netCDF_file_types = set(("netCDF", "CDL")) - UM_file_types = set(("UM",)) - if file_type is not None: - if isinstance(file_type, str): - file_type = (file_type,) + self.aggregate = aggregate + self.aggregate_options = aggregate_options - file_type = set(file_type) + def _read(self, dataset): + """Read a given dataset into field or domain constructs. - # ------------------------------------------------------------ - # Parse the 'um' keyword parameter - # ------------------------------------------------------------ - if not um: - um = {} + The constructs are stored in the `dataset_contents` attribute. - # ------------------------------------------------------------ - # Parse the 'cdl_string' keyword parameter - # ------------------------------------------------------------ - if cdl_string and file_type is not None: - raise ValueError("Can't set file_type when cdl_string=True") + Called by `__new__`. 
- # ------------------------------------------------------------ - # Parse the 'follow_symlinks' and 'recursive' keyword - # parameters - # ------------------------------------------------------------ - if follow_symlinks and not recursive: - raise ValueError( - f"Can't set follow_symlinks={follow_symlinks!r} " - f"when recursive={recursive!r}" - ) - - # Initialise the output list of fields/domains - if domain: - out = DomainList() - else: - out = FieldList() - - # Count the number of fields (in all files) and the number of - # files - field_counter = -1 - file_counter = 0 - - if cdl_string: - if isinstance(files, str): - files = (files,) - - files = [ - NetCDFRead.string_to_cdl(cdl_string) for cdl_string in files - ] - file_type = set(("CDL",)) - - for file_glob in flat(files): - # Expand variables - file_glob = os.path.expanduser(os.path.expandvars(file_glob)) - - scheme = urlparse(file_glob).scheme - if scheme in ("https", "http", "s3"): - # Do not glob a remote URL - files2 = (file_glob,) - else: - # Glob files on disk - files2 = glob(file_glob) - - if not files2: - # Trigger a FileNotFoundError error - open(file_glob) - - files3 = [] - for x in files2: - if isdir(x): - # Walk through directories, possibly recursively - for path, subdirs, filenames in os.walk( - x, followlinks=followlinks - ): - files3.extend( - os.path.join(path, f) for f in filenames - ) - if not recursive: - break - else: - files3.append(x) - - files2 = files3 - - # The types of all of the input files - ftypes = set() - - for filename in files2: - if info: - logger.info(f"File: {filename}") # pragma: no cover - - # ---------------------------------------------------- - # Read the file - # ---------------------------------------------------- - file_contents = [] - - # The type of this file - ftype = None - - # Record file type errors - file_format_errors = [] - - if ftype is None and ( - file_type is None - or file_type.intersection(netCDF_file_types) - ): - # Try to read as netCDF - try: - file_contents = super().__new__( - cls, - filename=filename, - external=external, - extra=extra, - verbose=verbose, - warnings=warnings, - mask=mask, - unpack=unpack, - warn_valid=warn_valid, - domain=domain, - storage_options=storage_options, - netcdf_backend=netcdf_backend, - dask_chunks=dask_chunks, - store_dataset_chunks=store_dataset_chunks, - cache=cache, - cfa=cfa, - cfa_write=cfa_write, - to_memory=to_memory, - squeeze=squeeze, - unsqueeze=unsqueeze, - file_type=file_type, - ) - except DatasetTypeError as error: - if file_type is None: - file_format_errors.append(error) - else: - file_format_errors = [] - ftype = "netCDF" - - if ftype is None and ( - file_type is None or file_type.intersection(UM_file_types) - ): - # Try to read as UM - try: - file_contents = cls.um.read( - filename, - um_version=um.get("version"), - verbose=verbose, - set_standard_name=False, - height_at_top_of_model=height_at_top_of_model, - fmt=um.get("fmt"), - word_size=um.get("word_size"), - endian=um.get("endian"), - select=select, - squeeze=squeeze, - unsqueeze=unsqueeze, - domain=domain, - file_type=file_type, - unpack=unpack, - ) - except DatasetTypeError as error: - if file_type is None: - file_format_errors.append(error) - else: - file_format_errors = [] - ftype = "UM" - - if file_format_errors: - error = "\n".join(map(str, file_format_errors)) - raise DatasetTypeError(f"\n{error}") - - if domain: - file_contents = DomainList(file_contents) - - file_contents = FieldList(file_contents) - - if ftype: - ftypes.add(ftype) - - # Select matching 
fields (only for netCDF files at - # this stage - we'll other it for other file types - # later) - if select and ftype == "netCDF": - file_contents = file_contents.select_by_identity(*select) - - # Add this file's contents to that already read from - # other files - out.extend(file_contents) - - field_counter = len(out) - file_counter += 1 + .. versionadded:: NEXTVERSION - # ---------------------------------------------------------------- - # Aggregate the output fields/domains - # ---------------------------------------------------------------- - if aggregate and len(out) > 1: - org_len = len(out) # pragma: no cover - - if "UM" in ftypes: - # Set defaults specific to UM fields - if "strict_units" not in aggregate_options: - aggregate_options["relaxed_units"] = True + :Parameters: - out = cf_aggregate(out, **aggregate_options) + dataset: `str` + The pathname of the dataset to be read. - n = len(out) # pragma: no cover - if info: - logger.info( - f"{org_len} input field{cls._plural(org_len)} " - f"aggregated into {n} field{cls._plural(n)}" - ) # pragma: no cover + :Returns: - # ---------------------------------------------------------------- - # Sort by netCDF variable name - # ---------------------------------------------------------------- - if len(out) > 1: - out.sort(key=lambda f: f.nc_get_variable("")) + `None` - # ---------------------------------------------------------------- - # Add standard names to UM/PP fields (post aggregation) - # ---------------------------------------------------------------- - for f in out: - standard_name = f._custom.get("standard_name", None) - if standard_name is not None: - f.set_property("standard_name", standard_name, copy=False) - del f._custom["standard_name"] + """ + dataset_type = self.dataset_type - # ---------------------------------------------------------------- - # Select matching fields from UM files (post setting of their - # standard names) - # ---------------------------------------------------------------- - if select and "UM" in ftypes: - out = out.select_by_identity(*select) + # ------------------------------------------------------------ + # Try to read as a netCDF dataset + # ------------------------------------------------------------ + super()._read(dataset) - if info: - logger.info( - f"Read {field_counter} field{cls._plural(field_counter)} " - f"from {file_counter} file{cls._plural(file_counter)}" - ) # pragma: no cover + if self.dataset_contents is not None: + # Successfully read the dataset + return - if nfields is not None and len(out) != nfields: - raise ValueError( - f"{nfields} field{cls._plural(nfields)} requested but " - f"{len(out)} field/domain constuct{cls._plural(len(out))}" - f" found in file{cls._plural(file_counter)}" - ) + # ------------------------------------------------------------ + # Try to read as a PP/UM dataset + # ------------------------------------------------------------ + if dataset_type is None or dataset_type.intersection( + self.UM_dataset_types + ): + if not hasattr(self, "um_read"): + # Initialise the UM read function + kwargs = self.kwargs + um_kwargs = { + key: kwargs[key] + for key in ( + "height_at_top_of_model", + "squeeze", + "unsqueeze", + "domain", + "dataset_type", + "unpack", + "verbose", + ) + } + um_kwargs["set_standard_name"] = False + um_kwargs["select"] = self.select + um = self.um + um_kwargs["um_version"] = um.get("version") + um_kwargs["fmt"] = um.get("fmt") + um_kwargs["word_size"] = um.get("word_size") + um_kwargs["endian"] = um.get("endian") + + self.um_read = partial( + 
UMRead(self.implementation).read, **um_kwargs
+                )
+
+            try:
+                # Try to read the dataset
+                self.dataset_contents = self.um_read(dataset)
+            except DatasetTypeError as error:
+                if dataset_type is None:
+                    self.dataset_format_errors.append(error)
+            else:
+                # Successfully read the dataset
+                self.unique_dataset_categories.add("UM")
 
-        return out
+        if self.dataset_contents is not None:
+            # Successfully read the dataset
+            return
 
-    @staticmethod
-    def _plural(n):  # pragma: no cover
-        """Return a suffix which reflects a word's plural."""
-        return "s" if n != 1 else ""  # pragma: no cover
+        # ------------------------------------------------------------
+        # Try to read as a GRIB dataset
+        #
+        # Not yet available. When (if!) the time comes, the framework
+        # will be:
+        # ------------------------------------------------------------
+        #
+        # if dataset_type is None or dataset_type.intersection(
+        #     self.GRIB_dataset_types
+        # ):
+        #     if not hasattr(self, "grib_read"):
+        #         # Initialise the GRIB read function
+        #         kwargs = self.kwargs
+        #         grib_kwargs = ...  #
+        #
+        #         self.grib_read = partial(
+        #             GRIBRead(self.implementation).read, **grib_kwargs
+        #         )
+        #
+        #     try:
+        #         # Try to read the dataset
+        #         self.dataset_contents = self.grib_read(dataset)
+        #     except DatasetTypeError as error:
+        #         if dataset_type is None:
+        #             self.dataset_format_errors.append(error)
+        #     else:
+        #         # Successfully read the dataset
+        #         self.unique_dataset_categories.add("GRIB")
+        #
+        #     if self.dataset_contents is not None:
+        #         # Successfully read the dataset
+        #         return
diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py
index 52ee560d5d..3289ea88b4 100644
--- a/cf/read_write/um/umread.py
+++ b/cf/read_write/um/umread.py
@@ -3407,7 +3407,7 @@ def read(
         squeeze=False,
         unsqueeze=False,
         domain=False,
-        file_type=None,
+        dataset_type=None,
         ignore_unknown_type=False,
         unpack=True,
     ):
@@ -3555,14 +3555,14 @@ def read(
             byte_ordering = None
 
         # ------------------------------------------------------------
-        # Parse the 'file_type' keyword parameter
+        # Parse the 'dataset_type' keyword parameter
         # ------------------------------------------------------------
-        if file_type is not None:
-            if isinstance(file_type, str):
-                file_type = (file_type,)
+        if dataset_type is not None:
+            if isinstance(dataset_type, str):
+                dataset_type = (dataset_type,)
 
-            file_type = set(file_type)
-            if not file_type.intersection(("UM",)):
+            dataset_type = set(dataset_type)
+            if not dataset_type.intersection(("UM",)):
                 # Return now if there are valid file types
                 return []
diff --git a/cf/test/example_field_0.nc b/cf/test/example_field_0.nc
new file mode 100644
index 0000000000..585bc9bc0c
Binary files /dev/null and b/cf/test/example_field_0.nc differ
diff --git a/cf/test/example_field_0.zarr2/.zattrs b/cf/test/example_field_0.zarr2/.zattrs
new file mode 100644
index 0000000000..eaf2177785
--- /dev/null
+++ b/cf/test/example_field_0.zarr2/.zattrs
@@ -0,0 +1,3 @@
+{
+    "Conventions": "CF-1.12"
+}
\ No newline at end of file
diff --git a/cf/test/example_field_0.zarr2/.zgroup b/cf/test/example_field_0.zarr2/.zgroup
new file mode 100644
index 0000000000..3b7daf227c
--- /dev/null
+++ b/cf/test/example_field_0.zarr2/.zgroup
@@ -0,0 +1,3 @@
+{
+    "zarr_format": 2
+}
\ No newline at end of file
diff --git a/cf/test/example_field_0.zarr2/.zmetadata b/cf/test/example_field_0.zarr2/.zmetadata
new file mode 100644
index 0000000000..79ba165c09
--- /dev/null
+++ b/cf/test/example_field_0.zarr2/.zmetadata
@@ -0,0 +1,171 @@
+{
+    "metadata": {
+        ".zattrs": {
+            "Conventions": "CF-1.12"
+        },
+        
".zgroup": { + "zarr_format": 2 + }, + "lat/.zarray": { + "chunks": [ + 5 + ], + "compressor": { + "blocksize": 0, + "clevel": 5, + "cname": "lz4", + "id": "blosc", + "shuffle": 1 + }, + "dtype": "", tempf] + command_to_run = ["ncdump", "example_field_0.nc", ">", tempf] if option: command_to_run.insert(1, option) @@ -720,12 +727,15 @@ def test_read_cdl_string(self): for cdl_input in (cdl_string_1, (cdl_string_1,)): f_from_str = cf.read(cdl_input, cdl_string=True) self.assertEqual(len(f_from_str), 1) - self.assertEqual(f_from_str[0], f) + if not option: + self.assertTrue(f_from_str[0].equals(f)) - # Check compatibility with the 'file_type' kwarg. - for file_type in ("netCDF", "CDL", "UM", ()): - with self.assertRaises(ValueError): - cf.read(cdl_string_1, cdl_string=True, file_type=file_type) + # Check compatibility with the 'dataset_type' kwarg. + f_from_str = cf.read(cdl_string_1, cdl_string=True, dataset_type="CDL") + self.assertEqual(len(f_from_str), 1) + + with self.assertRaises(ValueError): + cf.read(cdl_string_1, cdl_string=True, dataset_type="netCDF") # If the user forgets the cdl_string=True argument they will # accidentally attempt to create a file with a very long name @@ -855,9 +865,9 @@ def test_write_omit_data(self): self.assertFalse(np.ma.count(g.array)) self.assertTrue(np.ma.count(g.construct("grid_latitude").array)) - @unittest.skipUnless( - True, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL" - ) + # @unittest.skipUnless( + # True, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL" + # ) def test_read_url(self): """Test reading urls.""" for scheme in ("http", "https"): @@ -866,21 +876,20 @@ def test_read_url(self): f = cf.read(remote) self.assertEqual(len(f), 1) - def test_read_file_type(self): - """Test the cf.read 'file_type' keyword.""" - # netCDF file - for file_type in ( + def test_read_dataset_type(self): + """Test the cf.read 'dataset_type' keyword.""" + # netCDF dataset + for dataset_type in ( None, "netCDF", ("netCDF",), ("netCDF", "CDL"), - ("netCDF", "bad value"), ): - f = cf.read(self.filename, file_type=file_type) + f = cf.read(self.filename, dataset_type=dataset_type) self.assertEqual(len(f), 1) - for file_type in ("CDL", "bad value", ()): - f = cf.read(self.filename, file_type=file_type) + for dataset_type in ("CDL", ("CDL", "Zarr"), ()): + f = cf.read(self.filename, dataset_type=dataset_type) self.assertEqual(len(f), 0) # CDL file @@ -889,36 +898,50 @@ def test_read_file_type(self): shell=True, check=True, ) - for file_type in ( + for dataset_type in ( None, "CDL", - ("netCDF", "CDL"), - ("CDL", "bad value"), + ("CDL", "netCDF"), ): - f = cf.read(tmpfile, file_type=file_type) - self.assertEqual(len(f), 1) - - for file_type in ("netCDF", "bad value", ()): - f = cf.read(tmpfile, file_type=file_type) - self.assertEqual(len(f), 0) - - # UM file - for file_type in (None, "UM", ("UM",), ("UM", "bad value")): - f = cf.read("file1.pp", file_type=file_type) + f = cf.read(tmpfile, dataset_type=dataset_type) self.assertEqual(len(f), 1) - for file_type in ("netCDF", "bad value", ()): - f = cf.read("file1.pp", file_type=file_type) + for dataset_type in ("netCDF", ()): + f = cf.read(tmpfile, dataset_type=dataset_type) self.assertEqual(len(f), 0) - # Not a netCDF, CDL, or UM file + # Not a netCDF or CDL file with self.assertRaises(DatasetTypeError): f = cf.read("test_read_write.py") - for file_type in ("netCDF", "CDL", "bad value", ()): - f = cf.read("test_read_write.py", file_type=file_type) + for dataset_type in ("netCDF", ()): + f = 
cf.read("test_read_write.py", dataset_type=dataset_type) self.assertEqual(len(f), 0) + # Bad values + for dataset_type in ("bad value", ("bad value", "netCDF")): + with self.assertRaises(ValueError): + cf.read(self.filename, dataset_type=dataset_type) + + def test_read_zarr(self): + """Test the cf.read of a zarr dataset.""" + n = cf.read("example_field_0.nc")[0] + for zarr_dataset in (self.zarr2, self.zarr3): + z = cf.read(zarr_dataset, dask_chunks=3) + self.assertEqual(len(z), 1) + z = z[0] + self.assertTrue(z.equals(n)) + + cf.write(z, tmpfile) + n2 = cf.read(tmpfile)[0] + self.assertTrue(n2.equals(n)) + + z = cf.read(zarr_dataset, dataset_type="netCDF") + self.assertEqual(len(z), 0) + + z = cf.read(zarr_dataset, dataset_type="Zarr") + self.assertEqual(len(z), 1) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/docs/source/class.rst b/docs/source/class.rst index 70b9730095..d7941ac067 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -83,6 +83,7 @@ Data classes cf.NetCDF4Array cf.FullArray cf.UMArray + cf.ZarrArray Data compression classes ------------------------ diff --git a/docs/source/class/cf.AuxiliaryCoordinate.rst b/docs/source/class/cf.AuxiliaryCoordinate.rst index 0b2847c661..e5af934219 100644 --- a/docs/source/class/cf.AuxiliaryCoordinate.rst +++ b/docs/source/class/cf.AuxiliaryCoordinate.rst @@ -502,9 +502,9 @@ NetCDF ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable_groups ~cf.AuxiliaryCoordinate.nc_clear_node_coordinate_variable_groups - ~cf.AuxiliaryCoordinate.nc_clear_hdf5_chunksizes - ~cf.AuxiliaryCoordinate.nc_hdf5_chunksizes - ~cf.AuxiliaryCoordinate.nc_set_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_clear_dataset_chunksizes + ~cf.AuxiliaryCoordinate.nc_dataset_chunksizes + ~cf.AuxiliaryCoordinate.nc_set_dataset_chunksizes Groups ^^^^^^ @@ -731,6 +731,9 @@ Deprecated ~cf.AuxiliaryCoordinate.remove_data ~cf.AuxiliaryCoordinate.select ~cf.AuxiliaryCoordinate.setprop + ~cf.AuxiliaryCoordinate.nc_clear_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_set_hdf5_chunksizes .. 
rubric:: Attributes diff --git a/docs/source/class/cf.Bounds.rst b/docs/source/class/cf.Bounds.rst index ea6558c98d..98d8caec77 100644 --- a/docs/source/class/cf.Bounds.rst +++ b/docs/source/class/cf.Bounds.rst @@ -414,9 +414,9 @@ NetCDF ~cf.Bounds.nc_get_dimension ~cf.Bounds.nc_has_dimension ~cf.Bounds.nc_set_dimension - ~cf.Bounds.nc_clear_hdf5_chunksizes - ~cf.Bounds.nc_hdf5_chunksizes - ~cf.Bounds.nc_set_hdf5_chunksizes + ~cf.Bounds.nc_clear_dataset_chunksizes + ~cf.Bounds.nc_dataset_chunksizes + ~cf.Bounds.nc_set_dataset_chunksizes Aggregation ----------- @@ -651,3 +651,6 @@ Deprecated ~cf.Bounds.select ~cf.Bounds.setprop ~cf.Bounds.unsafe_array + ~cf.Bounds.nc_clear_hdf5_chunksizes + ~cf.Bounds.nc_hdf5_chunksizes + ~cf.Bounds.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.CellConnectivity.rst b/docs/source/class/cf.CellConnectivity.rst index 969ebab2bd..0723d8f9af 100644 --- a/docs/source/class/cf.CellConnectivity.rst +++ b/docs/source/class/cf.CellConnectivity.rst @@ -431,9 +431,9 @@ NetCDF ~cf.CellConnectivity.nc_get_variable ~cf.CellConnectivity.nc_has_variable ~cf.CellConnectivity.nc_set_variable - ~cf.CellConnectivity.nc_clear_hdf5_chunksizes - ~cf.CellConnectivity.nc_hdf5_chunksizes - ~cf.CellConnectivity.nc_set_hdf5_chunksizes + ~cf.CellConnectivity.nc_clear_dataset_chunksizes + ~cf.CellConnectivity.nc_dataset_chunksizes + ~cf.CellConnectivity.nc_set_dataset_chunksizes Aggregation ----------- @@ -477,8 +477,8 @@ Groups ~cf.CellConnectivity.nc_clear_variable_groups ~cf.CellConnectivity.nc_set_variable_groups -HDF5 chunks -^^^^^^^^^^^ +Dataset chunks +^^^^^^^^^^^^^^ .. rubric:: Methods @@ -487,9 +487,9 @@ HDF5 chunks :toctree: ../method/ :template: method.rst - ~cf.CellConnectivity.nc_hdf5_chunksizes - ~cf.CellConnectivity.nc_set_hdf5_chunksizes - ~cf.CellConnectivity.nc_clear_hdf5_chunksizes + ~cf.CellConnectivity.nc_dataset_chunksizes + ~cf.CellConnectivity.nc_set_dataset_chunksizes + ~cf.CellConnectivity.nc_clear_dataset_chunksizes Aliases ------- @@ -583,3 +583,7 @@ Deprecated ~cf.CellConnectivity.select ~cf.CellConnectivity.setprop ~cf.CellConnectivity.unsafe_array + ~cf.CellConnectivity.nc_clear_hdf5_chunksizes + ~cf.CellConnectivity.nc_hdf5_chunksizes + ~cf.CellConnectivity.nc_set_hdf5_chunksizes + diff --git a/docs/source/class/cf.CellMeasure.rst b/docs/source/class/cf.CellMeasure.rst index 02b4871231..bdb3ef502e 100644 --- a/docs/source/class/cf.CellMeasure.rst +++ b/docs/source/class/cf.CellMeasure.rst @@ -434,9 +434,9 @@ NetCDF ~cf.CellMeasure.nc_set_variable ~cf.CellMeasure.nc_get_external ~cf.CellMeasure.nc_set_external - ~cf.CellMeasure.nc_clear_hdf5_chunksizes - ~cf.CellMeasure.nc_hdf5_chunksizes - ~cf.CellMeasure.nc_set_hdf5_chunksizes + ~cf.CellMeasure.nc_clear_dataset_chunksizes + ~cf.CellMeasure.nc_dataset_chunksizes + ~cf.CellMeasure.nc_set_dataset_chunksizes Aggregation ----------- @@ -676,3 +676,6 @@ Deprecated ~cf.CellMeasure.select ~cf.CellMeasure.setprop ~cf.CellMeasure.unsafe_array + ~cf.CellMeasure.nc_clear_hdf5_chunksizes + ~cf.CellMeasure.nc_hdf5_chunksizes + ~cf.CellMeasure.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.Count.rst b/docs/source/class/cf.Count.rst index 43017e9d5b..011938c558 100644 --- a/docs/source/class/cf.Count.rst +++ b/docs/source/class/cf.Count.rst @@ -411,9 +411,9 @@ NetCDF ~cf.Count.nc_get_sample_dimension ~cf.Count.nc_has_sample_dimension ~cf.Count.nc_set_sample_dimension - ~cf.Count.nc_clear_hdf5_chunksizes - ~cf.Count.nc_hdf5_chunksizes - ~cf.Count.nc_set_hdf5_chunksizes + 
~cf.Count.nc_clear_dataset_chunksizes + ~cf.Count.nc_dataset_chunksizes + ~cf.Count.nc_set_dataset_chunksizes Aggregation ----------- @@ -660,3 +660,6 @@ Deprecated ~cf.Count.select ~cf.Count.setprop ~cf.Count.unsafe_array + ~cf.Count.nc_clear_hdf5_chunksizes + ~cf.Count.nc_hdf5_chunksizes + ~cf.Count.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst index df1e94cf98..488ea75df8 100644 --- a/docs/source/class/cf.Data.rst +++ b/docs/source/class/cf.Data.rst @@ -639,9 +639,9 @@ Performance :toctree: ../method/ :template: method.rst - ~cf.Data.nc_clear_hdf5_chunksizes - ~cf.Data.nc_hdf5_chunksizes - ~cf.Data.nc_set_hdf5_chunksizes + ~cf.Data.nc_clear_dataset_chunksizes + ~cf.Data.nc_dataset_chunksizes + ~cf.Data.nc_set_dataset_chunksizes ~cf.Data.rechunk ~cf.Data.close ~cf.Data.chunks @@ -896,7 +896,10 @@ Deprecated ~cf.Data.to_disk ~cf.Data.to_memory ~cf.Data.unsafe_array - + ~cf.Data.nc_clear_hdf5_chunksizes + ~cf.Data.nc_hdf5_chunksizes + ~cf.Data.nc_set_hdf5_chunksizes + .. rubric:: Attributes .. autosummary:: diff --git a/docs/source/class/cf.DimensionCoordinate.rst b/docs/source/class/cf.DimensionCoordinate.rst index 7c46395bc0..d3e4944747 100644 --- a/docs/source/class/cf.DimensionCoordinate.rst +++ b/docs/source/class/cf.DimensionCoordinate.rst @@ -508,9 +508,9 @@ NetCDF ~cf.DimensionCoordinate.nc_get_variable ~cf.DimensionCoordinate.nc_has_variable ~cf.DimensionCoordinate.nc_set_variable - ~cf.DimensionCoordinate.nc_clear_hdf5_chunksizes - ~cf.DimensionCoordinate.nc_hdf5_chunksizes - ~cf.DimensionCoordinate.nc_set_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_clear_dataset_chunksizes + ~cf.DimensionCoordinate.nc_dataset_chunksizes + ~cf.DimensionCoordinate.nc_set_dataset_chunksizes Groups ^^^^^^ @@ -744,6 +744,9 @@ Deprecated ~cf.DimensionCoordinate.remove_data ~cf.DimensionCoordinate.select ~cf.DimensionCoordinate.setprop + ~cf.DimensionCoordinate.nc_clear_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_set_hdf5_chunksizes .. 
rubric:: Attributes diff --git a/docs/source/class/cf.DomainAncillary.rst b/docs/source/class/cf.DomainAncillary.rst index 118b2fe4e1..13f4f78f6f 100644 --- a/docs/source/class/cf.DomainAncillary.rst +++ b/docs/source/class/cf.DomainAncillary.rst @@ -463,9 +463,9 @@ NetCDF ~cf.DomainAncillary.nc_get_variable ~cf.DomainAncillary.nc_has_variable ~cf.DomainAncillary.nc_set_variable - ~cf.DomainAncillary.nc_clear_hdf5_chunksizes - ~cf.DomainAncillary.nc_hdf5_chunksizes - ~cf.DomainAncillary.nc_set_hdf5_chunksizes + ~cf.DomainAncillary.nc_clear_dataset_chunksizes + ~cf.DomainAncillary.nc_dataset_chunksizes + ~cf.DomainAncillary.nc_set_dataset_chunksizes Aggregation ----------- @@ -706,3 +706,6 @@ Deprecated ~cf.DomainAncillary.select ~cf.DomainAncillary.setprop ~cf.DomainAncillary.unsafe_array + ~cf.DomainAncillary.nc_clear_hdf5_chunksizes + ~cf.DomainAncillary.nc_hdf5_chunksizes + ~cf.DomainAncillary.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.DomainTopology.rst b/docs/source/class/cf.DomainTopology.rst index a9b7d7bd32..9e7e84e22b 100644 --- a/docs/source/class/cf.DomainTopology.rst +++ b/docs/source/class/cf.DomainTopology.rst @@ -432,9 +432,9 @@ NetCDF ~cf.DomainTopology.nc_get_variable ~cf.DomainTopology.nc_has_variable ~cf.DomainTopology.nc_set_variable - ~cf.DomainTopology.nc_clear_hdf5_chunksizes - ~cf.DomainTopology.nc_hdf5_chunksizes - ~cf.DomainTopology.nc_set_hdf5_chunksizes + ~cf.DomainTopology.nc_clear_dataset_chunksizes + ~cf.DomainTopology.nc_dataset_chunksizes + ~cf.DomainTopology.nc_set_dataset_chunksizes Aggregation ----------- @@ -478,8 +478,8 @@ Groups ~cf.DomainTopology.nc_clear_variable_groups ~cf.DomainTopology.nc_set_variable_groups -HDF5 chunks -^^^^^^^^^^^ +Dataset chunks +^^^^^^^^^^^^^^ .. rubric:: Methods @@ -488,9 +488,9 @@ HDF5 chunks :toctree: ../method/ :template: method.rst - ~cf.DomainTopology.nc_hdf5_chunksizes - ~cf.DomainTopology.nc_set_hdf5_chunksizes - ~cf.DomainTopology.nc_clear_hdf5_chunksizes + ~cf.DomainTopology.nc_dataset_chunksizes + ~cf.DomainTopology.nc_set_dataset_chunksizes + ~cf.DomainTopology.nc_clear_dataset_chunksizes Aliases ------- @@ -584,3 +584,6 @@ Deprecated ~cf.DomainTopology.select ~cf.DomainTopology.setprop ~cf.DomainTopology.unsafe_array + ~cf.DomainTopology.nc_clear_hdf5_chunksizes + ~cf.DomainTopology.nc_hdf5_chunksizes + ~cf.DomainTopology.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.Field.rst b/docs/source/class/cf.Field.rst index ecf849dabf..72f9780687 100644 --- a/docs/source/class/cf.Field.rst +++ b/docs/source/class/cf.Field.rst @@ -417,9 +417,9 @@ NetCDF ~cf.Field.nc_set_global_attribute ~cf.Field.nc_set_global_attributes ~cf.Field.ncdimensions - ~cf.Field.nc_clear_hdf5_chunksizes - ~cf.Field.nc_hdf5_chunksizes - ~cf.Field.nc_set_hdf5_chunksizes + ~cf.Field.nc_clear_dataset_chunksizes + ~cf.Field.nc_dataset_chunksizes + ~cf.Field.nc_set_dataset_chunksizes Groups ^^^^^^ @@ -1036,6 +1036,9 @@ Deprecated ~cf.Field.setprop ~cf.Field.transpose_item ~cf.Field.unlimited + ~cf.Field.nc_clear_hdf5_chunksizes + ~cf.Field.nc_hdf5_chunksizes + ~cf.Field.nc_set_hdf5_chunksizes .. 
rubric:: Attributes diff --git a/docs/source/class/cf.FieldAncillary.rst b/docs/source/class/cf.FieldAncillary.rst index e83f3c2eae..679574045a 100644 --- a/docs/source/class/cf.FieldAncillary.rst +++ b/docs/source/class/cf.FieldAncillary.rst @@ -407,9 +407,9 @@ NetCDF ~cf.FieldAncillary.nc_get_variable ~cf.FieldAncillary.nc_has_variable ~cf.FieldAncillary.nc_set_variable - ~cf.FieldAncillary.nc_clear_hdf5_chunksizes - ~cf.FieldAncillary.nc_hdf5_chunksizes - ~cf.FieldAncillary.nc_set_hdf5_chunksizes + ~cf.FieldAncillary.nc_clear_dataset_chunksizes + ~cf.FieldAncillary.nc_dataset_chunksizes + ~cf.FieldAncillary.nc_set_dataset_chunksizes Aggregation ----------- @@ -648,3 +648,6 @@ Deprecated ~cf.FieldAncillary.select ~cf.FieldAncillary.setprop ~cf.FieldAncillary.unsafe_array + ~cf.FieldAncillary.nc_clear_hdf5_chunksizes + ~cf.FieldAncillary.nc_hdf5_chunksizes + ~cf.FieldAncillary.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.Index.rst b/docs/source/class/cf.Index.rst index 339fc61099..4a9a431127 100644 --- a/docs/source/class/cf.Index.rst +++ b/docs/source/class/cf.Index.rst @@ -412,9 +412,9 @@ NetCDF ~cf.Index.nc_get_sample_dimension ~cf.Index.nc_has_sample_dimension ~cf.Index.nc_set_sample_dimension - ~cf.Index.nc_clear_hdf5_chunksizes - ~cf.Index.nc_hdf5_chunksizes - ~cf.Index.nc_set_hdf5_chunksizes + ~cf.Index.nc_clear_dataset_chunksizes + ~cf.Index.nc_dataset_chunksizes + ~cf.Index.nc_set_dataset_chunksizes Aggregation ----------- @@ -661,3 +661,6 @@ Deprecated ~cf.Index.select ~cf.Index.setprop ~cf.Index.unsafe_array + ~cf.Index.nc_clear_hdf5_chunksizes + ~cf.Index.nc_hdf5_chunksizes + ~cf.Index.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.List.rst b/docs/source/class/cf.List.rst index 52cd8e4193..4f343ecc4c 100644 --- a/docs/source/class/cf.List.rst +++ b/docs/source/class/cf.List.rst @@ -404,9 +404,9 @@ NetCDF ~cf.List.nc_get_variable ~cf.List.nc_has_variable ~cf.List.nc_set_variable - ~cf.List.nc_clear_hdf5_chunksizes - ~cf.List.nc_hdf5_chunksizes - ~cf.List.nc_set_hdf5_chunksizes + ~cf.List.nc_clear_dataset_chunksizes + ~cf.List.nc_dataset_chunksizes + ~cf.List.nc_set_dataset_chunksizes Aggregation ----------- @@ -647,3 +647,6 @@ Deprecated ~cf.List.select ~cf.List.setprop ~cf.List.unsafe_array + ~cf.List.nc_clear_hdf5_chunksizes + ~cf.List.nc_hdf5_chunksizes + ~cf.List.nc_set_hdf5_chunksizes diff --git a/docs/source/class/cf.ZarrArray.rst b/docs/source/class/cf.ZarrArray.rst new file mode 100644 index 0000000000..7740d90d7e --- /dev/null +++ b/docs/source/class/cf.ZarrArray.rst @@ -0,0 +1,150 @@ +.. currentmodule:: cf +.. default-role:: obj + +cf.ZarrArray +============ + +---- + +.. autoclass:: cf.ZarrArray + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.get_compression_type + ~cf.ZarrArray.get_subspace + ~cf.ZarrArray.get_attributes + ~cf.ZarrArray.index + ~cf.ZarrArray.is_subspace + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.ZarrArray.array + ~cf.ZarrArray.astype + ~cf.ZarrArray.dtype + ~cf.ZarrArray.ndim + ~cf.ZarrArray.shape + ~cf.ZarrArray.size + ~cf.ZarrArray.original_shape + ~cf.ZarrArray.reference_shape + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.get_calendar + ~cf.ZarrArray.get_units + ~cf.ZarrArray.Units + + + +File +---- + +.. 
rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.get_address + ~cf.ZarrArray.get_addresses + ~cf.ZarrArray.close + ~cf.ZarrArray.open + ~cf.ZarrArray.get_filename + ~cf.ZarrArray.get_filenames + ~cf.ZarrArray.get_format + ~cf.ZarrArray.get_formats + ~cf.ZarrArray.get_groups + ~cf.ZarrArray.get_mask + ~cf.ZarrArray.get_unpack + ~cf.ZarrArray.get_storage_options + ~cf.ZarrArray.add_file_location + ~cf.ZarrArray.del_file_location + ~cf.ZarrArray.file_locations + ~cf.ZarrArray.file_directory + ~cf.ZarrArray.replace_directory + ~cf.ZarrArray.replace_filename + ~cf.ZarrArray._lock + + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.copy + ~cf.ZarrArray.to_memory + +Active storage +-------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.active_storage + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.__dask_tokenize__ + ~cf.ZarrArray.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray._docstring_special_substitutions + ~cf.ZarrArray._docstring_substitutions + ~cf.ZarrArray._docstring_package_depth + ~cf.ZarrArray._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.ZarrArray.filename + ~cf.ZarrArray.get_missing_values diff --git a/docs/source/conf.py b/docs/source/conf.py index eaaf3da986..aa32bf1717 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -157,6 +157,7 @@ def _get_date(): "matplotlib": ("https://matplotlib.org/stable/", None), # REVIEW: h5: new intersphinx mapping "h5netcdf": ("https://h5netcdf.org", None), + "zarr": ("https://zarr.readthedocs.io", None), } # This extension is meant to help with the common pattern of having diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index b55d242488..1e9c66ed0d 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -70,6 +70,8 @@ manipulation and can: * read files from OPeNDAP servers and S3 object stores, +* be fully flexible with respect to dataset chunking, + * create new field constructs in memory, * write and append field constructs to netCDF datasets on disk, diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 326d93e608..c741aed4e1 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -130,6 +130,14 @@ The following file types can be read: with or without the data array values. .. + +* Datasets in `Zarr v2 (xarray-style) + `_ + and `Zarr v3 + `_ + formats. + +.. * `CFA-netCDF `_ @@ -193,7 +201,7 @@ replacing any file name with a directory name. An attempt will be made to read all files in the directory, which will result in an error if any have a non-supported format. Non-supported files may be ignored by being more specific about the file type intended for reading in -using the *file_type* keyword: +using the *dataset_type* keyword: .. code-block:: python :caption: *Read all of the files in the current working directory.* @@ -202,7 +210,7 @@ using the *file_type* keyword: Traceback (most recent call last): ... 
Exception: Can't determine format of file cf_tutorial_files.zip - >>> y = cf.read('$PWD', file_type='netCDF') + >>> y = cf.read('$PWD', dataset_type='netCDF') >>> len(y) 15 diff --git a/setup.py b/setup.py index 498a6f8eef..706d89af63 100755 --- a/setup.py +++ b/setup.py @@ -177,7 +177,9 @@ def compile(): The ``cf`` package can: -* read field constructs from netCDF, CDL, PP and UM datasets, +* read field constructs from netCDF, CDL, Zarr, PP and UM datasets, + +* be fully flexible with respect to dataset storage chunking, * create new field constructs in memory,
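
The headline feature of this changeset is Zarr reading. A minimal usage
sketch, assuming the test fixtures shipped here (``example_field_0.zarr2``
is the Zarr v2 store added by this PR) and the ``dataset_type`` values
defined above:

.. code-block:: python

   import cf

   # By default (dataset_type=None) cf.read detects netCDF, CDL,
   # Zarr, PP and UM formats automatically.
   fields = cf.read("example_field_0.zarr2")

   # Restrict reading to Zarr stores only; as the updated tests show,
   # a non-matching dataset_type yields zero constructs rather than
   # raising DatasetTypeError.
   fields = cf.read("example_field_0.zarr2", dataset_type="Zarr")
   print(fields[0])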
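The ``nc_*_hdf5_chunksizes`` methods are renamed to
``nc_*_dataset_chunksizes`` throughout, with the old names retained only
as deprecations. A sketch of the renamed API on a `Data` object, assuming
the behaviour is unchanged apart from the names:

.. code-block:: python

   import cf

   d = cf.example_field(0).data  # a 5 x 8 (latitude x longitude) field

   d.nc_set_dataset_chunksizes([5, 4])  # store a per-dimension strategy
   print(d.nc_dataset_chunksizes())     # e.g. (5, 4)
   d.nc_clear_dataset_chunksizes()      # remove the stored strategy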
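Finally, the updated ``test_read_cdl_string`` pins down how the
``cdl_string`` keyword composes with ``dataset_type``: ``'CDL'`` is
accepted (if redundant), while any other value raises `ValueError`. A
sketch, where ``cdl_text`` is a hypothetical string holding an
``ncdump``-style CDL dump:

.. code-block:: python

   import cf

   # cdl_text: hypothetical CDL text, e.g. captured from
   # "ncdump example_field_0.nc".
   fields = cf.read(cdl_text, cdl_string=True, dataset_type="CDL")

   # Raises ValueError: only "CDL" is consistent with cdl_string=True.
   # cf.read(cdl_text, cdl_string=True, dataset_type="netCDF")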