diff --git a/Changelog.rst b/Changelog.rst
index 1bbaf8583d..ca1a0b96bd 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -1,10 +1,15 @@
 Version NEXTVERSION
--------------------
+-------------------
 
-**2025-12-??**
+**2026-01-??**
 
+* Write Zarr v3 datasets with `cf.write`
+  (https://github.com/NCAS-CMS/cf-python/issues/895)
+* Read Zarr v2 and v3 datasets that contain a group hierarchy with
+  `cf.read` (https://github.com/NCAS-CMS/cf-python/issues/894)
 * Reduce the time taken to import `cf`
   (https://github.com/NCAS-CMS/cf-python/issues/902)
+* New optional dependency: ``zarr>=3.1.3``
 
 ----
 
diff --git a/README.md b/README.md
index 8829d96f6a..140e6faeb9 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,8 @@ of its array manipulation and can:
 * read field constructs from netCDF, CDL, Zarr, PP and UM datasets
   with a choice of netCDF backends, and in local, http, and s3 locations,
 * create new field constructs in memory,
-* write and append field and domain constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr v3
+  datasets on disk,
 * read, create, and manipulate UGRID mesh topologies,
 * read, write, and create coordinates defined by geometry cells,
 * read netCDF and CDL datasets containing hierarchical groups,
diff --git a/cf/functions.py b/cf/functions.py
index f63b037305..daf30b8560 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -3184,7 +3184,7 @@ def environment(display=True, paths=True):
         netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py
         h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py
         h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py
-        zarr: 3.0.8 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py
+        zarr: 3.1.3 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py
         s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py
         scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py
         dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py
@@ -3210,7 +3210,7 @@ def environment(display=True, paths=True):
         netCDF4: 1.7.2
         h5netcdf: 1.3.0
         h5py: 3.12.1
-        zarr: 3.0.8
+        zarr: 3.1.3
         s3fs: 2024.12.0
         scipy: 1.15.1
         dask: 2025.5.1
diff --git a/cf/read_write/read.py b/cf/read_write/read.py
index ba3cd8469b..803a4d8e39 100644
--- a/cf/read_write/read.py
+++ b/cf/read_write/read.py
@@ -316,6 +316,10 @@ class read(cfdm.read):
 
             .. versionadded:: 3.17.0
 
+        {{read store_dataset_shards: `bool`, optional}}
+
+            .. versionadded:: NEXTVERSION
+
         {{read cfa: `dict`, optional}}
 
             .. versionadded:: 3.15.0
@@ -328,6 +332,10 @@ class read(cfdm.read):
 
             .. versionadded:: 3.17.0
 
+        {{read group_dimension_search: `str`, optional}}
+
+            .. versionadded:: (cfdm) NEXTVERSION
+
         umversion: deprecated at version 3.0.0
             Use the *um* parameter instead.
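For illustration, a minimal usage sketch of the two new `cf.read` keywords documented in the hunk above. The keyword names and the `"closest_ancestor"` default are taken from this diff; the dataset path is hypothetical.

```python
import cf

# Read a (hypothetical) grouped, sharded Zarr store: keep the
# dataset's chunk layout on the returned data but discard its shard
# layout, and search for group dimensions starting from the closest
# ancestor group (the default)
fields = cf.read(
    "forecast.zarr",
    store_dataset_shards=False,
    group_dimension_search="closest_ancestor",
)
print(fields)
```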
@@ -434,6 +442,7 @@ def __new__(
         warn_valid=False,
         dask_chunks="storage-aligned",
         store_dataset_chunks=True,
+        store_dataset_shards=True,
         domain=False,
         cfa=None,
         cfa_write=None,
@@ -445,6 +454,7 @@ def __new__(
         ignore_read_error=False,
         fmt=None,
         file_type=None,
+        group_dimension_search="closest_ancestor",
     ):
         """Read field or domain constructs from a dataset."""
         kwargs = locals()
diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py
index 8f21984094..9daf1c4612 100644
--- a/cf/read_write/um/umread.py
+++ b/cf/read_write/um/umread.py
@@ -3575,7 +3575,7 @@ def read(
             # Return now if there are valid file types
             return []
 
-        f = self.file_open(filename, parse=True)
+        f = self.dataset_open(filename, parse=True)
 
         info = is_log_level_info(logger)
 
@@ -3598,7 +3598,7 @@ def read(
             for var in f.vars
         ]
 
-        self.file_close()
+        self.dataset_close()
 
         return [field for x in um for field in x.fields if field]
 
@@ -3632,7 +3632,7 @@ def _open_um_file(
             The open PP or FF file object.
 
         """
-        self.file_close()
+        self.dataset_close()
         try:
             f = File(
                 filename,
@@ -3678,15 +3678,15 @@ def is_um_file(self, filename):
         try:
             # Note: No need to completely parse the file to ascertain
             # if it's PP or FF.
-            self.file_open(filename, parse=False)
+            self.dataset_open(filename, parse=False)
         except Exception:
-            self.file_close()
+            self.dataset_close()
             return False
         else:
-            self.file_close()
+            self.dataset_close()
             return True
 
-    def file_close(self):
+    def dataset_close(self):
         """Close the file that has been read.
 
         :Returns:
@@ -3700,7 +3700,7 @@ def file_close(self):
         """
         self._um_file = None
 
-    def file_open(self, filename, parse=True):
+    def dataset_open(self, filename, parse=True):
         """Open the file for reading.
 
         :Parameters:
diff --git a/cf/test/test_zarr.py b/cf/test/test_zarr.py
new file mode 100644
index 0000000000..2310a83953
--- /dev/null
+++ b/cf/test/test_zarr.py
@@ -0,0 +1,333 @@
+import atexit
+import datetime
+import faulthandler
+import os
+import shutil
+import tempfile
+import unittest
+
+faulthandler.enable()  # to debug seg faults and timeouts
+
+import zarr
+
+import cf
+
+warnings = False
+
+# Set up temporary directories
+tmpdirs = [
+    tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) for i in range(2)
+]
+[tmpdir1, tmpdir2] = tmpdirs
+
+# Set up temporary files
+tmpfiles = [
+    tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] for i in range(2)
+]
+[tmpfile1, tmpfile2] = tmpfiles
+
+
+def _remove_tmpdirs():
+    """Remove temporary files and directories created during tests."""
+    for f in tmpfiles:
+        try:
+            os.remove(f)
+        except OSError:
+            pass
+
+    for d in tmpdirs:
+        try:
+            shutil.rmtree(d)
+        except OSError:
+            pass
+
+
+atexit.register(_remove_tmpdirs)
+
+
+class read_writeTest(unittest.TestCase):
+    """Test the reading and writing of field constructs from/to disk."""
+
+    f0 = cf.example_field(0)
+
+    def setUp(self):
+        """Preparations called immediately before each test method."""
+        # Disable log messages to silence expected warnings
+        cf.LOG_LEVEL("DISABLE")
+        # Note: to enable all messages for given methods, lines or
+        # calls (those without a 'verbose' option to do the same)
+        # e.g. to debug them, wrap them (for methods, start-to-end
+        # internally) as follows: cf.LOG_LEVEL('DEBUG')
+        #
+        # < ... test code ... >
+        # cf.log_level('DISABLE')
+
+    def test_zarr_read_write_1(self):
+        """Test Zarr read/write on example fields."""
+        for i, f in enumerate(cf.example_fields()):
+            if i in (8, 9, 10):
+                # Can't write UGRID yet
+                continue
+
+            cf.write(f, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)
+            self.assertEqual(len(z), 1)
+            z = z[0]
+            self.assertTrue(z.equals(f))
+
+            # Check that the Zarr and netCDF4 encodings are equivalent
+            cf.write(f, tmpfile1, fmt="NETCDF4")
+            n = cf.read(tmpfile1)[0]
+            self.assertTrue(z.equals(n))
+
+    def test_zarr_read_write_2(self):
+        """Test Zarr read/write on various netCDF files."""
+        for filename in (
+            "DSG_timeSeries_contiguous.nc",
+            "DSG_timeSeries_indexed.nc",
+            "DSG_timeSeriesProfile_indexed_contiguous.nc",
+            "gathered.nc",
+            "geometry_1.nc",
+            "geometry_2.nc",
+            "geometry_3.nc",
+            "geometry_4.nc",
+            "string_char.nc",
+        ):
+            n = cf.read(filename)
+            cf.write(n, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)
+            self.assertEqual(len(z), len(n))
+            for a, b in zip(z, n):
+                self.assertTrue(a.equals(b))
+
+    def test_zarr_read_write_chunks_shards(self):
+        """Test Zarr read/write with chunks and shards."""
+        f = self.f0.copy()
+        f.data.nc_set_dataset_chunksizes([2, 3])
+
+        cf.write(f, tmpdir1, fmt="ZARR3")
+        z = cf.read(tmpdir1)[0]
+        self.assertTrue(z.equals(f))
+
+        z = zarr.open(tmpdir1)
+        self.assertEqual(z["q"].chunks, (2, 3))
+        self.assertIsNone(z["q"].shards)
+
+        # Make shards comprising 4 chunks
+        cf.write(f, tmpdir1, fmt="ZARR3", dataset_shards=4)
+        z = cf.read(tmpdir1, store_dataset_shards=False)[0]
+        self.assertTrue(z.equals(f))
+        self.assertIsNone(z.data.nc_dataset_shards())
+
+        z = zarr.open(tmpdir1)
+        self.assertEqual(z["q"].chunks, (2, 3))
+        self.assertEqual(z["q"].shards, (4, 6))
+
+        for shards in (4, [2, 2]):
+            f.data.nc_set_dataset_shards(shards)
+            cf.write(f, tmpdir1, fmt="ZARR3")
+            z = cf.read(tmpdir1)[0]
+            self.assertTrue(z.equals(f))
+            self.assertEqual(z.data.nc_dataset_shards(), (2, 2))
+
+            z = zarr.open(tmpdir1)
+            self.assertEqual(z["q"].chunks, (2, 3))
+            self.assertEqual(z["q"].shards, (4, 6))
+
+    def test_zarr_read_write_CFA(self):
+        """Test CF aggregation in Zarr."""
+        f = self.f0
+
+        cf.write(f, tmpdir1, fmt="ZARR3")
+        cf.write(f, tmpfile1, fmt="NETCDF4")
+
+        z = cf.read(tmpdir1, cfa_write="field")[0]
+        n = cf.read(tmpfile1, cfa_write="field")[0]
+
+        self.assertTrue(z.equals(f))
+        self.assertTrue(z.equals(n))
+
+        cf.write(z, tmpdir2, fmt="ZARR3", cfa="field")
+        cf.write(n, tmpfile2, fmt="NETCDF4", cfa="field")
+
+        z = cf.read(tmpdir2)[0]
+        n = cf.read(tmpfile2)[0]
+
+        self.assertTrue(z.equals(f))
+        self.assertTrue(z.equals(n))
+
+    def test_zarr_groups_1(self):
+        """Test the general handling of Zarr hierarchical groups."""
+        f = cf.example_field(1)
+
+        # Add a second grid mapping
+        datum = cf.Datum(parameters={"earth_radius": 7000000})
+        conversion = cf.CoordinateConversion(
+            parameters={"grid_mapping_name": "latitude_longitude"}
+        )
+
+        grid = cf.CoordinateReference(
+            coordinate_conversion=conversion,
+            datum=datum,
+            coordinates=["auxiliarycoordinate0", "auxiliarycoordinate1"],
+        )
+
+        f.set_construct(grid)
+
+        grid0 = f.construct("grid_mapping_name:rotated_latitude_longitude")
+        grid0.del_coordinate("auxiliarycoordinate0")
+        grid0.del_coordinate("auxiliarycoordinate1")
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.construct("grid_latitude").bounds.nc_set_variable_groups(
+            ["forecast"]
+        )
+        for name in (
+            "longitude",  # Auxiliary coordinate
+            "latitude",  # Auxiliary coordinate
+            "long_name=Grid latitude name",  # Auxiliary coordinate
+            "measure:area",  # Cell measure
+            "surface_altitude",  # Domain ancillary
+            "air_temperature standard_error",  # Field ancillary
+            "grid_mapping_name:rotated_latitude_longitude",
+            "time",  # Dimension coordinate
+            "grid_latitude",  # Dimension coordinate
+        ):
+            f.construct(name).nc_set_variable_groups(["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+        # Directly check the groups in the Zarr dataset
+        x = zarr.open(grouped_dir)
+        self.assertEqual(list(x.group_keys()), ["forecast"])
+        self.assertEqual(list(x["forecast"].group_keys()), ["model"])
+
+        cf.write(z, tmpdir2, fmt="ZARR3")
+        z1 = cf.read(tmpdir2)[0]
+        self.assertTrue(z1.equals(f))
+
+    def test_zarr_groups_dimension(self):
+        """Test Zarr group dimensions."""
+        f = self.f0.copy()
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        for construct in f.constructs.filter_by_data().values():
+            construct.nc_set_variable_groups(["forecast"])
+
+        for construct in f.coordinates().values():
+            try:
+                construct.bounds.nc_set_variable_groups(["forecast"])
+            except ValueError:
+                pass
+
+        domain_axis = f.domain_axis("latitude")
+        domain_axis.nc_set_dimension_groups(["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+        # Check that grouped netCDF datasets can only be read with
+        # 'closest_ancestor'
+        cf.read(grouped_file, group_dimension_search="closest_ancestor")
+        for gsn in ("furthest_ancestor", "local", "BAD VALUE"):
+            with self.assertRaises(ValueError):
+                cf.read(grouped_file, group_dimension_search=gsn)
+
+    def test_zarr_groups_DSG(self):
+        """Test Zarr groups containing DSGs."""
+        f = cf.example_field(4)
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        f.compress("indexed_contiguous", inplace=True)
+        f.data.get_count().nc_set_variable("count")
+        f.data.get_index().nc_set_variable("index")
+
+        # Write and then read the field, so that the compression
+        # variables are created on disk
+        cf.write(f, tmpfile2)
+        f = cf.read(tmpfile2)[0]
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.data.get_count().nc_set_variable_groups(["forecast"])
+        f.data.get_index().nc_set_variable_groups(["forecast"])
+        f.construct("altitude").nc_set_variable_groups(["forecast"])
+        f.data.get_count().nc_set_sample_dimension_groups(["forecast"])
+
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)
+        z = cf.read(grouped_dir)
+
+        n = n[0]
+        z = z[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+    def test_zarr_groups_geometry(self):
+        """Test Zarr groups containing cell geometries."""
+        f = cf.example_field(6)
+
+        grouped_dir = tmpdir1
+        grouped_file = tmpfile1
+
+        cf.write(f, tmpfile2)
+        f = cf.read(tmpfile2)[0]
+
+        # Set some groups
+        f.nc_set_variable_groups(["forecast", "model"])
+        f.nc_set_geometry_variable_groups(["forecast"])
+        f.coordinate("longitude").bounds.nc_set_variable_groups(["forecast"])
+        f.nc_set_component_variable_groups("node_count", ["forecast"])
+        f.nc_set_component_variable_groups("part_node_count", ["forecast"])
+        f.nc_set_component_variable("interior_ring", "interior_ring")
+        f.nc_set_component_variable_groups("interior_ring", ["forecast"])
+
+        # Check the groups
+        cf.write(f, grouped_file, fmt="NETCDF4")
+        cf.write(f, grouped_dir, fmt="ZARR3")
+
+        n = cf.read(grouped_file)[0]
+        z = cf.read(grouped_dir)[0]
+        self.assertTrue(z.equals(n))
+        self.assertTrue(z.equals(f))
+
+    def test_zarr_read_v2(self):
+        """Test reading Zarr v2."""
+        f2 = cf.read("example_field_0.zarr2")
+        f3 = cf.read("example_field_0.zarr3")
+        self.assertEqual(len(f2), len(f3))
+        self.assertEqual(len(f2), 1)
+        self.assertTrue(f2[0].equals(f3[0]))
+
+
+if __name__ == "__main__":
+    print("Run date:", datetime.datetime.now())
+    cf.environment()
+    print("")
+    unittest.main(verbosity=2)
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index b029966c48..4ec36e0d67 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -265,6 +265,12 @@ Some further dependencies that enable further functionality are optional. This
 to facilitate cf-python being installed in restricted environments
 for which these features are not required.
 
+.. rubric:: Zarr
+
+* `zarr `_, version 3.1.3 or newer.
+
+  For reading and writing Zarr datasets.
+
 .. rubric:: Regridding
 
 * `esmpy `_, previously
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 30514a0e5c..9e9b3208a9 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -67,8 +67,8 @@ may nonetheless be modified in memory.
 The `cf` package can:
 
 * read :term:`field constructs <field construct>` and :term:`domain
-  constructs <domain construct>` from netCDF, CDL, PP and UM datasets
-  with a choice of netCDF backends,
+  constructs <domain construct>` from netCDF, CDL, Zarr, PP and UM
+  datasets with a choice of netCDF backends,
 
 * read files from OPeNDAP servers and S3 object stores,
 
@@ -76,7 +76,8 @@ The `cf` package can:
 
 * create new field constructs in memory,
 
-* write and append field constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr
+  datasets on disk,
 
 * read, write, and manipulate UGRID mesh topologies,
 
diff --git a/docs/source/recipes/plot_19_recipe.py b/docs/source/recipes/plot_19_recipe.py
index 02d493dc21..dcc0926fbd 100644
--- a/docs/source/recipes/plot_19_recipe.py
+++ b/docs/source/recipes/plot_19_recipe.py
@@ -55,10 +55,7 @@
 # of the maxima, we loop through the season query mapping and do a
 # "T: mean" collapse setting the season as the grouping:
 cfp.gopen(
-    rows=2,
-    columns=1,
-    bottom=0.1,
-    top=0.85,
+    rows=2, columns=1, bottom=0.1, top=0.85,
 )
 cfp.gpos(1)
 cfp.gset(xmin="1980-01-01", xmax="2022-12-01", ymin=304, ymax=312)
diff --git a/docs/source/recipes/plot_22_recipe.py b/docs/source/recipes/plot_22_recipe.py
index 377313c899..fe329cda9d 100644
--- a/docs/source/recipes/plot_22_recipe.py
+++ b/docs/source/recipes/plot_22_recipe.py
@@ -11,10 +11,11 @@
 
 # %%
 # 1. Import cf-python, Dask.array, NumPy, and Matplotlib:
-import cf
 import dask.array as da
-import numpy as np
 import matplotlib.pyplot as plt
+import numpy as np
+
+import cf
 
 # %%
 # 2. Read the field constructs and load the wind speed component fields:
diff --git a/docs/source/recipes/plot_23_recipe.py b/docs/source/recipes/plot_23_recipe.py
index 2499b0d875..29537803af 100644
--- a/docs/source/recipes/plot_23_recipe.py
+++ b/docs/source/recipes/plot_23_recipe.py
@@ -18,12 +18,12 @@
 # sphinx_gallery_thumbnail_number = 2
 # sphinx_gallery_end_ignore
 
-import matplotlib.pyplot as plt
 import cfplot as cfp
-import cf
-
-import numpy as np
 import dask.array as da
+import matplotlib.pyplot as plt
+import numpy as np
+
+import cf
 
 # %%
 # 2. Read example data field constructs, and set region for our plots:
diff --git a/docs/source/recipes/recipe_list.txt b/docs/source/recipes/recipe_list.txt
index 0a8930811a..3dad79a79c 100644
--- a/docs/source/recipes/recipe_list.txt
+++ b/docs/source/recipes/recipe_list.txt
@@ -37,10 +37,4 @@
 plot_18_recipe.html#sphx-glr-recipes-plot-18-recipe-py
 plot_19_recipe.html#sphx-glr-recipes-plot-19-recipe-py
 plot_20_recipe.html#sphx-glr-recipes-plot-20-recipe-py
-
-plot_21_recipe.html#sphx-glr-recipes-plot-21-recipe-py
-
-plot_22_recipe.html#sphx-glr-recipes-plot-22-recipe-py
-
-plot_23_recipe.html#sphx-glr-recipes-plot-23-recipe-py
-
+
\ No newline at end of file
diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index bc3b6671c6..20cd5c11b6 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -5275,8 +5275,8 @@ Method Classes
-**Writing to a netCDF dataset**
--------------------------------
+**Writing to disk**
+-------------------
 
 The `cf.write` function writes a field construct, or a sequence of
-field constructs, to a new netCDF file on disk:
+field constructs, to a netCDF or Zarr dataset on disk:
 
 .. code-block:: python
    :caption: *Write a field construct to a netCDF dataset on disk.*
@@ -5345,8 +5345,8 @@
 By default the output file will be for CF-|version|.
 
 The `cf.write` function has optional parameters to
 
-* set the output netCDF format (all netCDF3 and netCDF4 formats are
-  possible);
+* set the output format (all netCDF3 and netCDF4 formats, as well as
+  Zarr v3, are possible);
 
 * append to the netCDF file rather than over-writing it by default;
 
diff --git a/setup.py b/setup.py
index f5ddbed1f7..a9bdd9f268 100755
--- a/setup.py
+++ b/setup.py
@@ -178,13 +178,13 @@ def compile():
 
 The ``cf`` package can:
 
-* read field constructs from netCDF, CDL, Zarr, PP and UM datasets,
+* read field and domain constructs from netCDF, CDL, Zarr, PP and UM datasets,
 
 * be fully flexible with respect to dataset storage chunking,
 
 * create new field constructs in memory,
 
-* write and append field constructs to netCDF datasets on disk,
+* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk,
 
 * read, write, and create coordinates defined by geometry cells,
 
@@ -263,6 +263,9 @@ def compile():
         "docformatter",
         "flake8",
     ],
+    "zarr": [
+        "zarr>=3.1.3",
+    ],
 }
 
 setup(
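For illustration, a minimal sketch of the Zarr v3 write path exercised by `test_zarr.py` above, using the `fmt="ZARR3"` and `dataset_shards` options from this diff; the output path is hypothetical.

```python
import cf

f = cf.example_field(0)

# Choose the dataset chunk shape, then write a Zarr v3 store in which
# each shard bundles four chunks (mirroring the dataset_shards=4 case
# in the tests)
f.data.nc_set_dataset_chunksizes([2, 3])
cf.write(f, "example.zarr", fmt="ZARR3", dataset_shards=4)

# Round-trip, as in the tests: the store reads back as a single field
# construct that equals the original
z = cf.read("example.zarr")
assert len(z) == 1
assert z[0].equals(f)
```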