diff --git a/docs/src/further_topics/netcdf_io.rst b/docs/src/further_topics/netcdf_io.rst
index 47d85ffeab..581e6759a8 100644
--- a/docs/src/further_topics/netcdf_io.rst
+++ b/docs/src/further_topics/netcdf_io.rst
@@ -189,6 +189,17 @@ Deferred Saving
 TBC
 
 
+Dataless Cubes
+--------------
+It is now possible to have "dataless" cubes, where ``cube.data is None``.
+When such a cube is saved to a NetCDF file, the result is a file variable whose
+data is never written (so it occupies no storage space).
+
+So that such variables load back correctly, the saver also adds an extra
+``iris_dataless_cube = "true"`` attribute: this tells the loader to skip array
+creation, so that the read-back cube is also dataless.
+
+
 Guessing Coordinate Axes
 ------------------------
 
diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
index dd91aa7cc8..f4c5f66716 100644
--- a/docs/src/whatsnew/latest.rst
+++ b/docs/src/whatsnew/latest.rst
@@ -40,6 +40,9 @@ This document explains the changes made to Iris for this release
    :func:`~iris.fileformats.netcdf.saver.save_mesh` also supports ``zlib``
    compression. (:issue:`6565`, :pull:`6728`)
 
+#. `@pp-mo`_ made it possible to save 'dataless' cubes to a NetCDF file, and load
+   them back again. (:issue:`6727`, :pull:`6739`)
+
 #. `@ukmo-ccbunney`_ added a new :class:`~iris.util.CMLSettings` class to
    control the formatting of Cube CML output via a context manager.
    (:issue:`6244`, :pull:`6743`)
diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py
index 216df67590..e8d283beb8 100644
--- a/lib/iris/fileformats/netcdf/loader.py
+++ b/lib/iris/fileformats/netcdf/loader.py
@@ -392,8 +392,17 @@ def _load_cube_inner(engine, cf, cf_var, filename):
     from iris.cube import Cube
 
     """Create the cube associated with the CF-netCDF data variable."""
-    data = _get_cf_var_data(cf_var)
-    cube = Cube(data)
+    from iris.fileformats.netcdf.saver import Saver
+
+    if hasattr(cf_var, Saver._DATALESS_ATTRNAME):
+        # This data-variable represents a dataless cube.
+        # The variable array content was never written (to take up no space).
+        data = None
+        shape = cf_var.shape
+    else:
+        data = _get_cf_var_data(cf_var)
+        shape = None
+    cube = Cube(data=data, shape=shape)
 
     # Reset the actions engine.
     engine.reset()
diff --git a/lib/iris/fileformats/netcdf/saver.py b/lib/iris/fileformats/netcdf/saver.py
index 67f011f724..5177749c07 100644
--- a/lib/iris/fileformats/netcdf/saver.py
+++ b/lib/iris/fileformats/netcdf/saver.py
@@ -2275,6 +2275,10 @@ def _create_cf_grid_mapping(self, cube, cf_var_cube):
         if grid_mapping:
             _setncattr(cf_var_cube, "grid_mapping", grid_mapping)
 
+    _DATALESS_ATTRNAME = "iris_dataless_cube"
+    _DATALESS_DTYPE = np.dtype("u1")
+    _DATALESS_FILLVALUE = 127
+
     def _create_cf_data_variable(
         self,
         cube,
@@ -2315,9 +2319,19 @@
         # TODO: when iris.FUTURE.save_split_attrs is removed, the 'local_keys' arg can
         # be removed.
         # Get the values in a form which is valid for the file format.
-        data = self._ensure_valid_dtype(cube.core_data(), "cube", cube)
+        is_dataless = cube.is_dataless()
+        if is_dataless:
+            data = None
+        else:
+            data = self._ensure_valid_dtype(cube.core_data(), "cube", cube)
 
-        if packing:
+        if is_dataless:
+            # The variable must have *some* dtype, and it must be maskable.
+            dtype = self._DATALESS_DTYPE
+            fill_value = self._DATALESS_FILLVALUE
+        elif not packing:
+            dtype = data.dtype.newbyteorder("=")
+        else:
             if isinstance(packing, dict):
                 if "dtype" not in packing:
                     msg = "The dtype attribute is required for packing."
@@ -2355,8 +2369,6 @@
                     add_offset = (cmax + cmin) / 2
                 else:
                     add_offset = cmin + 2 ** (n - 1) * scale_factor
-        else:
-            dtype = data.dtype.newbyteorder("=")
 
         def set_packing_ncattrs(cfvar):
             """Set netCDF packing attributes.
@@ -2380,8 +2392,9 @@
             cf_name, dtype, dimension_names, fill_value=fill_value, **kwargs
         )
 
-        set_packing_ncattrs(cf_var)
-        self._lazy_stream_data(data=data, cf_var=cf_var)
+        if not is_dataless:
+            set_packing_ncattrs(cf_var)
+            self._lazy_stream_data(data=data, cf_var=cf_var)
 
         if cube.standard_name:
             _setncattr(cf_var, "standard_name", cube.standard_name)
@@ -2446,6 +2459,10 @@
                 _setncattr(cf_var, attr_name, value)
 
+        # Add the 'dataless' marker if needed.
+        if is_dataless:
+            _setncattr(cf_var, self._DATALESS_ATTRNAME, "true")
+
         # Create the CF-netCDF data variable cell method attribute.
         cell_methods = self._create_cf_cell_methods(cube, dimension_names)
 
diff --git a/lib/iris/tests/integration/netcdf/test_dataless.py b/lib/iris/tests/integration/netcdf/test_dataless.py
new file mode 100644
index 0000000000..442777ce18
--- /dev/null
+++ b/lib/iris/tests/integration/netcdf/test_dataless.py
@@ -0,0 +1,102 @@
+# Copyright Iris contributors
+#
+# This file is part of Iris and is released under the BSD license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Integration tests for save + load of dataless cubes."""
+
+import numpy as np
+import pytest
+
+import iris
+from iris.coords import DimCoord
+from iris.cube import Cube
+from iris.fileformats.netcdf._thread_safe_nc import DatasetWrapper
+from iris.fileformats.netcdf.saver import Saver
+
+
+class TestDataless:
+    @pytest.fixture(autouse=True)
+    def setup(self, tmp_path_factory):
+        ny, nx = 3, 4
+        self.testcube = Cube(
+            shape=(ny, nx),
+            long_name="testdata",
+            dim_coords_and_dims=[
+                (DimCoord(np.arange(ny), long_name="y"), 0),
+                (DimCoord(np.arange(nx), long_name="x"), 1),
+            ],
+        )
+        self.testdir = tmp_path_factory.mktemp("dataless")
+        self.test_path = self.testdir / "test.nc"
+
+    @staticmethod
+    def _strip_saveload_additions(reloaded_cube):
+        reloaded_cube.attributes.pop("Conventions", None)
+        reloaded_cube.var_name = None
+        for co in reloaded_cube.coords():
+            co.var_name = None
+
+    def test_dataless_save(self):
+        # Check that we can save a dataless cube, and see how it appears in the file.
+        iris.save(self.testcube, self.test_path)
+        assert Saver._DATALESS_ATTRNAME not in self.testcube.attributes
+        # Check the content as seen in the file.
+        ncds = DatasetWrapper(self.test_path)
+        var = ncds.variables["testdata"]
+        assert Saver._DATALESS_ATTRNAME in var.ncattrs()
+        assert var.dtype == Saver._DATALESS_DTYPE
+        assert "_FillValue" in var.ncattrs()
+        assert var._FillValue == Saver._DATALESS_FILLVALUE
+        assert np.all(np.ma.getmaskarray(var[:]) == True)  # noqa: E712
+
+    def test_dataless_load(self):
+        # Check that we can load a saved dataless cube, and that it matches the original.
+        iris.save(self.testcube, self.test_path)
+
+        # NB Load with load_raw, since we haven't finished supporting dataless merge.
+        (result_cube,) = iris.load_raw(self.test_path)
+        assert result_cube.is_dataless()
+        assert "iris_dataless_cube" not in result_cube.attributes
+
+        # Strip off the extra items added by netcdf save + load.
+        self._strip_saveload_additions(result_cube)
+
+        # Result should now equal the original.
+        assert result_cube == self.testcube
+
+    def test_mixture_saveload(self):
+        # Check that a mixture of dataless and "normal" cubes can be saved + loaded back.
+        dataless = self.testcube
+        ny = dataless.shape[0]
+        dataful = Cube(
+            np.ones((ny, 3)),
+            long_name="other",
+            dim_coords_and_dims=[(dataless.coord("y"), 0)],
+        )
+        iris.save([dataless, dataful], self.test_path)
+        # NB Load with load_raw, since we haven't finished supporting dataless merge.
+        cubes = iris.load_raw(self.test_path)
+        assert len(cubes) == 2
+        read_dataless = cubes.extract_cube("testdata")
+        read_dataful = cubes.extract_cube("other")
+        assert read_dataless.is_dataless()
+        assert not read_dataful.is_dataless()
+        for cube in (read_dataless, read_dataful):
+            self._strip_saveload_additions(cube)
+        assert read_dataless == dataless
+        assert read_dataful == dataful
+
+    def test_nodata_size(self):
+        # Check that a file saved with a large dataless cube does *not* occupy a
+        # large amount of disk space.
+        ny, nx = 10000, 10000
+        data_dims = (ny, nx)
+        dataless_cube = Cube(shape=data_dims)
+
+        iris.save(dataless_cube, self.test_path)
+
+        data_size_bytes = ny * nx  # bytes, since dtype is "u1" (approx 100 MB)
+        filesize_bytes = self.test_path.stat().st_size
+        # Check that the file size is less than 1/10 of the variable array size.
+        # The 0.1 is a bit arbitrary, but it makes the point!
+        assert filesize_bytes < 0.1 * data_size_bytes
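
For illustration, a minimal sketch of the save/load round trip these changes enable,
distilled from the tests above; the filename ``dataless.nc`` is purely illustrative::

    import numpy as np

    import iris
    from iris.coords import DimCoord
    from iris.cube import Cube

    # A dataless cube: constructed with a 'shape' instead of a data array.
    cube = Cube(
        shape=(3, 4),
        long_name="testdata",
        dim_coords_and_dims=[
            (DimCoord(np.arange(3), long_name="y"), 0),
            (DimCoord(np.arange(4), long_name="x"), 1),
        ],
    )
    assert cube.is_dataless() and cube.data is None

    iris.save(cube, "dataless.nc")

    # Load with load_raw, since dataless merge is not yet supported.
    (result,) = iris.load_raw("dataless.nc")
    assert result.is_dataless()
    # The marker attribute is consumed by the loader, not left on the cube.
    assert "iris_dataless_cube" not in result.attributes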
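
And a sketch of what the saved variable looks like on disk, here checked with the
plain ``netCDF4`` API rather than Iris's internal ``DatasetWrapper`` (again, the
filename is illustrative)::

    import netCDF4
    import numpy as np

    ds = netCDF4.Dataset("dataless.nc")
    var = ds.variables["testdata"]
    # The marker attribute written by the saver:
    assert "iris_dataless_cube" in var.ncattrs()
    # The placeholder dtype and fill value (dtype "u1", fill value 127):
    print(var.dtype, var._FillValue)
    # Reading back yields a fully masked array, since no data was ever written.
    assert np.ma.getmaskarray(var[:]).all()
    ds.close()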