Merge pull request #743 from davidhassell/cfa-write

davidhassell · web-flow · commit 627105b81252 · 2024-03-26T13:31:35.000Z
Reduce output CFA netCDF file size by setting the HDF5 chunksizes of CFA variables to be no larger than required
diff --git a/Changelog.rst b/Changelog.rst
@@ -15,6 +15,9 @@ version NEXT
   axis coordinates (https://github.com/NCAS-CMS/cf-python/issues/741)
 * Improve `cf.Field.__getitem__` performance by not re-calculating
   axis cyclicity (https://github.com/NCAS-CMS/cf-python/issues/744)
+* Reduce output CFA netCDF file size by setting the HDF5 chunksizes of
+  CFA variables to be no larger than required
+  (https://github.com/NCAS-CMS/cf-python/issues/739)
 * Fix misleading error message when it is not possible to create area
   weights requested from `cf.Field.collapse`
   (https://github.com/NCAS-CMS/cf-python/issues/731)
diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py
@@ -48,6 +48,33 @@ class CFImplementation(cfdm.CFDMImplementation):
 
     """
 
+    def nc_set_hdf5_chunksizes(self, data, sizes, override=False):
+        """Set the data HDF5 chunksizes.
+
+        .. versionadded:: NEXTVERSION
+
+        :Parameters:
+
+            data: `Data`
+                The data.
+
+            sizes: sequence of `int`
+                The new HDF5 chunk sizes.
+
+            override: `bool`, optional
+                If True then set the HDF5 chunk sizes even if some
+                have already been specified. If False, the default,
+                then only set the HDF5 chunk sizes if none have
+                already been specified.
+
+        :Returns:
+
+            `None`
+
+        """
+        if override or not data.nc_hdf5_chunksizes():
+            data.nc_set_hdf5_chunksizes(sizes)
+
     def set_construct(self, parent, construct, axes=None, copy=True, **kwargs):
         """Insert a construct into a field or domain.
 
diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py
@@ -483,8 +483,10 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
 
         # Location
         term = "location"
+        data = cfa[term]
+        self.implementation.nc_set_hdf5_chunksizes(data, data.shape)
         term_ncvar = self._cfa_write_term_variable(
-            cfa[term],
+            data,
             aggregated_data.get(term, f"cfa_{term}"),
             location_ncdimensions,
         )
@@ -502,8 +504,10 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
         else:
             attributes = None
 
+        data = cfa[term]
+        self.implementation.nc_set_hdf5_chunksizes(data, data.shape)
         term_ncvar = self._cfa_write_term_variable(
-            cfa[term],
+            data,
             aggregated_data.get(term, f"cfa_{term}"),
             fragment_ncdimensions,
             attributes=attributes,
@@ -521,8 +525,10 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
         else:
             dimensions = fragment_ncdimensions
 
+        data = cfa[term]
+        self.implementation.nc_set_hdf5_chunksizes(data, data.shape)
         term_ncvar = self._cfa_write_term_variable(
-            cfa[term],
+            data,
             aggregated_data.get(term, f"cfa_{term}"),
             dimensions,
         )
@@ -539,8 +545,10 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
         else:
             dimensions = fragment_ncdimensions
 
+        data = cfa[term]
+        self.implementation.nc_set_hdf5_chunksizes(data, data.shape)
         term_ncvar = self._cfa_write_term_variable(
-            cfa[term],
+            data,
             aggregated_data.get(term, f"cfa_{term}"),
             dimensions,
         )
@@ -809,8 +817,10 @@ def _cfa_write_non_standard_terms(
             terms.append(term)
 
             # Create the new CFA term variable
+            data = type(data)(dx)
+            self.implementation.nc_set_hdf5_chunksizes(data, data.shape)
             term_ncvar = self._cfa_write_term_variable(
-                data=type(data)(dx),
+                data=data,
                 ncvar=aggregated_data.get(term, f"cfa_{term}"),
                 ncdimensions=fragment_ncdimensions,
             )
@@ -904,6 +914,7 @@ def _cfa_aggregation_instructions(self, data, cfvar):
         aggregation_format = []
         for indices in data.chunk_indices():
             file_details = self._cfa_get_file_details(data[indices])
+
             if len(file_details) != 1:
                 if file_details:
                     raise ValueError(
@@ -962,6 +973,8 @@ def _cfa_aggregation_instructions(self, data, cfvar):
             ):
                 n = n_trailing - len(filenames)
                 if n:
+                    # This chunk has fewer fragment files than some
+                    # others, so some padding is required.
                     pad = ("",) * n
                     aggregation_file[i] = filenames + pad
                     aggregation_format[i] = formats + pad
@@ -1055,13 +1068,14 @@ def _cfa_get_file_details(self, data):
         {(('/home/file.pp',), (34556,), ('um',))}
 
         """
-        out = set()
+        out = []
+        out_append = out.append
         for a in data.todict().values():
             try:
-                out.update(
-                    ((a.get_filenames(), a.get_addresses(), a.get_formats()),)
+                out_append(
+                    (a.get_filenames(), a.get_addresses(), a.get_formats())
                 )
             except AttributeError:
                 pass
 
-        return out
+        return set(out)