Skip to content

Commit b87024f

Browse files
committed
Change regridding so it does not crash on parametric vertical coordinates
1 parent 8b8093d commit b87024f

5 files changed

Lines changed: 85 additions & 145 deletions

File tree

src/ref_sample_data/data_request/base.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
import pandas as pd
55
import xarray as xr
66
from intake_esgf import ESGFCatalog
7+
from loguru import logger
8+
9+
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
710

811

912
class DataRequest(Protocol):
@@ -69,9 +72,9 @@ def _deduplicate_group(group: pd.DataFrame) -> pd.DataFrame:
6972
return datasets.groupby("key").apply(_deduplicate_group, include_groups=False).reset_index()
7073

7174

72-
class IntakeESGFDataRequest(DataRequest):
75+
class IntakeESGFMixin:
7376
"""
74-
A data request that fetches datasets from ESGF using intake-esgf.
77+
A mixin that fetches datasets from ESGF using intake-esgf.
7578
"""
7679

7780
facets: dict[str, str | tuple[str, ...]]
@@ -91,3 +94,49 @@ def fetch_datasets(self) -> pd.DataFrame:
9194
merged_df["time_start"] = self.time_span[0]
9295
merged_df["time_end"] = self.time_span[1]
9396
return _deduplicate_datasets(merged_df)
97+
98+
99+
class DecimateMixin:
100+
"""
101+
Mixin for decimating datasets based on their grid type.
102+
"""
103+
104+
def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
105+
"""
106+
Downscale the dataset to a smaller size.
107+
108+
Parameters
109+
----------
110+
dataset
111+
The dataset to downscale
112+
113+
Returns
114+
-------
115+
xr.Dataset | None
116+
The downscaled dataset, or None if the dataset contains no data in the requested time span
117+
"""
118+
if "time" in dataset.dims and self.time_span is not None:
119+
result = dataset.sel(time=slice(*self.time_span))
120+
if result.time.size == 0:
121+
# The dataset does not contain data in the requested time range.
122+
return None
123+
else:
124+
result = dataset.copy()
125+
126+
has_latlon = "lat" in result.dims and "lon" in result.dims
127+
has_ij = "i" in result.dims and "j" in result.dims
128+
129+
if has_latlon:
130+
assert len(result.lat.dims) == 1 and len(result.lon.dims) == 1
131+
132+
result = decimate_rectilinear(result)
133+
elif has_ij:
134+
# 2d curvilinear grid (generally ocean variables)
135+
result = decimate_curvilinear(result)
136+
else:
137+
logger.debug(
138+
"No algorithm implemented for this grid type, not spatially decimating dataset:\n{dataset}",
139+
dataset=dataset,
140+
)
141+
142+
return result

src/ref_sample_data/data_request/cmip6.py

Lines changed: 2 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,7 @@
66
import pandas as pd
77
import xarray as xr
88

9-
from ref_sample_data.data_request.base import IntakeESGFDataRequest
10-
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
9+
from ref_sample_data.data_request.base import DecimateMixin, IntakeESGFMixin
1110

1211

1312
def prefix_to_filename(ds, filename_prefix: str) -> str:
@@ -37,7 +36,7 @@ def prefix_to_filename(ds, filename_prefix: str) -> str:
3736
return filename
3837

3938

40-
class CMIP6Request(IntakeESGFDataRequest):
39+
class CMIP6Request(IntakeESGFMixin, DecimateMixin):
4140
"""
4241
Represents a CMIP6 dataset request
4342
@@ -92,45 +91,6 @@ def __init__(
9291
assert all(key in self.avail_facets for key in self.cmip6_path_items), "Error message"
9392
assert all(key in self.avail_facets for key in self.cmip6_filename_paths), "Error message"
9493

95-
def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
96-
"""
97-
Downscale the dataset to a smaller size.
98-
99-
Parameters
100-
----------
101-
dataset
102-
The dataset to downscale
103-
104-
Returns
105-
-------
106-
xr.Dataset
107-
The downscaled dataset
108-
"""
109-
has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
110-
has_ij = "i" in dataset.dims and "j" in dataset.dims
111-
112-
# The AMOC variable `msftmz` has these strange dims and we do not want to decimate
113-
skip_decimate = {"time", "basin", "lev", "lat"}.issubset(dataset.dims)
114-
115-
if has_latlon:
116-
assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
117-
118-
result = decimate_rectilinear(dataset)
119-
elif has_ij:
120-
# 2d curvilinear grid (generally ocean variables)
121-
result = decimate_curvilinear(dataset)
122-
elif skip_decimate:
123-
result = dataset
124-
else:
125-
raise ValueError("Cannot decimate this grid: too many dimensions")
126-
127-
if "time" in dataset.dims and self.time_span is not None:
128-
result = result.sel(time=slice(*self.time_span))
129-
if result.time.size == 0:
130-
result = None
131-
132-
return result
133-
13494
def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
13595
"""
13696
Create the output filename for the dataset.

src/ref_sample_data/data_request/obs4mips.py

Lines changed: 2 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,11 @@
66
import pandas as pd
77
import xarray as xr
88

9-
from ref_sample_data.data_request.base import IntakeESGFDataRequest
9+
from ref_sample_data.data_request.base import DecimateMixin, IntakeESGFMixin
1010
from ref_sample_data.data_request.cmip6 import prefix_to_filename
11-
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
1211

1312

14-
class Obs4MIPsRequest(IntakeESGFDataRequest):
13+
class Obs4MIPsRequest(IntakeESGFMixin, DecimateMixin):
1514
"""
1615
Represents an Obs4MIPs dataset request
1716
"""
@@ -70,40 +69,6 @@ def __init__(
7069
assert all(key in self.avail_facets for key in self.obs4mips_path_items), "Error message"
7170
assert all(key in self.avail_facets for key in self.obs4mips_filename_paths), "Error message"
7271

73-
def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
74-
"""
75-
Downscale the dataset to a smaller size.
76-
77-
Parameters
78-
----------
79-
dataset
80-
The dataset to downscale
81-
82-
Returns
83-
-------
84-
xr.Dataset
85-
The downscaled dataset
86-
"""
87-
has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
88-
has_ij = "i" in dataset.dims and "j" in dataset.dims
89-
90-
if has_latlon:
91-
assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
92-
93-
result = decimate_rectilinear(dataset)
94-
elif has_ij:
95-
# 2d curvilinear grid (generally ocean variables)
96-
result = decimate_curvilinear(dataset)
97-
else:
98-
raise ValueError("Cannot decimate this grid: too many dimensions")
99-
100-
if "time" in dataset.dims and self.time_span is not None:
101-
result = result.sel(time=slice(*self.time_span))
102-
if result.time.size == 0:
103-
result = None
104-
105-
return result
106-
10772
def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
10873
"""
10974
Create the output filename for the dataset.

src/ref_sample_data/data_request/obs4ref.py

Lines changed: 2 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,10 @@
66
import xarray as xr
77
from climate_ref_core.dataset_registry import dataset_registry_manager
88

9-
from ref_sample_data.data_request.base import DataRequest
10-
from ref_sample_data.resample import decimate_curvilinear, decimate_rectilinear
9+
from ref_sample_data.data_request.base import DecimateMixin
1110

1211

13-
class Obs4REFRequest(DataRequest):
12+
class Obs4REFRequest(DecimateMixin):
1413
"""
1514
Fetch the unpublished Obs4MIPs datasets from the PMP registry
1615
@@ -42,40 +41,6 @@ def fetch_datasets(self) -> pd.DataFrame:
4241
)
4342
return pd.DataFrame(datasets)
4443

45-
def decimate_dataset(self, dataset: xr.Dataset) -> xr.Dataset | None:
46-
"""
47-
Downscale the dataset to a smaller size.
48-
49-
Parameters
50-
----------
51-
dataset
52-
The dataset to downscale
53-
54-
Returns
55-
-------
56-
xr.Dataset
57-
The downscaled dataset
58-
"""
59-
has_latlon = "lat" in dataset.dims and "lon" in dataset.dims
60-
has_ij = "i" in dataset.dims and "j" in dataset.dims
61-
62-
# If less than 10 MB skip decimating
63-
small_file_threshold = 10 * 1024**2
64-
if dataset.nbytes < small_file_threshold:
65-
return dataset
66-
67-
if has_latlon:
68-
assert len(dataset.lat.dims) == 1 and len(dataset.lon.dims) == 1
69-
70-
result = decimate_rectilinear(dataset)
71-
elif has_ij:
72-
# 2d curvilinear grid (generally ocean variables)
73-
result = decimate_curvilinear(dataset)
74-
else:
75-
raise ValueError("Cannot decimate this grid: too many dimensions")
76-
77-
return result
78-
7944
def generate_filename(self, metadata: pd.Series, ds: xr.Dataset, ds_filename: pathlib.Path) -> Path:
8045
"""
8146
Create the output filename for the dataset.

src/ref_sample_data/resample.py

Lines changed: 28 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
import numpy as np
22
import xarray as xr
33
import xcdat
4+
import xesmf
45

56

67
def _calculate_2d_cell_bounds(
7-
dimension: xr.DataArray,
8+
points: np.ndarray,
89
i: int,
910
j: int,
10-
) -> [float, float, float, float]:
11-
cell_center = dimension[j, i].data
11+
) -> list[float]:
12+
cell_center = points[j, i]
1213
if i == 0:
13-
di = dimension[j, i + 1].data - cell_center
14+
di = points[j, i + 1] - cell_center
1415
else:
15-
di = cell_center - dimension[j, i - 1].data
16+
di = cell_center - points[j, i - 1]
1617
if j == 0:
17-
dj = dimension[j + 1, i].data - cell_center
18+
dj = points[j + 1, i] - cell_center
1819
else:
19-
dj = cell_center - dimension[j - 1, i].data
20+
dj = cell_center - points[j - 1, i]
2021

2122
return np.asarray(
2223
[
@@ -43,22 +44,20 @@ def decimate_rectilinear(dataset: xr.Dataset) -> xr.Dataset:
4344
"""
4445
# Decimate the dataset, but update the bounds
4546
# 10x10 degree grid
46-
regridded_vars = []
47-
48-
for data_var in dataset.data_vars:
49-
# Some datasets don't correctly use data_vars
50-
if "_bnds" in data_var:
51-
continue
52-
output_grid = xcdat.create_uniform_grid(-90, 90, 10, 0, 359, 10)
53-
regridded_vars.append(
54-
dataset.regridder.horizontal(
55-
data_var,
56-
output_grid=output_grid,
57-
tool="xesmf",
58-
method="bilinear",
59-
)
60-
)
61-
return xr.merge(regridded_vars)
47+
output_grid = xcdat.create_uniform_grid(-90, 90, 10, 0, 359, 10)
48+
regrid = xesmf.Regridder(dataset, output_grid, "bilinear", periodic=True)
49+
result = regrid(dataset.copy())
50+
result = result.bounds.add_bounds("Y").bounds.add_bounds("X")
51+
# Restore attributes and add dataarrays that have not been regridded.
52+
for k, v in dataset.data_vars.items():
53+
if k in result:
54+
result[k].attrs = v.attrs
55+
else:
56+
result[k] = v
57+
for k, v in dataset.coords.items():
58+
result[k].attrs = v.attrs
59+
result.attrs = dataset.attrs
60+
return result
6261

6362

6463
def decimate_curvilinear(dataset: xr.Dataset, factor: int = 10) -> xr.Dataset:
@@ -82,13 +81,15 @@ def decimate_curvilinear(dataset: xr.Dataset, factor: int = 10) -> xr.Dataset:
8281
"""
8382
assert factor >= 1
8483
result = dataset.interp(i=dataset.i[::factor]).interp(j=dataset.j[::factor])
85-
result.coords["i"].values[:] = range(len(result.i))
86-
result.coords["j"].values[:] = range(len(result.j))
84+
result.coords["i"].values[:] = np.arange(len(result.i))
85+
result.coords["j"].values[:] = np.arange(len(result.j))
8786

8887
# Update the bounds of the cells
88+
latitude_points = result.latitude.values
89+
longitude_points = result.longitude.values
8990
for j in result.j:
9091
for i in result.i:
91-
result.vertices_latitude[j, i] = _calculate_2d_cell_bounds(result.latitude, i, j)
92-
result.vertices_longitude[j, i] = _calculate_2d_cell_bounds(result.longitude, i, j)
92+
result.vertices_latitude[j, i] = _calculate_2d_cell_bounds(latitude_points, i, j)
93+
result.vertices_longitude[j, i] = _calculate_2d_cell_bounds(longitude_points, i, j)
9394

9495
return result

0 commit comments

Comments
 (0)