
Commit 0cd7ef6

Authored by aulemahal, pre-commit-ci[bot] and huard
Remove need for scipy, fix parallel regridder (#461)
* Remove need for scipy; better name collision avoidance in the parallel regridder init
* Update changelog
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Update xesmf/smm.py (Co-authored-by: David Huard <[email protected]>)
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* Add comments; simplify unname-rename
* Fix: add removal of zero entries to add_nans_to_weights to imitate the previous behaviour

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: David Huard <[email protected]>
1 parent eef222c commit 0cd7ef6

File tree

4 files changed (+86, -44 lines)

CHANGES.rst

Lines changed: 5 additions & 0 deletions

@@ -1,6 +1,11 @@
 What's new
 ==========
 
+0.9.1 (unreleased)
+------------------
+* Remove scipy-dependent code in ``add_nans_to_weights``. By `Pascal Bourgault <https://github.com/aulemahal>`_.
+* Fix some name collision issues in the parallel regridder initialisation. By `Pascal Bourgault <https://github.com/aulemahal>`_.
+
 0.9.0 (2025-11-21)
 ------------------
 * Added ``Regridder`` option ``post_mask_source`` to mask contributions of specified source grid cells, with a special setting for masking domain edge cells to avoid extrapolation with ``nearest_s2d`` when remapping to a larger domain (``post_mask_source = 'domain_edge'``, :pull:`444`). By `Martin Schupfner <https://github.com/sol1105>`_.
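
For context, a minimal sketch of the code path these entries exercise: building a regridder with parallel weight generation. The grids, resolutions and chunk sizes here are hypothetical; ``xe.util.grid_global``, ``Regridder`` and its ``parallel=True`` option are the public xESMF API, and parallel weight generation expects the grids (at least the output one) to carry dask chunks.

import numpy as np
import xarray as xr
import xesmf as xe

# Hypothetical 2-degree source grid and 1-degree target grid (2D lat/lon + bounds).
ds_in = xe.util.grid_global(2.0, 2.0).chunk({'y': 45, 'x': 90})
ds_out = xe.util.grid_global(1.0, 1.0).chunk({'y': 60, 'x': 120})

# parallel=True generates the weights block-by-block with dask; this is the
# initialisation path whose name collisions the commit fixes.
regridder = xe.Regridder(ds_in, ds_out, 'bilinear', parallel=True)

# Apply to a toy field defined on the source grid.
field = xr.DataArray(
    np.random.rand(90, 180),
    dims=('y', 'x'),
    coords={'lat': ds_in.lat, 'lon': ds_in.lon},
)
print(regridder(field).shape)  # (180, 360)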

xesmf/frontend.py

Lines changed: 12 additions & 32 deletions

@@ -22,7 +22,14 @@
     post_apply_target_mask_to_weights,
     read_weights,
 )
-from .util import LAT_CF_ATTRS, LON_CF_ATTRS, _get_edge_indices_2d, split_polygons_and_holes
+from .util import (
+    LAT_CF_ATTRS,
+    LON_CF_ATTRS,
+    _get_edge_indices_2d,
+    _rename_dataset,
+    _unname_dataset,
+    split_polygons_and_holes,
+)
 
 try:
     import dask.array as da
@@ -39,16 +46,8 @@ def subset_regridder(
     kwargs.pop('filename', None)  # Don't save subset of weights
     kwargs.pop('reuse_weights', None)
 
-    # Renaming dims to original names for the subset regridding
-    if locstream_in:
-        ds_in = ds_in.rename({'x_in': in_dims[0]})
-    else:
-        ds_in = ds_in.rename({'y_in': in_dims[0], 'x_in': in_dims[1]})
-
-    if locstream_out:
-        ds_out = ds_out.rename({'x_out': out_dims[1]})
-    else:
-        ds_out = ds_out.rename({'y_out': out_dims[0], 'x_out': out_dims[1]})
+    ds_in = _rename_dataset(ds_in, locstream_in, in_dims, '_in')
+    ds_out = _rename_dataset(ds_out, locstream_out, out_dims, '_out')
 
     regridder = Regridder(
         ds_in, ds_out, method, locstream_in, locstream_out, periodic, parallel=False, **kwargs
@@ -1153,32 +1152,13 @@ def _init_para_regrid(self, ds_in, ds_out, kwargs):
         ds_out[ds_out.cf['latitude'].name].attrs['bounds'] = 'lat_bounds'
         ds_out = ds_out.drop_dims(ds_out.lon_b.dims + ds_out.lat_b.dims)
         # rename dims to avoid map_blocks confusing ds_in and ds_out dims.
-        if self.sequence_in:
-            ds_in = ds_in.rename({self.in_horiz_dims[0]: 'x_in'})
-        else:
-            ds_in = ds_in.rename({self.in_horiz_dims[0]: 'y_in', self.in_horiz_dims[1]: 'x_in'})
-
-        if self.sequence_out:
-            ds_out = ds_out.rename({self.out_horiz_dims[1]: 'x_out'})
-        else:
-            ds_out = ds_out.rename(
-                {self.out_horiz_dims[0]: 'y_out', self.out_horiz_dims[1]: 'x_out'}
-            )
+        ds_in = _unname_dataset(ds_in, self.sequence_in, self.in_horiz_dims, '_in')
+        ds_out = _unname_dataset(ds_out, self.sequence_out, self.out_horiz_dims, '_out')
 
         out_chunks = {k: ds_out.chunks.get(k) for k in ['y_out', 'x_out']}
         in_chunks = {k: ds_in.chunks.get(k) for k in ['y_in', 'x_in']}
         chunks = out_chunks | in_chunks
 
-        # Rename coords to avoid issues in xr.map_blocks
-        # If coords and dims are the same, renaming has already been done.
-        ds_out = ds_out.rename(
-            {
-                coord: coord + '_out'
-                for coord in self.out_coords.coords.keys()
-                if coord not in self.out_horiz_dims
-            }
-        )
-
         weights_dims = ('y_out', 'x_out', 'y_in', 'x_in')
         templ = sps.zeros((self.shape_out + self.shape_in))
         w_templ = xr.DataArray(templ, dims=weights_dims).chunk(
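
The renaming matters because ``xr.map_blocks`` receives a single object carrying both grids: if the source and target share a dimension name (say ``lat``) with different sizes, they cannot be combined at all. A toy illustration of the conflict and of the suffixing fix, independent of xESMF internals (all names here are made up for the example):

import numpy as np
import xarray as xr

ds_in = xr.Dataset(coords={'lat': np.arange(10.0), 'lon': np.arange(20.0)})
ds_out = xr.Dataset(coords={'lat': np.arange(40.0), 'lon': np.arange(80.0)})

# Same dim names, different sizes: the two grids cannot be aligned exactly.
try:
    xr.merge([ds_in, ds_out], join='exact')
except ValueError as err:
    print('conflict:', err)

# After suffixing, both grids coexist with unambiguous dimensions.
merged = xr.merge(
    [
        ds_in.rename({'lat': 'y_in', 'lon': 'x_in'}),
        ds_out.rename({'lat': 'y_out', 'lon': 'x_out'}),
    ]
)
print(sorted(merged.dims))  # ['x_in', 'x_out', 'y_in', 'y_out']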

xesmf/smm.py

Lines changed: 26 additions & 12 deletions

@@ -260,7 +260,8 @@ def add_nans_to_weights(weights):
 
     By default, empty rows in the weights sparse matrix are interpreted as zeroes. This can become problematic
     when the field being interpolated has legitimate null values. This function inserts nan values in each row to
-    make sure empty weights are propagated as nans instead of zeros.
+    make sure empty weights are propagated as nans instead of zeros. It also removes unnecessary entries, ones
+    where the data is the same as the fill value (0).
 
     Parameters
     ----------
@@ -272,17 +273,30 @@ def add_nans_to_weights(weights):
    DataArray backed by a sparse.COO array
        Sparse weights matrix.
    """
-
-    # Taken from @trondkr and adapted by @raphaeldussin to use `lil`.
-    # lil matrix is better than CSR when changing sparsity
-    m = weights.data.to_scipy_sparse().tolil()
-    # replace empty rows by one nan value at element 0 (arbitrary)
-    # so that remapped element become nan instead of zero
-    for krow in range(len(m.rows)):
-        m.rows[krow] = [0] if m.rows[krow] == [] else m.rows[krow]
-        m.data[krow] = [np.nan] if m.data[krow] == [] else m.data[krow]
-    # update regridder weights (in COO)
-    weights = weights.copy(data=sps.COO.from_scipy_sparse(m))
+    # Taken from @trondkr and adapted by @raphaeldussin to use `lil`, translated to COO by @aulemahal
+    coo = weights.data
+    coords = coo.coords
+    data = coo.data
+    # Remove unnecessary entries (the roundtrip through scipy's lil did that implicitly)
+    coords = coords[:, data != coo.fill_value]
+    data = data[data != coo.fill_value]
+
+    # Replace rows with no weights with a NaN at element 0, so that remapped elements are NaNs instead of zeros.
+    # Find rows with no entry in the weights, the unmapped ones
+    unmapped_rows = set(np.arange(coo.shape[0])) - set(coords[0])
+    # Generate one coord per unmapped row
+    new_coords = np.array([list(unmapped_rows), [0] * len(unmapped_rows)], dtype=coords.dtype)
+    # Assign a NaN to the new coord so the scalar product of that row gives a NaN
+    new_data = np.full((len(unmapped_rows),), np.nan)
+
+    # Recreate the new COO weights matrix
+    new = sps.COO(
+        np.hstack((coords, new_coords)),
+        np.hstack((data, new_data)),
+        coo.shape,
+        fill_value=coo.fill_value,
+    )
+    weights = weights.copy(data=new)
     return weights
 
 
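The new implementation only needs ``numpy`` and the ``sparse`` package. Below is a self-contained sketch of the same technique on a toy 4x3 weights matrix (not the library function itself), where destination row 2 receives no weights:

import numpy as np
import sparse as sps

# Toy weights: rows are destination cells, columns source cells; row 2 is empty.
coords = np.array([[0, 0, 1, 3], [0, 1, 2, 0]])  # (row, col) indices of the nonzeros
data = np.array([0.5, 0.5, 1.0, 1.0])
coo = sps.COO(coords, data, shape=(4, 3))

# Rows absent from coords[0] got no weights: they would remap to 0, not NaN.
unmapped_rows = sorted(set(np.arange(coo.shape[0])) - set(coo.coords[0]))
new_coords = np.array([unmapped_rows, [0] * len(unmapped_rows)], dtype=coords.dtype)
new_data = np.full(len(unmapped_rows), np.nan)

# Concatenate the NaN entries onto the existing ones and rebuild the matrix.
patched = sps.COO(
    np.hstack((coo.coords, new_coords)),
    np.hstack((coo.data, new_data)),
    coo.shape,
)
print(patched.todense())  # row 2 now holds a NaN at column 0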

xesmf/util.py

Lines changed: 43 additions & 0 deletions

@@ -415,3 +415,46 @@ def _get_edge_indices_2d(nlons, nlats):
     edge_mask[:, :1] = True
     edge_mask[:, -1:] = True
     return np.where(edge_mask.ravel())[0]
+
+
+def _unname_dataset(ds, sequence, dims, suffix):
+    """Rename everything in a dataset so that it can be aligned without modification with another."""
+    if sequence:
+        dim = list(set(dims) - {'dummy'})[0]
+        ds = ds.rename({dim: f'x{suffix}'})
+    else:
+        ds = ds.rename({dims[0]: f'y{suffix}', dims[1]: f'x{suffix}'})
+    if ds[f'x{suffix}'].attrs.get('bounds'):
+        ds = ds.rename({ds[f'x{suffix}'].attrs['bounds']: f'x{suffix}_bounds'})
+        ds[f'x{suffix}'].attrs['bounds'] = f'x{suffix}_bounds'
+    if not sequence and ds[f'y{suffix}'].attrs.get('bounds'):
+        ds = ds.rename({ds[f'y{suffix}'].attrs['bounds']: f'y{suffix}_bounds'})
+        ds[f'y{suffix}'].attrs['bounds'] = f'y{suffix}_bounds'
+
+    # If coords and dims are the same, renaming has already been done.
+    ds = ds.rename(
+        {
+            coord: coord + suffix
+            for coord in ds.coords.keys()
+            if coord not in (f'y{suffix}', f'x{suffix}')
+        }
+    )
+    return ds
+
+
+def _rename_dataset(ds, sequence, dims, suffix):
+    """Restore coordinate names from an "unnamed" dataset."""
+    ds = ds.rename(
+        {
+            coord: coord.removesuffix(suffix)
+            for coord in ds.coords.keys()
+            if coord not in dims
+            and coord.endswith(suffix)
+            and coord not in (f'y{suffix}', f'x{suffix}')
+        }
+    )
+    if sequence:
+        ds = ds.rename({f'x{suffix}': dims[0]})
+    else:
+        ds = ds.rename({f'y{suffix}': dims[0], f'x{suffix}': dims[1]})
+    return ds
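
A round-trip sketch of these two helpers on a toy dataset. They are private, underscore-prefixed functions, so this mirrors how ``frontend.py`` uses them rather than public API; ``sequence=False`` is the ordinary 2D-grid case, and the coordinate names are made up for the example.

import numpy as np
import xarray as xr
from xesmf.util import _rename_dataset, _unname_dataset

ds = xr.Dataset(coords={'lat': np.arange(4.0), 'lon': np.arange(8.0)})

# "Unname": move to neutral y_in/x_in dims so in/out grids cannot collide.
unnamed = _unname_dataset(ds, sequence=False, dims=('lat', 'lon'), suffix='_in')
print(sorted(unnamed.dims))  # ['x_in', 'y_in']

# "Rename": restore the original dimension and coordinate names.
restored = _rename_dataset(unnamed, sequence=False, dims=('lat', 'lon'), suffix='_in')
print(sorted(restored.dims))  # ['lat', 'lon']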
