Ensure most_common returns correct dtype (#53)

BSchilperoort · web-flow · commit f2120b8bda1e · 2024-10-23T08:01:46.000+02:00
* Add test that should pass

* Only call .where(covered) if needed. Improve var naming

* Expose fill_value to regrid.most_common/least_common

* Expose fill_value in regrid.stat as well

* Ruff changed formatting

* Warn the user if data is cast to float

* Undo formatting change
diff --git a/pyproject.toml b/pyproject.toml
@@ -135,6 +135,8 @@ ignore = [
   "S105", "S106", "S107",
   # Ignore complexity
   "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",
+  # Ignore magic values (false positives)
+  "PLR2004",
   # Causes conflicts with formatter
   "ISC001",
 ]
diff --git a/src/xarray_regrid/methods/_shared.py b/src/xarray_regrid/methods/_shared.py
@@ -1,5 +1,6 @@
 """Utility functions shared between methods."""
 
+import warnings
 from collections.abc import Hashable
 from typing import Any, overload
 
@@ -53,13 +54,21 @@ def restore_properties(
         result[coord].attrs = target_ds[coord].attrs
 
         # Replace zeros outside of original data grid with NaNs
-        uncovered_target_grid = (target_ds[coord] <= original_data[coord].max()) & (
+        covered = (target_ds[coord] <= original_data[coord].max()) & (
             target_ds[coord] >= original_data[coord].min()
         )
-        if fill_value is None:
-            result = result.where(uncovered_target_grid)
-        else:
-            result = result.where(uncovered_target_grid, fill_value)
+
+        if (~covered).any():
+            if fill_value is None:
+                if np.issubdtype(result.dtype, np.integer):
+                    msg = (
+                        "No fill_value is provided; data will be cast to "
+                        "floating point dtype to be able to use NaN for missing values."
+                    )
+                    warnings.warn(msg, stacklevel=1)
+                result = result.where(covered)
+            else:
+                result = result.where(covered, fill_value)
 
     return result.transpose(*original_data.dims)
 
diff --git a/src/xarray_regrid/regrid.py b/src/xarray_regrid/regrid.py
@@ -1,5 +1,5 @@
 from collections.abc import Hashable
-from typing import overload
+from typing import Any, overload
 
 import numpy as np
 import xarray as xr
@@ -14,11 +14,12 @@ class Regridder:
     """Regridding xarray datasets and dataarrays.
 
     Available methods:
-        linear: linear, bilinear, or higher dimensional linear interpolation.
-        nearest: nearest-neighbor regridding.
-        cubic: cubic spline regridding.
-        conservative: conservative regridding.
+        linear: linear, bilinear, or higher dimensional linear interpolation
+        nearest: nearest-neighbor regridding
+        cubic: cubic spline regridding
+        conservative: conservative regridding
         most_common: most common value regridder
+        stat: area statistics regridder
     """
 
     def __init__(self, xarray_obj: xr.DataArray | xr.Dataset):
@@ -134,6 +135,7 @@ def most_common(
         ds_target_grid: xr.Dataset,
         values: np.ndarray,
         time_dim: str | None = "time",
+        fill_value: None | Any = None,
     ) -> xr.DataArray:
         """Regrid by taking the most common value within the new grid cells.
 
@@ -151,6 +153,9 @@ def most_common(
                 contains the values 0, 2 and 4.
             time_dim: Name of the time dimension. Defaults to "time". Use `None` to
                 force regridding over the time dimension.
+            fill_value: Value used to fill uncovered parts of the target grid.
+                By default this will be NaN, and integer type data will be cast to
+                float to accommodate this.
 
         Returns:
             Regridded data.
@@ -173,6 +178,7 @@ def most_common(
             ds_target_grid,
             values,
             time_dim,
+            fill_value,
             anti_mode=False,
         )
 
@@ -181,6 +187,7 @@ def least_common(
         ds_target_grid: xr.Dataset,
         values: np.ndarray,
         time_dim: str | None = "time",
+        fill_value: None | Any = None,
     ) -> xr.DataArray:
         """Regrid by taking the least common value within the new grid cells.
 
@@ -198,6 +205,9 @@ def least_common(
                 contains the values 0, 2 and 4.
             time_dim: Name of the time dimension. Defaults to "time". Use `None` to
                 force regridding over the time dimension.
+            fill_value: Value used to fill uncovered parts of the target grid.
+                By default this will be NaN, and integer type data will be cast to
+                float to accommodate this.
 
         Returns:
             Regridded data.
@@ -220,6 +230,7 @@ def least_common(
             ds_target_grid,
             values,
             time_dim,
+            fill_value,
             anti_mode=True,
         )
 
@@ -229,6 +240,7 @@ def stat(
         method: str,
         time_dim: str | None = "time",
         skipna: bool = False,
+        fill_value: None | Any = None,
     ) -> xr.DataArray | xr.Dataset:
         """Upsampling of data using statistical methods (e.g. the mean or variance).
 
@@ -243,6 +255,9 @@ def stat(
             time_dim: Name of the time dimension. Defaults to "time". Use `None` to
                 force regridding over the time dimension.
             skipna: If NaN values should be ignored.
+            fill_value: Value used to fill uncovered parts of the target grid.
+                By default this will be NaN, and integer type data will be cast to
+                float to accommodate this.
 
         Returns:
             xarray.dataset with regridded land cover categorical data.
@@ -251,7 +266,7 @@ def stat(
         ds_formatted = format_for_regrid(self._obj, ds_target_grid, stats=True)
 
         return flox_reduce.statistic_reduce(
-            ds_formatted, ds_target_grid, time_dim, method, skipna
+            ds_formatted, ds_target_grid, time_dim, method, skipna, fill_value
         )
 
 
diff --git a/tests/test_reduce.py b/tests/test_reduce.py
@@ -93,16 +93,22 @@ def test_most_common(dummy_lc_data, dummy_target_grid):
             [0, 0, 0, 0, 0, 0],
             [0, 0, 0, 0, 0, 0],
             [3, 3, 0, 0, 0, 1],
-        ]
+        ],
+        dtype="uint8",
+    )
+    input_data_int = dummy_lc_data["lc"].astype("uint8")
+
+    regrid_result = input_data_int.regrid.most_common(
+        dummy_target_grid,
+        values=EXP_LABELS,
     )
     xr.testing.assert_equal(
-        dummy_lc_data["lc"].regrid.most_common(
-            dummy_target_grid,
-            values=EXP_LABELS,
-        ),
+        regrid_result,
         make_expected_ds(expected_data)["lc"],
     )
 
+    assert regrid_result.dtype == input_data_int.dtype
+
 
 def test_least_common(dummy_lc_data, dummy_target_grid):
     # Currently just test if the method runs: code is 99% the same as most_common

Original file line number	Diff line number	Diff line change
`@@ -135,6 +135,8 @@ ignore = [`
`135`	`135`	`"S105", "S106", "S107",`
`136`	`136`	`# Ignore complexity`
`137`	`137`	`"C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915",`
	`138`	`+ # Ignore magic values (false positives)`
	`139`	`+ "PLR2004",`
`138`	`140`	`# Causes conflicts with formatter`
`139`	`141`	`"ISC001",`
`140`	`142`	`]`