Skip to content

Commit 9c54b9b

Browse files
raspstephan and WeatherBenchX authors
authored and committed
Add option to add raw values as coordinates in data loader base class (to be used for binning by raw values).
In sparse_parquet and xarray_loaders, make all base class arguments kwargs.

PiperOrigin-RevId: 869579359
1 parent 83904e5 commit 9c54b9b

File tree

3 files changed

+25
-42
lines changed

3 files changed

+25
-42
lines changed

weatherbenchX/data_loaders/base.py

Lines changed: 11 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -77,6 +77,7 @@ def __init__(
7777
[Mapping[Hashable, xr.DataArray]], Mapping[Hashable, xr.DataArray]
7878
]
7979
] = None,
80+
add_values_to_coords: bool = False,
8081
):
8182
"""Shared initialization for data loaders.
8283
@@ -89,11 +90,15 @@ def __init__(
8990
False.
9091
process_chunk_fn: optional function to be applied to each chunk after
9192
loading but before interpolation, computing, and adding nan mask.
93+
add_values_to_coords: If True, add returned values to coordinates. These
94+
will propagate into the statistics, and can therefore be used for
95+
binning. Default: False.
9296
"""
9397
self._interpolation = interpolation
9498
self._compute = compute
9599
self._add_nan_mask = add_nan_mask
96100
self._process_chunk_fn = process_chunk_fn
101+
self._add_values_to_coords = add_values_to_coords
97102

98103
@abc.abstractmethod
99104
def _load_chunk_from_source(
@@ -149,4 +154,10 @@ def _compute_and_keep_dtype(x: xr.DataArray) -> xr.DataArray:
149154

150155
if self._add_nan_mask:
151156
chunk = add_nan_mask_to_data(chunk)
157+
158+
if self._add_values_to_coords:
159+
chunk = xarray_tree.map_structure(
160+
lambda da: da.assign_coords(values_as_coord=da), chunk
161+
)
162+
152163
return chunk

weatherbenchX/data_loaders/sparse_parquet.py

Lines changed: 7 additions & 21 deletions
Original file line number · Diff line number · Diff line change
@@ -13,13 +13,13 @@
1313
# limitations under the License.
1414
"""Data loaders for tabular data stored in Parquet format."""
1515

16+
from collections.abc import Hashable
1617
import functools
1718
import os
18-
from typing import Callable, Hashable, Mapping, Optional, Sequence, Union
19+
from typing import Callable, Mapping, Optional, Sequence, Union
1920
import numpy as np
2021
import pandas as pd
2122
import pyarrow
22-
from weatherbenchX import interpolations
2323
from weatherbenchX.data_loaders import base
2424
import xarray as xr
2525

@@ -91,7 +91,6 @@ def __init__(
9191
coordinate_variables: Sequence[str] = (),
9292
split_variables: bool = False,
9393
dropna: bool = False,
94-
add_nan_mask: bool = False,
9594
tolerance: Optional[
9695
np.timedelta64 | tuple[np.timedelta64, np.timedelta64]
9796
] = None,
@@ -102,8 +101,7 @@ def __init__(
102101
observation_dim: Optional[str] = None,
103102
file_tolerance: np.timedelta64 = np.timedelta64(1, 'h'),
104103
preprocessing_fn: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
105-
interpolation: Optional[interpolations.Interpolation] = None,
106-
process_chunk_fn: Optional[Callable[[xr.Dataset], xr.Dataset]] = None,
104+
**kwargs,
107105
):
108106
"""Init.
109107
@@ -123,10 +121,6 @@ def __init__(
123121
dropna: Whether to drop missing values. If split_variables is True, values
124122
will be dropped for each variable separately. Otherwise, only indices
125123
where all variables are non-NaN will be returned.
126-
add_nan_mask: Adds a boolean coordinate named 'mask' to each variable
127-
(variables will be split into DataArrays if they aren't already), with
128-
False indicating NaN values. To be used for masked aggregation. Default:
129-
False.
130124
tolerance: (Optional) Tolerance around the given valid time. If tolerance
131125
is a single timedelta, data within valid_time +/- tolerance will be
132126
returned. If tolerance is a 2-tuple of timedeltas, data within
@@ -153,16 +147,12 @@ def __init__(
153147
1h
154148
preprocessing_fn: (Optional) Function to apply to the dataframe after
155149
reading.
156-
interpolation: (Optional) Interpolation to be applied to the data.
157-
process_chunk_fn: (Optional) Function to apply to the chunk of data after
158-
loading.
150+
**kwargs: Additional keyword arguments passed to the base DataLoader.
159151
"""
160152

161153
super().__init__(
162-
interpolation=interpolation,
163154
compute=False, # Data is already loaded.
164-
add_nan_mask=add_nan_mask,
165-
process_chunk_fn=process_chunk_fn,
155+
**kwargs
166156
)
167157
self._path = path
168158
if partitioned_by not in ['hour', 'day', 'month']:
@@ -479,7 +469,6 @@ def __init__(
479469
time_dim: str,
480470
split_variables: bool = False,
481471
dropna: bool = False,
482-
add_nan_mask: bool = False,
483472
tolerance: Optional[np.timedelta64] = None,
484473
partitioned_by: str = 'month',
485474
rename_variables: Optional[Mapping[str, str]] = None,
@@ -488,8 +477,7 @@ def __init__(
488477
pick_closest_duplicate_by: Optional[str] = None,
489478
file_tolerance: np.timedelta64 = np.timedelta64(1, 'h'),
490479
preprocessing_fn: Optional[Callable[[pd.DataFrame], pd.DataFrame]] = None,
491-
interpolation: Optional[interpolations.Interpolation] = None,
492-
process_chunk_fn: Optional[Callable[[xr.Dataset], xr.Dataset]] = None,
480+
**kwargs,
493481
):
494482
def metar_preprocessing_fn(
495483
df: pd.DataFrame,
@@ -521,7 +509,6 @@ def metar_preprocessing_fn(
521509
observation_dim='stationName',
522510
split_variables=split_variables,
523511
dropna=dropna,
524-
add_nan_mask=add_nan_mask,
525512
tolerance=tolerance,
526513
partitioned_by=partitioned_by,
527514
rename_variables=METAR_TO_ERA5_NAMES,
@@ -532,6 +519,5 @@ def metar_preprocessing_fn(
532519
preprocessing_fn=functools.partial(
533520
metar_preprocessing_fn, preprocessing_fn=preprocessing_fn
534521
),
535-
interpolation=interpolation,
536-
process_chunk_fn=process_chunk_fn,
522+
**kwargs,
537523
)

weatherbenchX/data_loaders/xarray_loaders.py

Lines changed: 7 additions & 21 deletions
Original file line number · Diff line number · Diff line change
@@ -13,12 +13,13 @@
1313
# limitations under the License.
1414
"""Data loaders for reading gridded Zarr files."""
1515

16-
from typing import Any, Callable, Hashable, Iterable, Mapping, Optional, Union
16+
from collections.abc import Hashable
17+
from typing import Any, Callable, Iterable, Mapping, Optional, Union
18+
19+
from absl import logging
1720
import numpy as np
18-
from weatherbenchX import interpolations
1921
from weatherbenchX.data_loaders import base
2022
import xarray as xr
21-
from absl import logging
2223

2324

2425
def _rename_dataset(
@@ -63,11 +64,8 @@ def __init__(
6364
rename_dimensions: Optional[Union[Mapping[str, str], str]] = 'ecmwf',
6465
automatically_convert_lat_lon_to_latitude_longitude: bool = True,
6566
rename_variables: Optional[Mapping[str, str]] = None,
66-
interpolation: Optional[interpolations.Interpolation] = None,
67-
compute: bool = True,
68-
add_nan_mask: bool = False,
6967
preprocessing_fn: Optional[Callable[[xr.Dataset], xr.Dataset]] = None,
70-
process_chunk_fn: Optional[Callable[[xr.Dataset], xr.Dataset]] = None,
68+
**kwargs,
7169
):
7270
"""Init.
7371
@@ -91,16 +89,9 @@ def __init__(
9189
automatically convert 'lat' and 'lon' dimensions to 'latitude' and
9290
'longitude'. Default: True.
9391
rename_variables: (Optional) Dictionary of variables to rename.
94-
interpolation: (Optional) Interpolation instance.
95-
compute: Whether to load data into memory. Default: True.
96-
add_nan_mask: Adds a boolean coordinate named 'mask' to each variable
97-
(variables will be split into DataArrays if they aren't already), with
98-
False indicating NaN values. To be used for masked aggregation. Default:
99-
False.
10092
preprocessing_fn: (Optional) A function that is applied to the dataset
10193
right after it is opened.
102-
process_chunk_fn: (Optional) A function that is applied to each chunk
103-
after loading, interpolation and compute, but before computing a mask.
94+
**kwargs: Keyword arguments to pass to base.DataLoader.
10495
"""
10596
if path is not None and ds is not None:
10697
raise ValueError('Only one of path or ds can be specified, not both.')
@@ -120,12 +111,7 @@ def __init__(
120111
self._preprocessing_fn = preprocessing_fn
121112

122113
self._preprocessed = False
123-
super().__init__(
124-
interpolation=interpolation,
125-
compute=compute,
126-
add_nan_mask=add_nan_mask,
127-
process_chunk_fn=process_chunk_fn,
128-
)
114+
super().__init__(**kwargs)
129115

130116
def maybe_prepare_dataset(self):
131117
"""Prepares the dataset (reads and preprocesses it, if not already done)."""

0 commit comments

Comments (0)