WIP - untested.

pp-mo · pp-mo · commit 500407e259a7 · 2025-01-09T17:41:18.000Z
diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py
@@ -1,10 +1,13 @@
 """General user utility functions."""
 from ._compare_nc_datasets import dataset_differences, variable_differences
 from ._copy import ncdata_copy
+from ._dim_indexing import Slicer, index_by_dimensions
 from ._save_errors import save_errors
 
 __all__ = [
+    "Slicer",
     "dataset_differences",
+    "index_by_dimensions",
     "ncdata_copy",
     "save_errors",
     "variable_differences",
diff --git a/lib/ncdata/utils/_dim_indexing.py b/lib/ncdata/utils/_dim_indexing.py
@@ -0,0 +1,161 @@
+from numbers import Number
+from typing import Any, List, Mapping, Union
+
+import dask.array as da
+
+from ncdata import NcData
+from ncdata.utils import ncdata_copy
+
+
+def index_by_dimensions(
+    ncdata: NcData, **dim_index_kwargs: Mapping[str, Any]
+) -> NcData:
+    """
+    Index an NcData over dimensions.
+
+    Parameters
+    ----------
+    ncdata
+        input data
+    dim_index_kwargs
+        specify indexing to apply to dimensions.
+        E.G. ``x=1``, ``time=slice(0, 100)``, ``levels=[1,2,5]``.
+
+    Returns
+    -------
+        copy of input with dimensions, and all relevant variables, sub-indexed.
+
+    Notes
+    -----
+    Where a dimension key is a single value, the dimension will be *removed*.
+    This mimics how numpy arrays behave, i.e. the difference between a[1] and a[1:2]
+
+    Examples
+    --------
+    ncdata = index_by_dimensions(ncdata, time=slice(0, 10))  # equivalent to [:10]
+    ncdata = index_by_dimensions(ncdata, levels=[1,2,5])
+    ncdata = index_by_dimensions(ncdata, time=3, levels=slice(2, 10, 3))
+
+    See Also
+    --------
+    :class:`Slicer` provides the same function with a slicing syntax
+    """
+    # Start by copying the input : then modify that in-place
+    ncdata = ncdata_copy(ncdata)
+    for dim_name, key in dim_index_kwargs.items():
+        # Dimension names must occur in the ncdata.
+        dimension = ncdata.dimensions.get(dim_name)
+        if dimension is None:
+            raise ValueError(
+                f"Dimension {dim_name!r} is not present in 'ncdata'."
+            )
+
+        # Check for and fail repeated dimensions: the meaning would be unclear (!)
+        matches = [name for name in dim_index_kwargs if name == dim_name]
+        if len(matches) > 1:
+            msg = (
+                f"Dimensions to index, {tuple(dim_index_kwargs.keys())}, "
+                f"includes dimension {dim_name!r} more than once."
+            )
+            raise ValueError(msg)
+
+        # Hopefully this replicates how numpy makes this decision?
+        remove_dim = isinstance(key, Number)
+
+        # TODO:
+        #   Key types must be supported:
+        #       * int (or other numeric, including numpy scalars ?)
+        #       * list of int
+        #       * slice object
+        #       * 1-D array of numeric
+        #   Key "special" types we could possibly error or convert, to avoid confusion
+        #   with numpy behaviours ? :
+        #       arrays, tuples, booleans, None, newaxis, ellipsis ...
+
+        # Index the data of all referencing variables
+        for var in ncdata.variables.values():
+            if dim_name in var.dimensions:
+                # construct a list of slice objects
+                (i_slicedim,) = [
+                    i
+                    for i, name in enumerate(var.dimensions)
+                    if name == dim_name
+                ]
+                slices = [slice(None) for dim in var.dimensions]
+                slices[i_slicedim] = key
+
+                # index the data
+                var.data = var.data[tuple(slices)]
+
+                # also remove the dim, if it will be removed
+                if remove_dim:
+                    del var.dimensions[dim_name]
+
+        # Remove or reduce the dimension itself.
+        if remove_dim:
+            del ncdata.dimensions[dim_name]
+        else:
+            # calculate the new dim size, using numpy-like logic
+            # TODO: there is probably a better way of calculating this ?
+            new_size = da.zeros(dimension.size)[key].shape[0]
+            dimension.size = new_size
+
+    return ncdata
+
+
+class Slicer:
+    """
+    An object which can index an NcData over its dimensions.
+
+    This wraps the :meth:`index_by_dimensions` method for convenience, returning an
+    object which supports the Python extended slicing syntax.
+
+    Examples
+    --------
+    data = Slicer(ncdata, 'time')[:10]
+    data = Slicer(ncdata, 'level')[[1, 2, 5]]
+    data = Slicer(ncdata, 'level', 'time', 'x', 'y')[1, :3, 2:10:3, ::-1]
+    """
+
+    def __init__(self, ncdata: NcData, dimensions: Union[str, List[str]]):
+        """
+        Create an indexer for an NcData, applying to specific dimensions.
+
+        This can then be indexed to produce a derived (sub-indexed) dataset.
+
+        Parameters
+        ----------
+        ncdata
+            input data
+        dimensions
+            one or more dimension names, to which successive index keys will be applied
+        """
+        self.ncdata = ncdata
+        if isinstance(dimensions, str):
+            dimensions = [dimensions]
+        self.dim_names = tuple(dimensions)
+
+    def __getitem__(self, keys) -> NcData:
+        """
+        Return an indexed portion of self.ncdata.
+
+        Index with 'keys' in the specified dimensions.
+        """
+        if not isinstance(keys, tuple):
+            # Single key, e.g. 1, slice(None), [2,3,4], array([2,3])
+            # N.B. *otherwise* keys is always a tuple
+            # A single tuple argument is passed as-is, i.e. interprets as multiple keys
+            keys = [keys]
+
+        n_keys = len(keys)
+        if len(keys) > len(self.dim_names):
+            msg = (
+                f"Too many index keys, {n_keys}, for the specified indexing dimension "
+                "names, {self.dim_names!r}."
+            )
+            raise ValueError(msg)
+
+        # NB too *few* keys is not a problem, since 'zip' truncates for us.
+        dim_kwargs = {name: key for name, key in zip(self.dim_names, keys)}
+
+        return index_by_dimensions(self.ncdata, **dim_kwargs)