diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py index c2e0b44..2c9e477 100644 --- a/lib/ncdata/utils/__init__.py +++ b/lib/ncdata/utils/__init__.py @@ -2,10 +2,13 @@ from ._compare_nc_datasets import dataset_differences, variable_differences from ._copy import ncdata_copy +from ._dim_indexing import Slicer, index_by_dimensions from ._save_errors import save_errors __all__ = [ + "Slicer", "dataset_differences", + "index_by_dimensions", "ncdata_copy", "save_errors", "variable_differences", diff --git a/lib/ncdata/utils/_dim_indexing.py b/lib/ncdata/utils/_dim_indexing.py new file mode 100644 index 0000000..7f72419 --- /dev/null +++ b/lib/ncdata/utils/_dim_indexing.py @@ -0,0 +1,163 @@ +from numbers import Number +from typing import Any, List, Mapping, Union + +import dask.array as da +from ncdata import NcData +from ncdata.utils import ncdata_copy + + +def index_by_dimensions( + ncdata: NcData, **dim_index_kwargs: Mapping[str, Any] +) -> NcData: + """ + Index an NcData over dimensions. + + Parameters + ---------- + ncdata + input data + dim_index_kwargs + specify indexing to apply to dimensions. + E.G. ``x=1``, ``time=slice(0, 100)``, ``levels=[1,2,5]``. + + Returns + ------- + copy of input with dimensions, and all relevant variables, sub-indexed. + + Notes + ----- + Where a dimension key is a single value, the dimension will be *removed*. + This mimics how numpy arrays behave, i.e. the difference between a[1] and a[1:2] + + Examples + -------- + ncdata = index_by_dimensions(ncdata, time=slice(0, 10)) # equivalent to [:10] + ncdata = index_by_dimensions(ncdata, levels=[1,2,5]) + ncdata = index_by_dimensions(ncdata, time=3, levels=slice(2, 10, 3)) + + See Also + -------- + :class:`Slicer` provides the same function with a slicing syntax + """ + # Start by copying the input : then modify that in-place + ncdata = ncdata_copy(ncdata) + for dim_name, key in dim_index_kwargs.items(): + # Dimension names must occur in the ncdata. + dimension = ncdata.dimensions.get(dim_name) + if dimension is None: + raise ValueError( + f"Dimension {dim_name!r} is not present in 'ncdata'." + ) + + # Check for and fail repeated dimensions: the meaning would be unclear (!) + matches = [name for name in dim_index_kwargs if name == dim_name] + if len(matches) > 1: + msg = ( + f"Dimensions to index, {tuple(dim_index_kwargs.keys())}, " + f"includes dimension {dim_name!r} more than once." + ) + raise ValueError(msg) + + # Hopefully this replicates how numpy makes this decision? + remove_dim = isinstance(key, Number) + + # TODO: + # Key types must be supported: + # * int (or other numeric, including numpy scalars ?) + # * list of int + # * slice object + # * 1-D array of numeric + # Key "special" types we could possibly error or convert, to avoid confusion + # with numpy behaviours ? : + # arrays, tuples, booleans, None, newaxis, ellipsis ... + + # Index the data of all referencing variables + for var in ncdata.variables.values(): + if dim_name in var.dimensions: + # construct a list of slice objects + (i_slicedim,) = [ + i + for i, name in enumerate(var.dimensions) + if name == dim_name + ] + slices = [slice(None) for dim in var.dimensions] + slices[i_slicedim] = key + + # index the data + var.data = var.data[tuple(slices)] + + # also remove the dim, if it will be removed + if remove_dim: + # N.B. can't 'del' a tuple item + var.dimensions = tuple( + dim for dim in var.dimensions if dim != dim_name + ) + + # Remove or reduce the dimension itself. + if remove_dim: + del ncdata.dimensions[dim_name] + else: + # calculate the new dim size, using numpy-like logic + # TODO: there is probably a better way of calculating this ? + new_size = da.zeros(dimension.size)[key].shape[0] + dimension.size = new_size + + return ncdata + + +class Slicer: + """ + An object which can index an NcData over its dimensions. + + This wraps the :meth:`index_by_dimensions` method for convenience, returning an + object which supports the Python extended slicing syntax. + + Examples + -------- + data = Slicer(ncdata, 'time')[:10] + data = Slicer(ncdata, 'level')[[1, 2, 5]] + data = Slicer(ncdata, 'level', 'time', 'x', 'y')[1, :3, 2:10:3, ::-1] + """ + + def __init__(self, ncdata: NcData, dimensions: Union[str, List[str]]): + """ + Create an indexer for an NcData, applying to specific dimensions. + + This can then be indexed to produce a derived (sub-indexed) dataset. + + Parameters + ---------- + ncdata + input data + dimensions + one or more dimension names, to which successive index keys will be applied + """ + self.ncdata = ncdata + if isinstance(dimensions, str): + dimensions = [dimensions] + self.dim_names = tuple(dimensions) + + def __getitem__(self, keys) -> NcData: + """ + Return an indexed portion of self.ncdata. + + Index with 'keys' in the specified dimensions. + """ + if not isinstance(keys, tuple): + # Single key, e.g. 1, slice(None), [2,3,4], array([2,3]) + # N.B. *otherwise* keys is always a tuple + # A single tuple argument is passed as-is, i.e. interprets as multiple keys + keys = [keys] + + n_keys = len(keys) + if len(keys) > len(self.dim_names): + msg = ( + f"Too many index keys, {n_keys}, for the specified indexing dimension " + "names, {self.dim_names!r}." + ) + raise ValueError(msg) + + # NB too *few* keys is not a problem, since 'zip' truncates for us. + dim_kwargs = {name: key for name, key in zip(self.dim_names, keys)} + + return index_by_dimensions(self.ncdata, **dim_kwargs) diff --git a/tests/unit/utils/dim_indexing/__init__.py b/tests/unit/utils/dim_indexing/__init__.py new file mode 100644 index 0000000..600c9be --- /dev/null +++ b/tests/unit/utils/dim_indexing/__init__.py @@ -0,0 +1,83 @@ +"""Unit tests for :mod:`ncdata.utils._dim_indexing`.""" + +import numpy as np +from ncdata import NcData, NcDimension, NcVariable + + +def dims_test_data(x, y): + """ + Generate a test dataset that we can slice in various ways. + + If 'x' or 'y' are lists (~arrays) then make an X/Y dimension with thesee values, + otherwise if they are a scalar value then not, mimicking numpy indexing away of a + dimension. + + Also add a separate Z dimension, plus variables mapping every combination of + X,Y,Z and no dimension. + + Also include some Groups and Attributes to ensure preservation of all structure and + metadata. + """ + data = NcData( + dimensions=[NcDimension("Z", 2)], + variables=[ + NcVariable("var_0", [], data=np.array(1.2)), + NcVariable( + "var_Z", + ["Z"], + data=np.linspace(1.3, 2.3, 2), + attributes={"role": "Z dim var"}, + ), + ], + attributes={"global": "common_value"}, + ) + + x, y = [np.asarray(key) for key in (x, y)] + + data.variables.add( + NcVariable( + "yvals", + ["Y"] if y.ndim else [], + data=2.2 * np.ones(y.shape), + attributes={"role": "y values", "units": "ft"}, + ) + ) + if y.ndim: + data.dimensions.add(NcDimension("Y", len(y))) + ydims = ["Y"] + else: + ydims = [] + + data.variables.add( + NcVariable("Y", ydims, data=y, attributes={"role": "Y dim var"}) + ) + + data.variables.add( + NcVariable( + "xvals", + ["X"] if x.ndim else [], + data=3.3 * np.ones(x.shape), + attributes={"role": "x values", "units": "m"}, + ) + ) + if x.ndim: + data.dimensions.add(NcDimension("X", len(x))) + xdims = ["X"] + else: + xdims = [] + + data.variables.add( + NcVariable("X", xdims, data=x, attributes={"role": "X dim var"}) + ) + xydims = ["Z"] + (["Y"] if y.ndim else []) + (["X"] if x.ndim else []) + shape = [2] + ([len(y)] if y.ndim else []) + ([len(x)] if x.ndim else []) + data.variables.add( + NcVariable( + "zyx_vals", + xydims, + data=np.ones(shape), + attributes={"role": "data"}, + ) + ) + + return data diff --git a/tests/unit/utils/dim_indexing/test_Slicer.py b/tests/unit/utils/dim_indexing/test_Slicer.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/unit/utils/dim_indexing/test_index_by_dimensions.py b/tests/unit/utils/dim_indexing/test_index_by_dimensions.py new file mode 100644 index 0000000..f11625d --- /dev/null +++ b/tests/unit/utils/dim_indexing/test_index_by_dimensions.py @@ -0,0 +1,85 @@ +"""Tests for class :class:`ncdata.utils.index_by_dimension`. + +This is the more direct indexer approach. +""" + +import numpy as np +import pytest +from ncdata.utils import dataset_differences, index_by_dimensions + +from . import dims_test_data + +_NUL = slice(None) +_SLICE_OPTS = { + "empty": (_NUL, _NUL), + "singleX": (1, _NUL), + "singleY": (_NUL, 2), + "rangeX": (slice(2, 4), _NUL), + "rangeY": (_NUL, slice(1, 3)), + "picked": ([0, 1, 3], _NUL), + "openL": (slice(None, 3), _NUL), + "openR": (slice(3, None), _NUL), + "dual": (slice(1, 3), slice(3, None)), + "minus": (2, slice(-2, None)), +} +_OPT_NAMES = list(_SLICE_OPTS.keys()) +_OPT_VALUES = list(_SLICE_OPTS.values()) + + +@pytest.fixture(params=_OPT_VALUES, ids=_OPT_NAMES) +def slices(request): + return request.param + + +def data_1d(x): + data = dims_test_data(x=x, y=1) + # prune awway anything referring to + del data.dimensions["Z"] + for varname, var in list(data.variables.items()): + if any(dim in var.dimensions for dim in ("Y", "Z")): + data.variables.pop(varname) + return data + + +# One-dimensional testcases +_1D_OPTS = { + "empty": _NUL, + "single": 1, + "range": slice(2, 4), + "picked": [0, 1, 3], + "openL": slice(None, 3), + "openR": slice(3, None), + "minus": slice(-2, None), +} +_1D_OPT_NAMES = list(_1D_OPTS.keys()) +_1D_OPT_VALUES = list(_1D_OPTS.values()) + + +@pytest.fixture(params=_1D_OPT_VALUES, ids=_1D_OPT_NAMES) +def x_slices(request): + return request.param + + +def test_1d_index(x_slices): + x_base = np.arange(5.0) + data = data_1d(x_base) + result = index_by_dimensions(data, X=x_slices) + + expect_x = x_base[x_slices] + expect_data = data_1d(expect_x) + + assert dataset_differences(result, expect_data) == [] + + +def test_2d_index(slices): + x_inds, y_inds = slices + x_base, y_base = np.arange(5.0), np.arange(4.0) + + data = dims_test_data(x=x_base, y=y_base) + result = index_by_dimensions(data, X=x_inds, Y=y_inds) + + expect_x = x_base[x_inds] + expect_y = y_base[y_inds] + expect_data = dims_test_data(expect_x, expect_y) + + assert dataset_differences(result, expect_data) == []