diff --git a/docs/Makefile b/docs/Makefile index 387e70e..bb4252c 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -19,7 +19,7 @@ allapi: sphinx-apidoc -Mfe -o ./details/api ../lib/ncdata towncrier: - towncrier build --keep + towncrier build --yes # Tweaked "make html", which restores the changelog state after docs build. diff --git a/docs/changelog_fragments/161.doc.rst b/docs/changelog_fragments/161.doc.rst new file mode 100644 index 0000000..3579c04 --- /dev/null +++ b/docs/changelog_fragments/161.doc.rst @@ -0,0 +1 @@ +Added a `userguide page `_ summarising all the utility features in :mod:`ncdata.utils`. diff --git a/docs/changelog_fragments/166.feat.rst b/docs/changelog_fragments/166.feat.rst new file mode 100644 index 0000000..d80c79d --- /dev/null +++ b/docs/changelog_fragments/166.feat.rst @@ -0,0 +1,3 @@ +Provide exact == and != for datasets and variables, by just calling the difference utilities. +This can be inefficient, but is simple to understand and generally useful. +See: :ref:`equality_testing` diff --git a/docs/changelog_fragments/68.feat.rst b/docs/changelog_fragments/68.feat.rst new file mode 100644 index 0000000..20eae62 --- /dev/null +++ b/docs/changelog_fragments/68.feat.rst @@ -0,0 +1,6 @@ +Added the ability to extract a sub-region by indexing/slicing over dimensions. +The :class:`ncdata.NcData` objects can be indexed with the ``[]`` operation, or over +specified dimensions with the :meth:`~ncdata.NcData.slicer` method. +This is based on the new :meth:`~ncdata.utils.index_by_dimensions()` utility method +and :class:`~ncdata.utils.Slicer` class. 
+See: :ref:`indexing_overview` \ No newline at end of file diff --git a/docs/userdocs/user_guide/common_operations.rst b/docs/userdocs/user_guide/common_operations.rst index 3e484c3..0637a69 100644 --- a/docs/userdocs/user_guide/common_operations.rst +++ b/docs/userdocs/user_guide/common_operations.rst @@ -73,6 +73,8 @@ Example : The utility function :func:`~ncdata.utils.rename_dimension` is provided for this. See : :ref:`howto_rename_dimension`. +.. _copy_notes: + Copying ------- All core objects support a ``.copy()`` method. See for instance @@ -115,23 +117,24 @@ For real data, this is just ``var.data = var.data.copy()``. There is also a utility function :func:`ncdata.utils.ncdata_copy` : This is effectively the same thing as the NcData object :meth:`~ncdata.NcData.copy` method. +.. _equality_testing: + +Equality Testing +---------------- +We implement equality operations ``==`` / ``!=`` for all the core data objects. -Equality Checking ------------------ -We provide a simple, comprehensive ``==`` check for :mod:`~ncdata.NcDimension` and -:mod:`~ncdata.NcAttribute` objects, but not at present :mod:`~ncdata.NcVariable` or -:mod:`~ncdata.NcData`. +However, simple equality testing on :class:`~ncdata.NcData` and :class:`~ncdata.NcVariable` +objects can be very costly if it requires comparing large data arrays. -So, using ``==`` on :mod:`~ncdata.NcVariable` or :mod:`~ncdata.NcData` objects -will only do an identity check -- that is, it tests ``id(A) == id(B)``, or ``A is B``. +If you need to avoid comparing large (and possibly lazy) arrays then you can use the +:func:`ncdata.utils.dataset_differences` and +:func:`ncdata.utils.variable_differences` utility functions. +These functions also provide multiple options to enable more tolerant comparison, +such as allowing variables to have a different ordering. 
-However, these objects **can** be properly compared with the dataset comparison -utilities, :func:`ncdata.utils.dataset_differences` and -:func:`ncdata.utils.variable_differences`. By default, these operations are very -comprehensive and may be very costly for instance comparing large data arrays, but they -also allow more nuanced and controllable checking, e.g. to skip data array comparisons -or ignore variable ordering. +See: :ref:`utils_equality` +.. _object_creation: Object Creation --------------- @@ -184,8 +187,7 @@ The result is the same: .. doctest:: python - >>> from ncdata.utils import dataset_differences - >>> print(dataset_differences(data1, data2)) - [] + >>> data1 == data2 + True diff --git a/docs/userdocs/user_guide/howtos.rst b/docs/userdocs/user_guide/howtos.rst index 48e967c..f821817 100644 --- a/docs/userdocs/user_guide/howtos.rst +++ b/docs/userdocs/user_guide/howtos.rst @@ -288,7 +288,7 @@ attribute already exists or not. .. Note:: Assigning attributes when *creating* a dataset, variable or group is somewhat - simpler, discussed :ref:`here `. + simpler, discussed :ref:`here <object_creation>`. .. _howto_create_variable: @@ -356,6 +356,66 @@ It can be freely overwritten by the user. valid dimensions, and that ``.data`` arrays match the dimensions. +.. _howto_copy: + +Make a copy of data +------------------- +Use the :meth:`ncdata.NcData.copy` method to make a copy. + +.. doctest:: + + >>> data2 = data.copy() + >>> data == data2 + True + +Note that this creates all-new independent ncdata objects, but all variable data arrays +will be linked to the originals (to avoid making copies). + +See: :ref:`copy_notes` + +.. _howto_slice: + +Extract a subsection by indexing +-------------------------------- +The nicest way is usually just to use the :meth:`~ncdata.NcData.slicer` method to specify +dimensions to index, and then index the result. + +.. 
testsetup:: + + >>> from ncdata import NcData, NcDimension + >>> from ncdata.utils import Slicer + >>> full_data = NcData(dimensions=[NcDimension("x", 7), NcDimension("y", 6)]) + >>> for nn, dim in full_data.dimensions.items(): + ... full_data.variables.add(NcVariable(nn, dimensions=[nn], data=np.arange(dim.size))) + +.. doctest:: + + >>> for dimname in full_data.dimensions: + ... print(dimname, ':', full_data.variables[dimname].data) + x : [0 1 2 3 4 5 6] + y : [0 1 2 3 4 5] + +.. doctest:: + + >>> data_region = full_data.slicer("y", "x")[3, 1::2] + +.. doctest:: + + >>> for dimname in data_region.dimensions: + ... print(dimname, ':', data_region.variables[dimname].data) + x : [1 3 5] + +You can also slice data directly, which simply acts on the dimensions in order: + +.. doctest:: + + >>> data_region_2 = full_data[1::2, 3] + >>> data_region_2 == data_region + True + +See: :ref:`indexing_overview` + + Read data from a NetCDF file ---------------------------- Use the :func:`ncdata.netcdf4.from_nc4` function to load a dataset from a netCDF file. @@ -658,8 +718,7 @@ In fact, there should be NO difference between these two. .. 
doctest:: python - >>> from ncdata.utils import dataset_differences - >>> print(dataset_differences(data, data2) == []) + >>> data == data2 True diff --git a/docs/userdocs/user_guide/user_guide.rst b/docs/userdocs/user_guide/user_guide.rst index 12b6cbe..8a0ed82 100644 --- a/docs/userdocs/user_guide/user_guide.rst +++ b/docs/userdocs/user_guide/user_guide.rst @@ -9,5 +9,6 @@ Detailed explanations, beyond the basic tutorial-style introductions design_principles data_objects common_operations + utilities general_topics howtos diff --git a/docs/userdocs/user_guide/utilities.rst b/docs/userdocs/user_guide/utilities.rst new file mode 100644 index 0000000..859bb25 --- /dev/null +++ b/docs/userdocs/user_guide/utilities.rst @@ -0,0 +1,230 @@ +Utilities and Conveniences +========================== +This section provides a short overview of various more involved operations which are +provided in the :mod:`~ncdata.utils` module. In all cases, more detail is available in +the `API pages <../../details/api/ncdata.utils.html>`_ + +Rename Dimensions +----------------- +The :func:`~ncdata.utils.rename_dimension` utility does this, in a way which ensures a +safe and consistent result. + +.. _utils_equality: + +Dataset Equality Testing +------------------------ +The function :func:`~ncdata.utils.dataset_differences` produces a list of messages +detailing all the ways in which two datasets are different. + +For Example: +^^^^^^^^^^^^ +.. testsetup:: + + >>> from ncdata import NcData, NcDimension, NcVariable + >>> from ncdata.utils import dataset_differences + >>> import numpy as np + +.. doctest:: + + >>> data1 = NcData( + ... dimensions=[NcDimension("x", 5)], + ... variables=[NcVariable("vx", dimensions=["x"], data=np.arange(5))] + ... ) + >>> data2 = data1.copy() + >>> print(dataset_differences(data1, data2)) + [] + +.. doctest:: + + >>> data2.dimensions["x"].unlimited = True + >>> data2.variables["vx"].data = np.array([1, 3]) # NB must be a *new* array ! + +.. 
doctest:: + + >>> diffs = dataset_differences(data1, data2) + >>> for msg in diffs: + ... print(msg) + Dataset "x" dimension has different "unlimited" status : False != True + Dataset variable "vx" shapes differ : (5,) != (2,) + +.. note:: + To compare isolated variables, a subsidiary routine + :func:`~ncdata.utils.variable_differences` is also provided. + +.. note:: + The ``==`` and ``!=`` operations on :class:`ncdata.NcData` and + :class:`ncdata.NcVariable` are implemented to call these utility functions. + However, lacking a keyword interface to enable any tolerance options, the operations + compare absolutely everything, and so can be very performance intensive if large data + arrays are present. + +.. _indexing_overview: + +Sub-indexing +------------ +A new dataset can be derived by indexing over dimensions, analogous to sub-indexing +an array. This operation indexes all the variables appropriately, to produce a new +independent dataset which is complete and self-consistent. + +The basic indexing operation is provided in three forms: + +#. the :func:`~ncdata.utils.index_by_dimensions` function provides the basic operation +#. the :class:`~ncdata.utils.Slicer` objects allow indexing with a slicing syntax +#. the :meth:`ncdata.NcData.slicer` and ``NcData.__getitem__`` methods allow a neater syntax + for slicing datasets directly + +.. note:: + The simplest way is usually to use the :class:`~ncdata.NcData` methods. + See: :ref:`howto_slice` + +Indexing function +^^^^^^^^^^^^^^^^^ +The function :func:`~ncdata.utils.index_by_dimensions` provides indexing where the +indices are passed as keywords for each named dimension. + +For example: + +.. testsetup:: + + >>> from ncdata.utils import index_by_dimensions + +.. doctest:: + + >>> data = NcData( + ... dimensions=[NcDimension("y", 4), NcDimension("x", 10)], + ... variables=[NcVariable( + ... "v1", dimensions=["y", "x"], + ... data=np.arange(40).reshape((4, 10)) + ... )] + ... ) + +.. 
doctest:: + + >>> subdata_A = index_by_dimensions(data, x=2) + >>> print(subdata_A) + <NcData: <'no-name'> + dimensions: + y = 4 + <BLANKLINE> + variables: + <NcVariable(int64): v1(y)> + > + >>> print(subdata_A.variables["v1"].data) + [ 2 12 22 32] + + >>> subdata_B = index_by_dimensions(data, y=slice(0, 2), x=[4, 1, 2]) + >>> print(subdata_B) + <NcData: <'no-name'> + dimensions: + y = 2 + x = 3 + <BLANKLINE> + variables: + <NcVariable(int64): v1(y, x)> + > + >>> print(subdata_B.variables["v1"].data) + [[ 4 1 2] + [14 11 12]] + + +Slicing syntax +^^^^^^^^^^^^^^ +The :class:`~ncdata.utils.Slicer` class is provided to enable the same operation to be +expressed using multi-dimensional slicing syntax. + +A Slicer is created by specifying an NcData and a list of dimensions, ``Slicer(data, *dim_names)``. + +If **no dim-names** are specified, this defaults to all dimensions of the NcData in order, +i.e. ``Slicer(data, *data.dimensions)``. + +A ``Slicer`` object is re-usable, and supports the numpy-like extended slicing syntax, +i.e. keys of the form "a:b:c". + +So for example, the above examples are more neatly expressed like this ... + +.. testsetup:: + + >>> from ncdata.utils import Slicer + +.. doctest:: + + >>> data_slicer = Slicer(data, "x", "y") + >>> subdata_A_2 = data_slicer[2] # equivalent to ibd(data, x=2) + >>> subdata_B_2 = data_slicer[[4, 1, 2], :2] # equivalent to ibd(data, x=[4, 1, 2], y=slice(0, 2)) + +.. doctest:: + + >>> subdata_A == subdata_A_2 + True + >>> subdata_B == subdata_B_2 + True + + +NcData direct indexing +^^^^^^^^^^^^^^^^^^^^^^ +The NcData ``NcData.__getitem__`` and :meth:`~ncdata.NcData.slicer` methods +provide a more concise way of slicing data (which is nevertheless still the same +operation, functionally). + +This is explained by the simple equivalences: + + ``data.slicer(*dims)`` === ``Slicer(data, *dims)`` + +and + + ``data[*keys]`` === ``data.slicer()[*keys]`` + + +So, for example, the above examples can also be written ... + +.. 
doctest:: + + >>> subdata_A_3 = data.slicer("x")[2] + >>> subdata_A_4 = data[:, 2] + >>> subdata_A_3 == subdata_A_4 == subdata_A + True + +.. doctest:: + + >>> subdata_B_3 = data.slicer("x", "y")[[4, 1, 2], :2] + >>> subdata_B_4 = data[:2, [4, 1, 2]] + >>> subdata_B_3 == subdata_B_4 == subdata_B + True + + +Consistency Checking +-------------------- +The :func:`~ncdata.utils.save_errors` function provides a general +correctness-and-consistency check. + +For example: + +.. testsetup:: + + >>> from ncdata.utils import save_errors + +.. doctest:: + + >>> data_bad = data.copy() + >>> array = data_bad.variables["v1"].data + >>> data_bad.variables["v1"].data = array[:2] + >>> data_bad.variables.add(NcVariable("q", data={"x": 4})) + +.. doctest:: + + >>> for msg in save_errors(data_bad): + ... print(msg) + Variable 'v1' data shape = (2, 10), does not match that of its dimensions = (4, 10). + Variable 'q' has a dtype which cannot be saved to netcdf : dtype('O'). + + +See : :ref:`correctness-checks` + + +Data Copying +------------ +The :func:`~ncdata.utils.ncdata_copy` function makes structural copies of datasets. +However, this can easily be accessed as :meth:`ncdata.NcData.copy`, which is the same +operation. + +See: :ref:`copy_notes` \ No newline at end of file diff --git a/lib/ncdata/_core.py b/lib/ncdata/_core.py index 7ec4c53..7e4bb1e 100644 --- a/lib/ncdata/_core.py +++ b/lib/ncdata/_core.py @@ -471,6 +471,50 @@ def copy(self): return ncdata_copy(self) + # Provide a slicing interface, by just linking to ncdata.utils._dim_indexing code. + def slicer(self, *dim_names): + """ + Make a :class:`~ncdata.utils.Slicer` object to index the data. + + This creates a slicer which can then be indexed to sub-index the data. + See: :ref:`howto_slice` + + Parameters + ---------- + dim_names: list(str) + Names of dimensions to slice. + + Returns + ------- + :class:`~ncdata.utils.Slicer` + + Examples + -------- + .. 
testsetup:: + >>> from ncdata._core import NcData, NcDimension + >>> ncdata = NcData(dimensions=[NcDimension('x', 4), NcDimension('y', 5)]) + + >>> subregion = ncdata.slicer('x', 'y')[3, 2:4] + """ + from ncdata.utils import Slicer + + return Slicer(self, *dim_names) + + def __getitem__(self, keys): # noqa: D105 + # Pass 'keys' on unchanged : Slicer.__getitem__ accepts a single key or a tuple. + # N.B. star-unpacking here would fail for a single int/slice key, and would + # wrongly split a single list key into several separate keys. + return self.slicer()[keys] + + # Define equality in terms of dataset comparison utility + def __eq__(self, other): # noqa: D105 + if id(other) == id(self): + result = True + elif not isinstance(other, NcData): + result = False + else: + from ncdata.utils import dataset_differences + + result = dataset_differences(self, other) == [] + return result + class NcDimension: """ @@ -628,6 +672,18 @@ def copy(self): ) return var + # Define equality in terms of variable comparison utility + def __eq__(self, other): # noqa: D105 + if id(other) == id(self): + result = True + elif not isinstance(other, NcVariable): + result = False + else: + from ncdata.utils import variable_differences + + result = variable_differences(self, other) == [] + return result + class NcAttribute: """ diff --git a/lib/ncdata/utils/__init__.py b/lib/ncdata/utils/__init__.py index a91e9e3..297b3cf 100644 --- a/lib/ncdata/utils/__init__.py +++ b/lib/ncdata/utils/__init__.py @@ -2,11 +2,14 @@ from ._compare_nc_datasets import dataset_differences, variable_differences from ._copy import ncdata_copy +from ._dim_indexing import Slicer, index_by_dimensions from ._rename_dim import rename_dimension from ._save_errors import save_errors __all__ = [ + "Slicer", "dataset_differences", + "index_by_dimensions", "ncdata_copy", "rename_dimension", "save_errors", diff --git a/lib/ncdata/utils/_dim_indexing.py b/lib/ncdata/utils/_dim_indexing.py new file mode 100644 index 0000000..42b94ae --- /dev/null +++ b/lib/ncdata/utils/_dim_indexing.py @@ -0,0 +1,202 @@ +from numbers import Number +from typing import Any, Mapping + +import dask.array as da +import numpy as np +from ncdata import NcData + + +def 
index_by_dimensions( + ncdata: NcData, + **dim_index_kwargs: Mapping[str, Any], +) -> NcData: + r""" + Index an NcData over named dimensions. + + Parameters + ---------- + ncdata: NcData + The input data. + dim_index_kwargs: Mapping[str, Any] + Indexing to apply to named dimensions. + E.G. ``index_by_dimensions(data, x=1)``, + ``index_by_dimensions(data, time=slice(0, 100), levels=[1,2,5])``. + + Returns + ------- + A new copy of 'ncdata', with dimensions and all relevant variables sub-indexed. + + Examples + -------- + .. testsetup:: + >>> from ncdata import NcDimension + >>> from ncdata.utils import index_by_dimensions + >>> data = NcData(dimensions=[NcDimension(nn, 10) for nn in ("time", "levels")]) + + >>> data1 = index_by_dimensions(data, time=slice(0, 10)) # equivalent to [:10] + >>> data2 = index_by_dimensions(data, levels=[1,2,5]) + >>> data3 = index_by_dimensions(data, time=3, levels=slice(2, 10, 3)) + + Notes + ----- + * Where a dimension key is a single value, the dimension will be *removed*. + This mimics how numpy arrays behave, i.e. the difference between a[1] and a[[1]] + or a[1:2]. + + * Supported types of index key are: a single number; a slice; a list of indices or + booleans. A tuple, or one-dimensional array can also be used in place of a list. + + * Key types **not** supported are: Multi-dimensional arrays; ``Ellipsis``; + ``np.newaxis`` / ``None``. + + * A :class:`Slicer` provides the same functionality with a slicing syntax. + + See Also + -------- + :class:`Slicer` + """ + # Start by copying the input : then modify that in-place + ncdata = ncdata.copy() + for dim_name, key in dim_index_kwargs.items(): + # Dimension names must occur in the ncdata. + dimension = ncdata.dimensions.get(dim_name) + if dimension is None: + raise ValueError( + f"Dimension {dim_name!r} is not present in 'ncdata'." + ) + + # Specifically error forbidden key types. 
+ if np.array(key).ndim > 1: + raise ValueError( + f"Key for dimension {dim_name!r} is multi-dimensional: {key}. " + "Multi-dimensional keys are not supported." + ) + elif key is Ellipsis: + raise ValueError( + f'Key for dimension {dim_name!r} is Ellipsis / "...": ' + "Ellipsis is not supported." + ) + elif key is None: + # N.B. np.newaxis *is* None, so one identity test covers both. + # An "in (...)" test would fall back to "==", which is elementwise for array keys + # and so raises "truth value of an array is ambiguous" for valid 1-D array keys. + raise ValueError( + f"Key for dimension {dim_name!r} is np.newaxis / None: " + "New-axis is not supported. " + ) + + # A single value removes the dimension. + remove_dim = isinstance(key, Number) + + # Index the data of all referencing variables + for var in ncdata.variables.values(): + if dim_name in var.dimensions: + # construct a list of slice objects + (i_slicedim,) = [ + i + for i, name in enumerate(var.dimensions) + if name == dim_name + ] + slices = [slice(None) for dim in var.dimensions] + slices[i_slicedim] = key + + # index the data + var.data = var.data[tuple(slices)] + + # also remove the dim, if it will be removed + if remove_dim: + # N.B. can't 'del' a tuple item + var.dimensions = tuple( + dim for dim in var.dimensions if dim != dim_name + ) + + # Remove or reduce the dimension itself. + if remove_dim: + del ncdata.dimensions[dim_name] + else: + # calculate the new dim size, using numpy-like logic + # TODO: there is probably a better way of calculating this ? + new_size = da.zeros(dimension.size)[key].shape[0] + dimension.size = new_size + + return ncdata + + +class Slicer: + """ + An object which can index an NcData over its dimensions. + + This wraps the :meth:`index_by_dimensions` method for convenience, returning an + object which supports the Python extended slicing syntax. + + Examples + -------- + .. 
testsetup:: + >>> from ncdata import NcDimension + >>> from ncdata.utils import index_by_dimensions + >>> data = NcData(dimensions=[NcDimension(nn, 10) for nn in ("time", "levels", "X")]) + + >>> subdata = Slicer(data, "time")[:3] + + >>> ds = Slicer(data, 'levels', 'time') + >>> subdata_2 = ds[:10, :2] + >>> subdata_3 = ds[1, [1,2,4]] + + >>> subdata_4 = Slicer(data)[:3, 1:4] + + Notes + ----- + * A Slicer contains the original `ncdata` and presents it in a "sliceable" form. + Indexing it returns a new NcData, so the original data is unchanged. The Slicer + is also unchanged and can be reused. + + * :meth:`index_by_dimensions` provides the same functionality in a different form. + See there for more exact details of the operation. + + See Also + -------- + :meth:`index_by_dimensions` + """ + + def __init__(self, ncdata: NcData, *dimension_names: tuple[str]): + """ + Create an indexer for an NcData, applying to specific dimensions. + + This can then be indexed to produce a derived (sub-indexed) dataset. + + Parameters + ---------- + ncdata: NcData + Input data to be sliced. + dimension_names: list[str] + Dimension names to which successive index keys will be applied. + If none are given, defaults to ``ncdata.dimensions``. + """ + #: data to be indexed. + self.ncdata = ncdata + if not dimension_names: + dimension_names = ncdata.dimensions # Default to "all dimensions." + #: dimensions to index, in order. + self.dim_names = tuple(dimension_names) + + def __getitem__(self, keys) -> NcData: + """ + Return an indexed portion of self.ncdata. + + Index with 'keys' applied to dimensions in the order ``slicer.dim_names``. + """ + if not isinstance(keys, tuple): + # Single key, e.g. 1, slice(None), [2,3,4], array([2,3]) + # N.B. *otherwise* keys is always a tuple + # A single tuple argument is passed as-is, i.e. 
interprets as multiple keys + keys = [keys] + + n_keys = len(keys) + if len(keys) > len(self.dim_names): + msg = ( + f"Too many index keys ({n_keys}), for the indexing dimensions: " + f"{self.dim_names!r}." + ) + raise ValueError(msg) + + # NB too *few* keys is not a problem, since 'zip' truncates for us. + dim_kwargs = {name: key for name, key in zip(self.dim_names, keys)} + + return index_by_dimensions(self.ncdata, **dim_kwargs) diff --git a/tests/unit/core/test_NcData.py b/tests/unit/core/test_NcData.py index e4fb0ba..fba5e6c 100644 --- a/tests/unit/core/test_NcData.py +++ b/tests/unit/core/test_NcData.py @@ -58,5 +58,61 @@ def test(self, mocker): mocker.patch("ncdata.utils.ncdata_copy", mock_copycall) ncdata = NcData() result = ncdata.copy() - assert mock_copycall.called_once_witk(mocker.call(ncdata)) + assert mock_copycall.call_args_list == [mocker.call(ncdata)] assert result == mock_copied_ncdata + + +class Test_NcData_eq: + # check that == calls dataset_differences + def test(self, mocker): + ncdata1, ncdata2 = [NcData(name) for name in ("data1", "data2")] + called = mocker.patch("ncdata.utils.dataset_differences") + ncdata1 == ncdata2 + assert called.call_args_list == [mocker.call(ncdata1, ncdata2)] + + def test_self_equal(self, mocker): + ds = NcData() + called = mocker.patch("ncdata.utils.dataset_differences") + assert ds == ds + assert called.call_args_list == [] + + def test_badtype_nonequal(self, mocker): + ds = NcData() + called = mocker.patch("ncdata.utils.dataset_differences") + assert ds != 1 + assert called.call_args_list == [] + + +class Test_NcVariable_slicer: + # check that .slicer makes a slice. + def test(self, mocker): + ncdata1 = NcData() + + dim_args = (1, 2, 3) # N.B. 
not actually acceptable for a real usage + mock_return = mocker.sentinel.retval + called = mocker.patch("ncdata.utils.Slicer", return_value=mock_return) + result = ncdata1.slicer(*dim_args) + + assert called.call_args_list == [mocker.call(ncdata1, *dim_args)] + assert result == mock_return + + +class Test_NcVariable_getitem: + # check that data[*keys] calls data.slicer[*keys] + def test(self, mocker): + from ncdata.utils import Slicer + + ncdata1 = NcData() + + dim_keys = (1, 2, 3) # N.B. not actually acceptable for a real usage + mock_slicer = mocker.MagicMock(spec=Slicer) + called = mocker.patch( + "ncdata._core.NcData.slicer", return_value=mock_slicer + ) + result = ncdata1[*dim_keys] + + assert called.call_args_list == [mocker.call()] + assert mock_slicer.__getitem__.call_args_list == [ + mocker.call(dim_keys) + ] + assert result == mock_slicer[dim_keys] diff --git a/tests/unit/core/test_NcVariable.py b/tests/unit/core/test_NcVariable.py index 3965037..34c874e 100644 --- a/tests/unit/core/test_NcVariable.py +++ b/tests/unit/core/test_NcVariable.py @@ -257,3 +257,24 @@ def test_populated(self): ) result = var.copy() self.check_var_iscopy(result, var) + + +class Test_NcVariable_eq: + def test_equal(self, mocker): + # Check that == calls variable_differences + var1, var2 = [NcVariable(name=name) for name in ("x", "y")] + called = mocker.patch("ncdata.utils.variable_differences") + var1 == var2 + assert called.call_args_list == [mocker.call(var1, var2)] + + def test_self_equal(self, mocker): + var = NcVariable(name="x") + called = mocker.patch("ncdata.utils.variable_differences") + assert var == var + assert called.call_args_list == [] + + def test_badtype_nonequal(self, mocker): + var = NcVariable(name="x") + called = mocker.patch("ncdata.utils.variable_differences") + assert var != 1 + assert called.call_args_list == [] diff --git a/tests/unit/utils/dim_indexing/__init__.py b/tests/unit/utils/dim_indexing/__init__.py new file mode 100644 index 0000000..8fda765 
--- /dev/null +++ b/tests/unit/utils/dim_indexing/__init__.py @@ -0,0 +1,151 @@ +"""Unit tests for :mod:`ncdata.utils._dim_indexing`.""" + +import numpy as np +import pytest +from ncdata import NcData, NcDimension, NcVariable + + +def make_dims_testdata(x, y): + """ + Generate a test dataset that we can slice in various ways. + + If 'x' or 'y' are lists (~arrays) then make an X/Y dimension with these values, + otherwise if they are a scalar value then not, mimicking numpy indexing away of a + dimension. + + Also add a separate Z dimension, plus variables mapping every combination of + X,Y,Z and no dimension. + + Also include some Groups and Attributes to ensure preservation of all structure and + metadata. + """ + data = NcData( + dimensions=[NcDimension("Z", 2)], + variables=[ + NcVariable("var_0", [], data=np.array(1.2)), + NcVariable( + "var_Z", + ["Z"], + data=np.linspace(1.3, 2.3, 2), + attributes={"role": "Z dim var"}, + ), + ], + attributes={"global": "common_value"}, + ) + + x, y = [np.asarray(key) for key in (x, y)] + + data.variables.add( + NcVariable( + "yvals", + ["Y"] if y.ndim else [], + data=2.2 * np.ones(y.shape), + attributes={"role": "y values", "units": "ft"}, + ) + ) + if y.ndim: + data.dimensions.add(NcDimension("Y", len(y))) + ydims = ["Y"] + else: + ydims = [] + + data.variables.add( + NcVariable("Y", ydims, data=y, attributes={"role": "Y dim var"}) + ) + + data.variables.add( + NcVariable( + "xvals", + ["X"] if x.ndim else [], + data=3.3 * np.ones(x.shape), + attributes={"role": "x values", "units": "m"}, + ) + ) + if x.ndim: + data.dimensions.add(NcDimension("X", len(x))) + xdims = ["X"] + else: + xdims = [] + + data.variables.add( + NcVariable("X", xdims, data=x, attributes={"role": "X dim var"}) + ) + xydims = ["Z"] + (["Y"] if y.ndim else []) + (["X"] if x.ndim else []) + shape = [2] + ([len(y)] if y.ndim else []) + ([len(x)] if x.ndim else []) + data.variables.add( + NcVariable( + "zyx_vals", + xydims, + data=np.ones(shape), + 
attributes={"role": "data"}, + ) + ) + + return data + + +def make_dims_testdata_1d(x): + """Create 1-D testdata.""" + data = make_dims_testdata(x=x, y=1) + # remove 'Z' dimension altogether. + del data.dimensions["Z"] + # prune away any variables referring to 'Y' or 'Z'. + for varname, var in list(data.variables.items()): + if any(dim in var.dimensions for dim in ("Y", "Z")): + data.variables.pop(varname) + return data + + +class Slicekeys: # noqa: D101 + def __getitem__(self, keys): # noqa: D105 + return keys + + +_SLICEKEYS = Slicekeys() +_NUL = _SLICEKEYS[:] + +# Define one-dimensional testcases, ultimately as the 'x_slices' fixture. +# NB these are used to test both 'index_by_dimensions' and 'Slicer'. +_1D_X_SLICE_OPTS = { + "empty": _NUL, + "single": 1, + "range": _SLICEKEYS[2:4], + "picked": [0, 1, 3], + "openL": _SLICEKEYS[:3], + "openR": _SLICEKEYS[2:], + "minus": -2, + "top2": _SLICEKEYS[-2:], +} +_1D_OPT_NAMES = list(_1D_X_SLICE_OPTS.keys()) +_1D_OPT_VALUES = list(_1D_X_SLICE_OPTS.values()) + + +@pytest.fixture(params=_1D_OPT_VALUES, ids=_1D_OPT_NAMES) +def x_slices(request): + """Fixture yielding 1-D testcases.""" + return request.param + + +# Define two-dimensional testcases, ultimately as the 'xy_slices' fixture. +# NB these are used to test both 'index_by_dimensions' and 'Slicer'. 
+_2D_XY_SLICE_OPTS = { + "empty": (_NUL, _NUL), + "singleX": (1, _NUL), + "singleY": (_NUL, 2), + "rangeX": (_SLICEKEYS[2:4], _NUL), + "rangeY": (_NUL, _SLICEKEYS[1:3]), + "picked": ([0, 1, 3], _NUL), + "openL": (_SLICEKEYS[:3], _NUL), + "openR": (_SLICEKEYS[2:], _NUL), + "dual": (_SLICEKEYS[1:3], _SLICEKEYS[2:]), + "minus": (2, -2), + "top2": (2, _SLICEKEYS[-2:]), +} +_2D_OPT_NAMES = list(_2D_XY_SLICE_OPTS.keys()) +_2D_OPT_VALUES = list(_2D_XY_SLICE_OPTS.values()) + + +@pytest.fixture(params=_2D_OPT_VALUES, ids=_2D_OPT_NAMES) +def xy_slices(request): + """Fixture yielding 2-D testcases.""" + return request.param diff --git a/tests/unit/utils/dim_indexing/test_Slicer.py b/tests/unit/utils/dim_indexing/test_Slicer.py new file mode 100644 index 0000000..882db11 --- /dev/null +++ b/tests/unit/utils/dim_indexing/test_Slicer.py @@ -0,0 +1,73 @@ +"""Tests for class :class:`ncdata.utils.Slicer`. + +This is the "indirect" approach, with a Slicer object. +""" + +import numpy as np +import pytest +from ncdata.utils import Slicer, dataset_differences + +from . import ( # noqa: F401 + make_dims_testdata, + make_dims_testdata_1d, + x_slices, + xy_slices, +) + +# +# These tests apply the same test-cases as "test_index_by_dimensions". +# + + +def test_1d_index(x_slices): # noqa: F811 + x_base = np.arange(5.0) + data = make_dims_testdata_1d(x_base) + + print(x_slices) + result = Slicer(data, "X")[x_slices] + + # Make an expected result from an array indexed with *numpy*, to check equivalence. 
+ expect_x = x_base[x_slices] + expect_data = make_dims_testdata_1d(expect_x) + + assert dataset_differences(result, expect_data) == [] + + +def test_2d_index(xy_slices): # noqa: F811 + x_inds, y_inds = xy_slices + x_base, y_base = np.arange(5.0), np.arange(4.0) + + data = make_dims_testdata(x=x_base, y=y_base) + + print(xy_slices) + result = Slicer(data, "X", "Y")[x_inds, y_inds] + + expect_x = x_base[x_inds] + expect_y = y_base[y_inds] + expect_data = make_dims_testdata(expect_x, expect_y) + + assert dataset_differences(result, expect_data) == [] + + +class TestSlicerDims: + @pytest.fixture(autouse=True) + def setup(self): + self.testdata = make_dims_testdata([1, 2], [3, 4, 5]) + + def test_nodims(self): + # Check indexing with no names dims. + slicer1 = Slicer(self.testdata) + assert slicer1.dim_names == ("Z", "Y", "X") + + def test_fewerdims(self): + slicer1 = Slicer(self.testdata, "X", "Y") + assert slicer1[1] == slicer1[1, :] + + def test_toomanydims_fail(self): + # Check that too many dims causes an error. + msg = ( + r"Too many index keys \(3\), for the indexing dimensions: " + r"\('X', 'Y'\)\." + ) + with pytest.raises(ValueError, match=msg): + Slicer(self.testdata, "X", "Y")[0, 0, 0] diff --git a/tests/unit/utils/dim_indexing/test_index_by_dimensions.py b/tests/unit/utils/dim_indexing/test_index_by_dimensions.py new file mode 100644 index 0000000..a0c2200 --- /dev/null +++ b/tests/unit/utils/dim_indexing/test_index_by_dimensions.py @@ -0,0 +1,99 @@ +"""Tests for class :class:`ncdata.utils.index_by_dimension`. + +This is the more direct indexer approach. +""" + +import numpy as np +import pytest +from ncdata.utils import dataset_differences, index_by_dimensions + +from . 
import ( # noqa: F401 + make_dims_testdata, + make_dims_testdata_1d, + x_slices, + xy_slices, +) + + +def test_1d_index(x_slices): # noqa: F811 + x_base = np.arange(5.0) + data = make_dims_testdata_1d(x_base) + result = index_by_dimensions(data, X=x_slices) + + # Make an expected result from an array indexed with *numpy*, to check equivalence. + expect_x = x_base[x_slices] + expect_data = make_dims_testdata_1d(expect_x) + + assert dataset_differences(result, expect_data) == [] + + +def test_2d_index(xy_slices): # noqa: F811 + x_inds, y_inds = xy_slices + x_base, y_base = np.arange(5.0), np.arange(4.0) + + data = make_dims_testdata(x=x_base, y=y_base) + result = index_by_dimensions(data, X=x_inds, Y=y_inds) + + expect_x = x_base[x_inds] + expect_y = y_base[y_inds] + expect_data = make_dims_testdata(expect_x, expect_y) + + assert dataset_differences(result, expect_data) == [] + + +class TestKwargs: + @pytest.fixture(autouse=True) + def sample_3d(self): + self.sample = make_dims_testdata(np.arange(5), np.arange(4)) + + def test_bad_dimname_fail(self): + msg = "Dimension 'Q' is not present in 'ncdata'." 
+ with pytest.raises(ValueError, match=msg): + index_by_dimensions(self.sample, Q=7) + + +class TestIndexTypes: + @pytest.fixture(autouse=True) + def sample_3d(self): + self.sample = make_dims_testdata(np.arange(5), np.arange(4)) + + def test_slices(self): + res1 = index_by_dimensions( + self.sample, Z=slice(0, 1), Y=slice(None, 2), X=slice(1, None, 2) + ) + assert np.array_equal( + res1.variables["zyx_vals"].data, + self.sample.variables["zyx_vals"].data[0:1, :2, 1::2], + ) + + def test_integer_array(self): + res1 = index_by_dimensions(self.sample, X=[1, 3, 2, 1]) + assert np.array_equal( + res1.variables["zyx_vals"].data, + self.sample.variables["zyx_vals"].data[:, :, [1, 3, 2, 1]], + ) + + def test_boolean_array(self): + res1 = index_by_dimensions( + self.sample, X=[True, False, True, False, False] + ) + res2 = index_by_dimensions(self.sample, X=[0, 2]) + assert dataset_differences(res1, res2) == [] + + def test_multidim_fail(self): + msg = "Key for dimension 'Y' is multi-dimensional: .* not supported." + with pytest.raises(ValueError, match=msg): + index_by_dimensions(self.sample, Y=[[1, 2], [3, 4]]) + + def test_ellipsis_fail(self): + msg = "Key for dimension 'Z' is Ellipsis.* not supported." + with pytest.raises(ValueError, match=msg): + index_by_dimensions(self.sample, Z=...) + + @pytest.mark.parametrize( + "key", [np.newaxis, None], ids=["newaxis", "None"] + ) + def test_newaxis_fail(self, key): + msg = "Key for dimension 'Y' is np.newaxis / None.* not supported." + with pytest.raises(ValueError, match=msg): + index_by_dimensions(self.sample, Y=key)