Skip to content

Commit 452ded8

Browse files
committed
WIP - untested.
1 parent 08f0161 commit 452ded8

File tree

3 files changed

+216
-1
lines changed

3 files changed

+216
-1
lines changed

lib/ncdata/utils/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""General user utility functions."""
22

3+
from ._copy import ncdata_copy
4+
from ._dim_indexing import Slicer, index_by_dimensions
35
from ._save_errors import save_errors
46

5-
__all__ = ["save_errors"]
7+
__all__ = ["Slicer", "index_by_dimensions", "ncdata_copy", "save_errors"]

lib/ncdata/utils/_copy.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""Utility to copy NcData objects, but not copying any contained data arrays."""
2+
3+
from ncdata import NameMap, NcAttribute, NcData, NcDimension, NcVariable
4+
5+
6+
def _attributes_copy(attrs: NameMap) -> NameMap:
    """
    Build a new attributes map containing fresh NcAttribute objects.

    Each new attribute takes the name and value of the corresponding input
    attribute.  The values themselves are passed on as-is, not duplicated.

    Parameters
    ----------
    attrs
        the attributes map to copy

    Returns
    -------
    attrs
        a new map, built with ``NameMap.from_items``, of new attribute objects
    """
    fresh_attributes = []
    for original in attrs.values():
        fresh_attributes.append(
            NcAttribute(name=original.name, value=original.value)
        )
    return NameMap.from_items(fresh_attributes, item_type=NcAttribute)
14+
15+
16+
def ncdata_copy(ncdata: NcData) -> NcData:
    """
    Return a copy of the data.

    The operation makes fresh copies of all ncdata objects, but does not copy arrays in
    either variable data or attribute values.

    Sub-groups are copied recursively.

    Parameters
    ----------
    ncdata
        data to copy

    Returns
    -------
    ncdata
        identical but distinct copy of input

    """
    return NcData(
        # The name must be carried over : without it the copy is anonymous, and
        # copied sub-groups (passed below as a list) have no name to be keyed by.
        name=ncdata.name,
        attributes=_attributes_copy(ncdata.attributes),
        dimensions=[
            NcDimension(dim.name, size=dim.size, unlimited=dim.unlimited)
            for dim in ncdata.dimensions.values()
        ],
        variables=[
            NcVariable(
                name=var.name,
                dimensions=var.dimensions,
                dtype=var.dtype,
                # N.B. the data array is deliberately shared, not copied.
                data=var.data,
                attributes=_attributes_copy(var.attributes),
                # NOTE(review): this passes on the *original* variable's 'group'
                # reference -- confirm whether the copy should refer to its new
                # parent instead.
                group=var.group,
            )
            for var in ncdata.variables.values()
        ],
        groups=[ncdata_copy(group) for group in ncdata.groups.values()],
    )

lib/ncdata/utils/_dim_indexing.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
from numbers import Number
2+
from typing import Any, List, Mapping, Union
3+
4+
import dask.array as da
5+
6+
from ncdata import NcData
7+
from ncdata.utils import ncdata_copy
8+
9+
10+
def index_by_dimensions(ncdata: NcData, **dim_index_kwargs: Any) -> NcData:
    """
    Index an NcData over dimensions.

    Parameters
    ----------
    ncdata
        input data
    dim_index_kwargs
        specify indexing to apply to dimensions.
        E.G. ``x=1``, ``time=slice(0, 100)``, ``levels=[1,2,5]``.

    Returns
    -------
        copy of input with dimensions, and all relevant variables, sub-indexed.

    Raises
    ------
    ValueError
        if a named dimension is not present in ``ncdata``, or if a variable uses an
        indexed dimension more than once.

    Notes
    -----
    Where a dimension key is a single value, the dimension will be *removed*.
    This mimics how numpy arrays behave, i.e. the difference between a[1] and a[1:2]

    Only the dimensions and variables of ``ncdata`` itself are indexed here :
    those of any sub-groups are not visited.

    Examples
    --------
    ncdata = index_by_dimensions(ncdata, time=slice(0, 10))  # equivalent to [:10]
    ncdata = index_by_dimensions(ncdata, levels=[1,2,5])
    ncdata = index_by_dimensions(ncdata, time=3, levels=slice(2, 10, 3))

    See Also
    --------
    :class:`Slicer` provides the same function with a slicing syntax
    """
    # Start by copying the input : then modify that in-place
    ncdata = ncdata_copy(ncdata)

    # N.B. keyword names are necessarily unique, so there is no need to check for a
    # dimension being specified more than once.
    for dim_name, key in dim_index_kwargs.items():
        # Dimension names must occur in the ncdata.
        dimension = ncdata.dimensions.get(dim_name)
        if dimension is None:
            raise ValueError(
                f"Dimension {dim_name!r} is not present in 'ncdata'."
            )

        # Hopefully this replicates how numpy makes this decision?
        # N.B. this test also accepts booleans, since 'bool' is a Number.
        remove_dim = isinstance(key, Number)

        # TODO:
        #  Key types must be supported:
        #   * int (or other numeric, including numpy scalars ?)
        #   * list of int
        #   * slice object
        #   * 1-D array of numeric
        #  Key "special" types we could possibly error or convert, to avoid confusion
        #  with numpy behaviours ? :
        #   arrays, tuples, booleans, None, newaxis, ellipsis ...

        # Index the data of all referencing variables
        for var in ncdata.variables.values():
            dim_positions = [
                i for i, name in enumerate(var.dimensions) if name == dim_name
            ]
            if not dim_positions:
                # This variable does not use the dimension : leave it alone.
                continue
            if len(dim_positions) > 1:
                raise ValueError(
                    f"Variable {var.name!r} uses dimension {dim_name!r} more than "
                    "once, so it cannot be indexed by that dimension."
                )
            (i_slicedim,) = dim_positions

            if var.data is not None:
                # Index the data, with the key in the relevant position and a
                # "take everything" slice in all the others.
                slices = [slice(None)] * len(var.dimensions)
                slices[i_slicedim] = key
                var.data = var.data[tuple(slices)]

            # also remove the dim, if it will be removed
            if remove_dim:
                # Rebuild by *position* (the dimensions are a sequence of names),
                # assigning a new tuple rather than modifying the existing object.
                var.dimensions = tuple(
                    name
                    for i, name in enumerate(var.dimensions)
                    if i != i_slicedim
                )

        # Remove or reduce the dimension itself.
        if remove_dim:
            del ncdata.dimensions[dim_name]
        else:
            # calculate the new dim size, using numpy-like logic
            # TODO: there is probably a better way of calculating this ?
            new_size = da.zeros(dimension.size)[key].shape[0]
            dimension.size = new_size

    return ncdata
104+
105+
106+
class Slicer:
    """
    An object which can index an NcData over its dimensions.

    This wraps the :func:`index_by_dimensions` function for convenience, returning an
    object which supports the Python extended slicing syntax.

    Examples
    --------
    data = Slicer(ncdata, 'time')[:10]
    data = Slicer(ncdata, 'level')[[1, 2, 5]]
    data = Slicer(ncdata, ['level', 'time', 'x', 'y'])[1, :3, 2:10:3, ::-1]
    """

    def __init__(self, ncdata: NcData, dimensions: Union[str, List[str]]):
        """
        Create an indexer for an NcData, applying to specific dimensions.

        This can then be indexed to produce a derived (sub-indexed) dataset.

        Parameters
        ----------
        ncdata
            input data
        dimensions
            one or more dimension names, to which successive index keys will be applied
        """
        self.ncdata = ncdata
        if isinstance(dimensions, str):
            # A single name : treat as a list of one, *not* as a sequence of characters
            dimensions = [dimensions]
        self.dim_names = tuple(dimensions)

    def __getitem__(self, keys) -> NcData:
        """
        Return an indexed portion of self.ncdata.

        Index with 'keys' in the specified dimensions.

        Parameters
        ----------
        keys
            a single index key, or a tuple of keys, which are applied in order to
            the dimensions named at creation.  Where there are fewer keys than
            dimension names, the remaining dimensions are not indexed.

        Returns
        -------
            the result of calling ``index_by_dimensions`` on ``self.ncdata``

        Raises
        ------
        ValueError
            if there are more keys than dimension names.
        """
        if not isinstance(keys, tuple):
            # Single key, e.g. 1, slice(None), [2,3,4], array([2,3])
            # N.B. *otherwise* keys is always a tuple
            # A single tuple argument is passed as-is, i.e. interprets as multiple keys
            keys = (keys,)

        n_keys = len(keys)
        if n_keys > len(self.dim_names):
            msg = (
                f"Too many index keys, {n_keys}, for the specified indexing dimension "
                f"names, {self.dim_names!r}."
            )
            raise ValueError(msg)

        # NB too *few* keys is not a problem, since 'zip' truncates for us.
        dim_kwargs = dict(zip(self.dim_names, keys))

        return index_by_dimensions(self.ncdata, **dim_kwargs)

0 commit comments

Comments
 (0)