Skip to content

Commit f458020

Browse files
authored
Add encoder, decoder for MultiIndexes. (#321)
1 parent 244cc26 commit f458020

File tree

8 files changed

+234
-1
lines changed

8 files changed

+234
-1
lines changed

cf_xarray/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
from .accessor import CFAccessor # noqa
2+
from .coding import ( # noqa
3+
decode_compress_to_multi_index,
4+
encode_multi_index_as_compress,
5+
)
26
from .geometry import cf_to_shapely, shapely_to_cf # noqa
37
from .helpers import bounds_to_vertices, vertices_to_bounds # noqa
48
from .options import set_options # noqa

cf_xarray/coding.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
"""
2+
Encoders and decoders for CF conventions not implemented by Xarray.
3+
"""
4+
import numpy as np
5+
import pandas as pd
6+
import xarray as xr
7+
8+
9+
def encode_multi_index_as_compress(ds, idxnames=None):
    """
    Encode a MultiIndexed dimension using the "compression by gathering" CF convention.

    Each MultiIndex is replaced by its levels (stored as coordinate variables
    named after the levels) plus an integer variable holding the flattened
    (raveled) position of every entry in the grid spanned by those levels.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with at least one MultiIndexed dimension.
    idxnames : hashable or iterable of hashable, optional
        Dimensions that are MultiIndex-ed. If None, will detect all MultiIndex-ed dimensions.

    Returns
    -------
    xarray.Dataset
        Encoded Dataset with every name in ``idxnames`` as an integer coordinate
        carrying a ``"compress"`` attribute that lists the level names.

    Raises
    ------
    ValueError
        If there is no MultiIndex-ed dimension to encode, if a requested name is
        not backed by a pandas MultiIndex, or if the variable already carries a
        ``"compress"`` entry in its ``attrs`` or ``encoding``.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    if idxnames is None:
        idxnames = tuple(
            name
            for name, idx in ds.indexes.items()
            if isinstance(idx, pd.MultiIndex)
            # After the flexible indexes refactor, all MultiIndex Levels
            # have a MultiIndex but the name won't match.
            # Prior to that refactor, there is only a single MultiIndex with name=None
            and (idx.name == name if idx.name is not None else True)
        )
    elif isinstance(idxnames, str) or not hasattr(idxnames, "__iter__"):
        # A single dimension name (string or any other non-iterable hashable).
        idxnames = (idxnames,)
    else:
        # The names are traversed more than once below, so materialize
        # one-shot iterables (e.g. generators) up front.
        idxnames = tuple(idxnames)

    if not idxnames:
        raise ValueError("No MultiIndex-ed dimensions found in Dataset.")

    # Validate everything before doing any work. The "compress" check looks at
    # the *source* variable: the integer variable created below is brand new,
    # so inspecting its encoding could never detect a user-provided entry.
    for idxname in idxnames:
        if not isinstance(ds.indexes.get(idxname), pd.MultiIndex):
            raise ValueError(f"{idxname!r} is not a MultiIndex-ed dimension.")
        source = ds[idxname]
        if "compress" in source.encoding or "compress" in source.attrs:
            raise ValueError(
                f"Does not support the 'compress' attribute in {idxname}.encoding or {idxname}.attrs. "
                "This is generated automatically."
            )

    encoded = ds.reset_index(idxnames)
    for idxname in idxnames:
        mindex = ds.indexes[idxname]
        # Store the unique values of every level as a coordinate variable.
        encoded.update(dict(zip(mindex.names, mindex.levels)))
        # Flattened position of each entry in the (level0 x level1 x ...) grid.
        encoded[idxname] = np.ravel_multi_index(mindex.codes, mindex.levshape)
        # Work on a copy so the attributes of ``ds`` are never touched.
        attrs = dict(ds[idxname].attrs)
        attrs["compress"] = " ".join(mindex.names)
        encoded[idxname].attrs = attrs
    return encoded
62+
63+
64+
def decode_compress_to_multi_index(encoded, idxnames=None):
    """
    Decode a compressed variable to a pandas MultiIndex.

    Parameters
    ----------
    encoded : xarray.Dataset
        Encoded Dataset with variables that use "compression by gathering".
    idxnames : hashable or iterable of hashable, optional
        Variable names that represents a compressed dimension. These variables must have
        the attribute ``"compress"``. If None, will detect all indexes with a ``"compress"``
        attribute and decode those.

    Returns
    -------
    xarray.Dataset
        Decoded Dataset with every name in ``idxnames`` as a MultiIndexed dimension.
        Only data variables that have one of the decoded dimensions are carried
        over; they are rebuilt from their data and ``attrs``.

    Raises
    ------
    ValueError
        If ``encoded`` is not a Dataset, or if a requested variable has no
        ``"compress"`` attribute.

    References
    ----------
    CF conventions on `compression by gathering <http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering>`_
    """
    # Check the input type first, before ``encoded`` is indexed in any way.
    if not isinstance(encoded, xr.Dataset):
        raise ValueError(f"Must provide a Dataset. Received {type(encoded)} instead.")

    if idxnames is None:
        idxnames = tuple(
            name for name in encoded.indexes if "compress" in encoded[name].attrs
        )
    elif isinstance(idxnames, str) or not hasattr(idxnames, "__iter__"):
        # A single variable name (string or any other non-iterable hashable).
        idxnames = (idxnames,)
    else:
        idxnames = tuple(idxnames)

    decoded = xr.Dataset()
    for idxname in idxnames:
        if "compress" not in encoded[idxname].attrs:
            raise ValueError("Attribute 'compress' not found in provided Dataset.")

        # Copy so that ``encoded`` keeps its "compress" attribute.
        attrs = dict(encoded[idxname].attrs)
        # "compress" is a blank-separated list of the compressed dimensions;
        # split() without an argument tolerates repeated or stray whitespace.
        names = attrs.pop("compress").split()
        shape = [encoded.sizes[dim] for dim in names]
        # Use the variable being decoded rather than a hard-coded variable name.
        indices = np.unravel_index(encoded[idxname].data, shape)
        arrays = [encoded[dim].data[index] for dim, index in zip(names, indices)]
        mindex = pd.MultiIndex.from_arrays(arrays, names=names)

        decoded.coords[idxname] = mindex
        decoded.coords[idxname].attrs = attrs

        for varname in encoded.data_vars:
            if idxname in encoded[varname].dims:
                decoded[varname] = (
                    idxname,
                    encoded[varname].data,
                    encoded[varname].attrs,
                )
    return decoded

cf_xarray/tests/test_coding.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import numpy as np
2+
import pandas as pd
3+
import pytest
4+
import xarray as xr
5+
6+
import cf_xarray as cfxr
7+
8+
9+
@pytest.mark.parametrize(
    "mindex",
    [
        pd.MultiIndex.from_product([["a", "b"], [1, 2]], names=("lat", "lon")),
        pd.MultiIndex.from_arrays(
            [["a", "b", "c", "d"], [1, 2, 4, 10]], names=("lat", "lon")
        ),
        pd.MultiIndex.from_arrays(
            [["a", "b", "b", "a"], [1, 2, 1, 2]], names=("lat", "lon")
        ),
    ],
)
@pytest.mark.parametrize("idxnames", ["landpoint", ("landpoint",), None])
def test_compression_by_gathering_multi_index_roundtrip(mindex, idxnames):
    """Encoding then decoding a MultiIndex must give back an identical Dataset."""
    soil = ("landpoint", np.random.randn(4), {"foo": "bar"})
    point = ("landpoint", mindex, {"long_name": "land point number"})
    original = xr.Dataset({"landsoilt": soil}, {"landpoint": point})

    packed = cfxr.encode_multi_index_as_compress(original, idxnames)
    unpacked = cfxr.decode_compress_to_multi_index(packed, idxnames)
    assert "compress" not in unpacked["landpoint"].encoding
    xr.testing.assert_identical(unpacked, original)

    # A pre-existing "compress" attribute must be rejected by the encoder.
    original["landpoint"].attrs["compress"] = "lat lon"
    with pytest.raises(ValueError):
        cfxr.encode_multi_index_as_compress(original, idxnames)

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ Top-level API
1414
shapely_to_cf
1515
cf_to_shapely
1616
set_options
17+
encode_multi_index_as_compress
18+
decode_compress_to_multi_index
1719

1820
.. currentmodule:: xarray
1921

doc/coding.md

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
---
2+
jupytext:
3+
text_representation:
4+
format_name: myst
5+
kernelspec:
6+
display_name: Python 3
7+
name: python3
8+
---
9+
```{eval-rst}
10+
.. currentmodule:: cf_xarray
11+
```
12+
```{code-cell}
13+
---
14+
tags: [remove-cell]
15+
---
16+
import cf_xarray as cfxr
17+
import numpy as np
18+
import pandas as pd
19+
import xarray as xr
20+
xr.set_options(display_expand_data=False)
21+
```
22+
23+
24+
# Encoding and decoding
25+
26+
`cf_xarray` aims to support encoding and decoding variables using CF conventions not yet implemented by Xarray.
27+
28+
## Compression by gathering
29+
30+
The ["compression by gathering"](http://cfconventions.org/Data/cf-conventions/cf-conventions-1.8/cf-conventions.html#compression-by-gathering)
31+
convention could be used for either {py:class}`pandas.MultiIndex` objects or `pydata/sparse` arrays.
32+
33+
### MultiIndex
34+
35+
``cf_xarray`` provides {py:func}`encode_multi_index_as_compress` and {py:func}`decode_compress_to_multi_index` to encode MultiIndex-ed
36+
dimensions using "compression by gathering".
37+
38+
Here's a test dataset
39+
```{code-cell}
40+
ds = xr.Dataset(
41+
{"landsoilt": ("landpoint", np.random.randn(4), {"foo": "bar"})},
42+
{
43+
"landpoint": pd.MultiIndex.from_product(
44+
[["a", "b"], [1, 2]], names=("lat", "lon")
45+
)
46+
},
47+
)
48+
ds
49+
```
50+
First encode (note the `"compress"` attribute on the `landpoint` variable)
51+
```{code-cell}
52+
encoded = cfxr.encode_multi_index_as_compress(ds, "landpoint")
53+
encoded
54+
```
55+
56+
At this point, we can write `encoded` to a CF-compliant dataset using {py:meth}`xarray.Dataset.to_netcdf` for example.
57+
After reading that file, decode using
58+
```{code-cell}
59+
decoded = cfxr.decode_compress_to_multi_index(encoded, "landpoint")
60+
decoded
61+
```
62+
63+
We roundtrip perfectly
64+
```{code-cell}
65+
ds.identical(decoded)
66+
```
67+
68+
### Sparse arrays
69+
70+
This is unsupported currently but a pull request is welcome!

doc/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@
251251
intersphinx_mapping = {
252252
"python": ("https://docs.python.org/3/", None),
253253
"xarray": ("https://xarray.pydata.org/en/stable/", None),
254+
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
254255
}
255256

256257
autosummary_generate = True

doc/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ or using ``conda``
5050
units
5151
parametricz
5252
bounds
53+
coding
5354
dsg
5455
geometry
5556
plotting

doc/whats-new.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@ What's New
55

66
v 0.7.1 (unreleased)
77
====================
8+
- added encoder and decoder for writing pandas MultiIndex-es to file using "compression by gathering".
9+
See :doc:`coding` for more. By `Deepak Cherian`_.
810
- added another type of vertical coordinate to decode: ``ocean_sigma_coordinate``. By `Kristen Thyng`_.
911

10-
1112
v0.7.0 (January 24, 2022)
1213
=========================
1314
- Many improvements to autoguessing for plotting. By `Deepak Cherian`_

0 commit comments

Comments
 (0)