Skip to content

Commit c9e257a

Browse files
authored
Merge pull request #158 from csiro-coasts/geometry-hash
Dataset Cache Keys
2 parents 935b939 + 3687141 commit c9e257a

File tree

6 files changed

+412
-0
lines changed

6 files changed

+412
-0
lines changed

docs/api/operations/cache.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
=========================
2+
emsarray.operations.cache
3+
=========================
4+
5+
.. automodule:: emsarray.operations.cache
6+
7+
Functions
8+
=========
9+
10+
.. autofunction:: make_cache_key

docs/api/operations/index.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ and behave the same across all supported conventions.
1313

1414
./*
1515

16+
:doc:`cache`
17+
These operations create hash keys of datasets for use
18+
in caching data.
19+
1620
:doc:`depth`
1721
These operations manipulate datasets with a depth axis,
1822
such as the output of ocean models.

docs/releases/development.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,6 @@ Next release (in development)
1111
but the trade off is worth the added security
1212
after the invalid polygons found in :pr:`154`
1313
(:pr:`156`).
14+
* Added new :mod:`emsarray.operations.cache` module
15+
for generating cache keys based on dataset geometry.
16+
(:issue:`153`, :pr:`158`).

src/emsarray/conventions/_base.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import abc
22
import dataclasses
33
import enum
4+
import hashlib
45
import logging
56
import warnings
67
from collections.abc import Callable, Hashable, Iterable, Sequence
@@ -18,6 +19,7 @@
1819
from emsarray.compat.shapely import SpatialIndex
1920
from emsarray.exceptions import InvalidPolygonWarning, NoSuchCoordinateError
2021
from emsarray.operations import depth, point_extraction
22+
from emsarray.operations.cache import hash_attributes, hash_int, hash_string
2123
from emsarray.plot import (
2224
_requires_plot, animate_on_figure, make_plot_title, plot_on_figure,
2325
polygons_to_collection
@@ -1950,6 +1952,39 @@ def normalize_depth_variables(
19501952
self.dataset, self.depth_coordinates,
19511953
positive_down=positive_down, deep_to_shallow=deep_to_shallow)
19521954

1955+
def hash_geometry(self, hash: "hashlib._Hash") -> None:
    """
    Update the provided hash with all of the relevant geometry data for this dataset.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to update with geometry data.
        This must follow the interface defined in :mod:`hashlib`.
    """
    for name in self.get_all_geometry_names():
        variable = self.dataset[name]

        # The variable name itself is part of the geometry fingerprint.
        hash_string(hash, str(name))

        # Mix in the dtype: a float array and an int array mean very
        # different things, yet could share identical byte patterns.
        hash_string(hash, variable.encoding['dtype'].name)

        # Mix in the size and shape: 1D coordinate arrays are very
        # different to 2D coordinate arrays, but could have identical
        # byte patterns.
        hash_int(hash, variable.size)
        hash.update(numpy.array(variable.shape, dtype='int32').tobytes('C'))

        # The raw values, serialised in C (row-major) order.
        hash.update(variable.to_numpy().tobytes('C'))

        # Finally, the variable attributes.
        hash_attributes(hash, variable.attrs)
1987+
19531988

19541989
class DimensionConvention(Convention[GridKind, Index]):
19551990
"""

src/emsarray/operations/cache.py

Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
"""
2+
Operations for making cache keys based on dataset geometry.
3+
4+
Some operations such as :func:`~.operations.triangulate.triangulate_dataset`
5+
only depend on the dataset geometry and are expensive to compute.
6+
For applications that need to derive data from the dataset geometry
7+
it would be useful if the derived data could be reused between different runs of the same application
8+
or between multiple time slices of the same geometry distributed across multiple files.
9+
This module provides :func:`.make_cache_key` to assist in this process
10+
by deriving a cache key from the important parts of a dataset geometry.
11+
Applications can use this cache key
12+
as part of a filename when saving derived geometry data to disk
13+
or as a key to an in-memory cache of derived geometry.
14+
15+
The derived cache keys will be identical between different instances of an application,
16+
and between different files in multi-file datasets split over an unlimited dimension.
17+
18+
This module does not provide an actual cache implementation.
19+
"""
20+
import hashlib
21+
import marshal
22+
23+
import numpy
24+
import xarray
25+
26+
import emsarray
27+
28+
29+
def hash_attributes(hash: "hashlib._Hash", attributes: dict) -> None:
    """
    Adds the contents of an :attr:`attributes dictionary <xarray.DataArray.attrs>`
    to a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to add the attribute dictionary to.
        This must follow the interface defined in :mod:`hashlib`.
    attributes : dict
        A dictionary of attributes from a :class:`~xarray.Dataset` or :class:`~xarray.DataArray`.

    Notes
    -----
    The attribute dictionary is serialized to bytes using :func:`marshal.dumps`.
    This is an implementation detail that may change in future releases.
    """
    # Pin the marshal format version so digests stay stable across
    # Python releases, and record it in the digest itself.
    version = 4
    hash_int(hash, version)

    # Serialise using that explicit marshal version.
    serialized = marshal.dumps(attributes, version)

    # Length-prefix both the attribute count and the serialised payload
    # to prevent crafted inputs producing overlapping digests.
    hash_int(hash, len(attributes))
    hash_int(hash, len(serialized))
    hash.update(serialized)
57+
58+
59+
def hash_string(hash: "hashlib._Hash", value: str) -> None:
    """
    Adds a :class:`string <str>` to a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to add the string to.
        This must follow the interface defined in :mod:`hashlib`.
    value : str
        Any unicode string.

    Notes
    -----
    The string is UTF-8 encoded as part of being added to the hash.
    This is an implementation detail that may change in future releases.
    """
    encoded = value.encode('utf-8')
    # Length-prefix with the character count so adjacent strings can not
    # be crafted to produce overlapping hashes (e.g. "ab"+"c" vs "a"+"bc").
    hash_int(hash, len(value))
    hash.update(encoded)
80+
81+
82+
def hash_int(hash: "hashlib._Hash", value: int) -> None:
    """
    Adds an :class:`int` to a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to add the integer to.
        This must follow the interface defined in :mod:`hashlib`.
    value : int
        Any int representable as an :data:`numpy.int32`.

    Raises
    ------
    OverflowError
        If ``value`` is outside the range of an int32.

    Notes
    -----
    The int is cast to a :data:`numpy.int32` as part of being added to the hash.
    This is an implementation detail that may change in the future
    if larger integers are required.
    """
    # Check the range explicitly before converting: older numpy versions
    # do not reliably raise on out-of-range conversion, and the guard
    # clause means the conversion below can never overflow.
    int32_info = numpy.iinfo("int32")
    if not (int32_info.min <= value <= int32_info.max):
        raise OverflowError(
            f"Value {value!r} can not be represented as an int32")
    hash.update(numpy.int32(value).tobytes())
106+
107+
108+
def make_cache_key(dataset: xarray.Dataset, hash: "hashlib._Hash | None" = None) -> str:
    """
    Derive a cache key from the geometry of a dataset.

    Parameters
    ----------
    dataset : xarray.Dataset
        The dataset to generate a cache key from.
    hash : :mod:`hashlib`-compatible hash instance, optional
        An instance of a hashlib hash class.
        Defaults to :func:`hashlib.blake2b` with a digest size of 32,
        which is secure enough and fast enough for most purposes.

    Returns
    -------
    cache_key : str
        A string suitable for use as a cache key.
        The string will be safe for use as part of a filename if data is to be cached to disk.

    Examples
    --------

    .. code-block:: python

        import emsarray
        from emsarray.operations.cache import make_cache_key

        # Make a cache key from the dataset
        dataset = emsarray.tutorial.open_dataset("austen")
        cache_key = make_cache_key(dataset)

    >>> cache_key
    '580853c44e732878937598e86d0b26cb81e18d986072c0790a122244e9d3f480'

    Notes
    -----
    The cache key will depend on the Convention class,
    the emsarray version, and a hash of the geometry of the dataset.
    The specific structure of the cache key may change between emsarray and python versions,
    and should not be relied upon.
    """
    if hash is None:
        hash = hashlib.blake2b(digest_size=32)

    # The convention implementation decides which variables constitute
    # the dataset geometry and feeds them into the hash.
    dataset.ems.hash_geometry(hash)

    # Include the convention module path, class name and emsarray version
    # so cache keys are invalidated when the implementation changes.
    hash_string(hash, dataset.ems.__class__.__module__)
    hash_string(hash, dataset.ems.__class__.__name__)
    hash_string(hash, emsarray.__version__)

    # Hex digests contain only [0-9a-f], so are filename-safe.
    return hash.hexdigest()

0 commit comments

Comments
 (0)