|
| 1 | +""" |
| 2 | +Operations for making cache keys based on dataset geometry. |
| 3 | +
|
| 4 | +Some operations such as :func:`~.operations.triangulate.triangulate_dataset` |
| 5 | +only depend on the dataset geometry and are expensive to compute. |
| 6 | +For applications that need to derive data from the dataset geometry |
| 7 | +it would be useful if the derived data could be reused between different runs of the same application |
| 8 | +or between multiple time slices of the same geometry distributed across multiple files. |
| 9 | +This module provides :func:`.make_cache_key` to assist in this process |
| 10 | +by deriving a cache key from the important parts of a dataset geometry. |
| 11 | +Applications can use this cache key |
| 12 | +as part of a filename when saving derived geometry data to disk |
| 13 | +or as a key to an in-memory cache of derived geometry. |
| 14 | +
|
| 15 | +The derived cache keys will be identical between different instances of an application, |
| 16 | +and between different files in multi-file datasets split over an unlimited dimension. |
| 17 | +
|
| 18 | +This module does not provide an actual cache implementation. |
| 19 | +""" |
| 20 | +import hashlib |
| 21 | +import marshal |
| 22 | + |
| 23 | +import numpy |
| 24 | +import xarray |
| 25 | + |
| 26 | +import emsarray |
| 27 | + |
| 28 | + |
def hash_attributes(hash: "hashlib._Hash", attributes: dict) -> None:
    """
    Feed an attribute dictionary into a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to update.
        This must follow the interface defined in :mod:`hashlib`.
    attributes : dict
        An :attr:`attributes dictionary <xarray.DataArray.attrs>`
        taken from a :class:`~xarray.Dataset` or :class:`~xarray.DataArray`.

    Notes
    -----
    The dictionary is turned into bytes with :func:`marshal.dumps`
    using a fixed marshal format version.
    The hash receives, in order: the marshal format version,
    the number of attributes, the length of the serialized bytes,
    and finally the serialized bytes themselves.
    This layout is an implementation detail that may change in future releases.
    """
    marshal_format = 4

    # The format version is hashed first so the digest records
    # which marshal encoding produced the bytes that follow.
    hash_int(hash, marshal_format)

    # Serialise with the pinned format rather than the interpreter default.
    # NOTE(review): marshal formats 3 and above track shared object references,
    # so the output bytes may not depend on the attribute values alone -
    # confirm that keys stay stable between processes.
    payload = marshal.dumps(attributes, marshal_format)

    # Frame the payload with the attribute count and the marshalled byte length.
    hash_int(hash, len(attributes))
    hash_int(hash, len(payload))
    hash.update(payload)
| 57 | + |
| 58 | + |
def hash_string(hash: "hashlib._Hash", value: str) -> None:
    """
    Feed a :class:`string <str>` into a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to update.
        This must follow the interface defined in :mod:`hashlib`.
    value : str
        Any unicode string.

    Notes
    -----
    The hash receives the length of the string in characters,
    followed by the UTF-8 encoding of the string.
    This is an implementation detail that may change in future releases.
    """
    # The length goes in ahead of the text so that adjacent strings
    # cannot be shifted across their boundary to produce the same input,
    # which guards against malicious datasets crafting overlapping hashes.
    character_count = len(value)
    hash_int(hash, character_count)

    utf8_bytes = value.encode('utf-8')
    hash.update(utf8_bytes)
| 80 | + |
| 81 | + |
def hash_int(hash: "hashlib._Hash", value: int) -> None:
    """
    Adds an :class:`int` to a hash.

    Parameters
    ----------
    hash : hashlib-style hash instance
        The hash instance to add the integer to.
        This must follow the interface defined in :mod:`hashlib`.
    value : int
        Any int representable as an :data:`numpy.int32`

    Raises
    ------
    OverflowError
        If `value` is outside the range of a :data:`numpy.int32`.
        Nothing is added to the hash in this case.

    Notes
    -----
    The int is added to the hash as a four byte, little-endian,
    two's complement signed integer.
    The byte order is fixed explicitly so that the same value
    produces the same digest regardless of the byte order of the host.
    This is an implementation detail that may change in the future
    if larger integers are required.
    """
    bounds = numpy.iinfo(numpy.int32)

    # Check the range by hand: older numpy versions silently wrap
    # out-of-range values instead of raising an exception.
    if not (bounds.min <= value <= bounds.max):
        raise OverflowError(
            f"Can not hash {value!r}: value must be between "
            f"{bounds.min} and {bounds.max} inclusive"
        )

    # '<i4' pins the encoding to little-endian. A bare numpy.int32 would be
    # serialised in the native byte order, making digests platform dependent.
    hash.update(numpy.array(value, dtype="<i4").tobytes())
| 106 | + |
| 107 | + |
def make_cache_key(dataset: xarray.Dataset, hash: "hashlib._Hash | None" = None) -> str:
    """
    Build a cache key that identifies the geometry of a dataset.

    Parameters
    ----------
    dataset : xarray.Dataset
        The dataset to generate a cache key from.
    hash : :mod:`hashlib`-compatible hash instance, optional
        The hash instance that accumulates the key material.
        When omitted, :func:`hashlib.blake2b` with a digest size of 32 is used,
        which is secure enough and fast enough for most purposes.

    Returns
    -------
    cache_key : str
        The hexadecimal digest of the hash.
        Being made only of hexadecimal digits, the string is safe to use
        as part of a filename if data is to be cached to disk.

    Examples
    --------

    .. code-block:: python

        import emsarray
        from emsarray.operations.cache import make_cache_key

        # Make a cache key from the dataset
        dataset = emsarray.tutorial.open_dataset("austen")
        cache_key = make_cache_key(dataset)
        >>> cache_key
        '580853c44e732878937598e86d0b26cb81e18d986072c0790a122244e9d3f480'

    Notes
    -----
    The cache key depends on the geometry of the dataset
    as hashed by the dataset convention,
    the module and name of the Convention class,
    and the emsarray version.
    The specific structure of the cache key may change
    between emsarray and python versions, and should not be relied upon.
    """
    hasher = hashlib.blake2b(digest_size=32) if hash is None else hash

    # The convention decides which parts of the dataset make up its geometry.
    dataset.ems.hash_geometry(hasher)

    # Mix in the convention module path, the convention name,
    # and the emsarray version, in that order.
    convention_class = dataset.ems.__class__
    identifiers = (
        convention_class.__module__,
        convention_class.__name__,
        emsarray.__version__,
    )
    for identifier in identifiers:
        hash_string(hasher, identifier)

    return hasher.hexdigest()
0 commit comments