Skip to content

Commit 0579a7c

Browse files
Merge pull request #167 from csiro-coasts/166-multifile-datasets-dont-work-with-cache-key-generation
Updated hash key generation to handle missing encoding dtype.
2 parents 0bbb470 + 38926af commit 0579a7c

File tree

7 files changed

+44
-2
lines changed

7 files changed

+44
-2
lines changed

docs/releases/development.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
Next release (in development)
33
=============================
44

5-
* ...
5+
* Fix dataset hash key generation when the encoding of a geometry variable
6+
is missing a dtype (:issue:`166`, :pr:`167`).

src/emsarray/conventions/_base.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1973,7 +1973,9 @@ def hash_geometry(self, hash: "hashlib._Hash") -> None:
19731973
# Include the dtype of the data array.
19741974
# A float array and an int array mean very different things,
19751975
# but could have identical byte patterns.
1976-
hash_string(hash, data_array.encoding['dtype'].name)
1976+
# Use the encoding dtype when present; fall back to values.dtype because the
1977+
# encoding dtype can be missing on xarray multifile datasets - https://github.com/pydata/xarray/issues/2436
1978+
hash_string(hash, data_array.encoding.get('dtype', data_array.values.dtype).name)
19771979

19781980
# Include the size and shape of the data.
19791981
# 1D coordinate arrays are very different to 2D coordinate arrays,
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

tests/operations/test_cache.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import pathlib
33

44
import pytest
5+
import xarray
56

67
import emsarray
78
import emsarray.operations.cache
@@ -12,6 +13,8 @@
1213
int_hash = '7b08e025e311c3dfcf5179b67c0fdc08e73de261'
1314
attr_hash_lat = "2cb433979fc2d9c3884eea8569dd6a44406950f3"
1415
cache_key_hash_cf1d_sha1 = "2b006999273225ed70d4810357b6a06e6bebe9a6"
16+
cache_key_hash_multifile_cf2d_sha1 = "ea2d2e6131f1e499f622e83ed4fc2415649def06"
17+
cache_key_hash_multifile_ugrid_mesh2d_sha1 = "1d72e01b159135208324ae9a643166f85aecba27"
1518

1619
# Blake2b
1720
cache_key_hash_cf1d = "1a3226072f08441ee79f727b0775709209ff2965299539c898ecc401cf17e23f"
@@ -200,3 +203,39 @@ def test_cache_key_cfgrid1d_sha1(datasets: pathlib.Path):
200203
assert result_cache_key_cf is not None
201204

202205
assert result_cache_key_cf == cache_key_hash_cf1d_sha1
206+
207+
208+
def test_cache_key_with_multifile_dataset_ugrid_mesh2d(datasets: pathlib.Path):
209+
210+
ugrid_path1 = datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc'
211+
ugrid_path2 = datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc'
212+
213+
dataset_paths = [ugrid_path1, ugrid_path2]
214+
215+
multifile_dataset = xarray.open_mfdataset(dataset_paths, data_vars=['values'])
216+
217+
multifile_ds_hash = hashlib.sha1()
218+
219+
multifile_dataset.ems.hash_geometry(multifile_ds_hash)
220+
221+
multifile_ds_digest = multifile_ds_hash.hexdigest()
222+
223+
assert multifile_ds_digest == cache_key_hash_multifile_ugrid_mesh2d_sha1
224+
225+
226+
def test_cache_key_with_multifile_dataset_cfgrid2d(datasets: pathlib.Path):
227+
228+
cfgrid_path1 = datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc'
229+
cfgrid_path2 = datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc'
230+
231+
dataset_paths = [cfgrid_path1, cfgrid_path2]
232+
233+
multifile_dataset = xarray.open_mfdataset(dataset_paths, data_vars=['values'])
234+
235+
multifile_ds_hash = hashlib.sha1()
236+
237+
multifile_dataset.ems.hash_geometry(multifile_ds_hash)
238+
239+
multifile_ds_digest = multifile_ds_hash.hexdigest()
240+
241+
assert multifile_ds_digest == cache_key_hash_multifile_cf2d_sha1

0 commit comments

Comments
 (0)