diff --git a/docs/releases/development.rst b/docs/releases/development.rst index b95e61fd..470491ae 100644 --- a/docs/releases/development.rst +++ b/docs/releases/development.rst @@ -2,4 +2,5 @@ Next release (in development) ============================= -* ... +* Fix datasets hash_key generation when geometry encoding + is missing a dtype (:issue:`166`, :pr:`167`). diff --git a/src/emsarray/conventions/_base.py b/src/emsarray/conventions/_base.py index a6ced9b9..e35c38f1 100644 --- a/src/emsarray/conventions/_base.py +++ b/src/emsarray/conventions/_base.py @@ -1973,7 +1973,9 @@ def hash_geometry(self, hash: "hashlib._Hash") -> None: # Include the dtype of the data array. # A float array and an int array mean very different things, # but could have identical byte patterns. - hash_string(hash, data_array.encoding['dtype'].name) + # Checking for encoding dtype and falling back to values.dtype due to + # xarray multifile dataset bug - https://github.com/pydata/xarray/issues/2436 + hash_string(hash, data_array.encoding.get('dtype', data_array.values.dtype).name) # Include the size and shape of the data. 
# 1D coordinate arrays are very different to 2D coordinate arrays, diff --git a/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc b/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc new file mode 100644 index 00000000..83fb6f69 Binary files /dev/null and b/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc differ diff --git a/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc b/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc new file mode 100644 index 00000000..4004959e Binary files /dev/null and b/tests/datasets/multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc differ diff --git a/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc b/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc new file mode 100644 index 00000000..92f155de Binary files /dev/null and b/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc differ diff --git a/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc b/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc new file mode 100644 index 00000000..b22e1612 Binary files /dev/null and b/tests/datasets/multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc differ diff --git a/tests/operations/test_cache.py b/tests/operations/test_cache.py index 7883ace4..48ae228e 100644 --- a/tests/operations/test_cache.py +++ b/tests/operations/test_cache.py @@ -2,6 +2,7 @@ import pathlib import pytest +import xarray import emsarray import emsarray.operations.cache @@ -12,6 +13,8 @@ int_hash = '7b08e025e311c3dfcf5179b67c0fdc08e73de261' attr_hash_lat = "2cb433979fc2d9c3884eea8569dd6a44406950f3" cache_key_hash_cf1d_sha1 = "2b006999273225ed70d4810357b6a06e6bebe9a6" +cache_key_hash_multifile_cf2d_sha1 = "ea2d2e6131f1e499f622e83ed4fc2415649def06" +cache_key_hash_multifile_ugrid_mesh2d_sha1 = "1d72e01b159135208324ae9a643166f85aecba27" # Blake2b cache_key_hash_cf1d = 
"1a3226072f08441ee79f727b0775709209ff2965299539c898ecc401cf17e23f" @@ -200,3 +203,41 @@ def test_cache_key_cfgrid1d_sha1(datasets: pathlib.Path): assert result_cache_key_cf is not None assert result_cache_key_cf == cache_key_hash_cf1d_sha1
+
+
+def test_cache_key_with_multifile_dataset_ugrid_mesh2d(datasets: pathlib.Path):
+    """Hash the geometry of the two-file ugrid_mesh2d fixture opened via open_mfdataset."""
+    ugrid_path1 = datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc'
+    ugrid_path2 = datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc'
+
+    dataset_paths = [ugrid_path1, ugrid_path2]
+
+    multifile_ds_hash = hashlib.sha1()
+
+    # Open the dataset as a context manager so the underlying files are
+    # closed again even if hashing the geometry raises.
+    with xarray.open_mfdataset(dataset_paths, data_vars=['values']) as multifile_dataset:
+        multifile_dataset.ems.hash_geometry(multifile_ds_hash)
+
+    multifile_ds_digest = multifile_ds_hash.hexdigest()
+
+    assert multifile_ds_digest == cache_key_hash_multifile_ugrid_mesh2d_sha1
+
+
+def test_cache_key_with_multifile_dataset_cfgrid2d(datasets: pathlib.Path):
+    """Hash the geometry of the two-file cfgrid2d fixture opened via open_mfdataset."""
+    cfgrid_path1 = datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc'
+    cfgrid_path2 = datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc'
+
+    dataset_paths = [cfgrid_path1, cfgrid_path2]
+
+    multifile_ds_hash = hashlib.sha1()
+
+    # Open the dataset as a context manager so the underlying files are
+    # closed again even if hashing the geometry raises.
+    with xarray.open_mfdataset(dataset_paths, data_vars=['values']) as multifile_dataset:
+        multifile_dataset.ems.hash_geometry(multifile_ds_hash)
+
+    multifile_ds_digest = multifile_ds_hash.hexdigest()
+
+    assert multifile_ds_digest == cache_key_hash_multifile_cf2d_sha1