Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion docs/releases/development.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
Next release (in development)
=============================

* ...
* Fix datasets hash_key generation when geometry encoding
is missing a dtype (:issue:`166`, :pr:`167`).
4 changes: 3 additions & 1 deletion src/emsarray/conventions/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1973,7 +1973,9 @@ def hash_geometry(self, hash: "hashlib._Hash") -> None:
# Include the dtype of the data array.
# A float array and an int array mean very different things,
# but could have identical byte patterns.
hash_string(hash, data_array.encoding['dtype'].name)
# Checking for encoding dtype and falling back to values.dtype due to
# xarray multifile dataset bug - https://github.com/pydata/xarray/issues/2436
hash_string(hash, data_array.encoding.get('dtype', data_array.values.dtype).name)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The dtype encoding value should be present most of the time. It being missing is an unexpected exception to the norm. Without the context of this bug checking data_array.values.dtype looks unnecessary.

Please add a brief one- or two-line description of why we are doing this extra step, and include a link to the xarray bug report tracking this issue: pydata/xarray#2436

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added a comment with issue link and a simple description.


# Include the size and shape of the data.
# 1D coordinate arrays are very different to 2D coordinate arrays,
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
39 changes: 39 additions & 0 deletions tests/operations/test_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pathlib

import pytest
import xarray

import emsarray
import emsarray.operations.cache
Expand All @@ -12,6 +13,8 @@
int_hash = '7b08e025e311c3dfcf5179b67c0fdc08e73de261'
attr_hash_lat = "2cb433979fc2d9c3884eea8569dd6a44406950f3"
cache_key_hash_cf1d_sha1 = "2b006999273225ed70d4810357b6a06e6bebe9a6"
cache_key_hash_multifile_cf2d_sha1 = "ea2d2e6131f1e499f622e83ed4fc2415649def06"
cache_key_hash_multifile_ugrid_mesh2d_sha1 = "1d72e01b159135208324ae9a643166f85aecba27"

# Blake2b
cache_key_hash_cf1d = "1a3226072f08441ee79f727b0775709209ff2965299539c898ecc401cf17e23f"
Expand Down Expand Up @@ -200,3 +203,39 @@ def test_cache_key_cfgrid1d_sha1(datasets: pathlib.Path):
assert result_cache_key_cf is not None

assert result_cache_key_cf == cache_key_hash_cf1d_sha1


def test_cache_key_with_multifile_dataset_ugrid_mesh2d(datasets: pathlib.Path):
    """Geometry hash of a multi-file UGRID mesh2d dataset matches the known key.

    Regression test: when datasets are opened via ``open_mfdataset`` the
    ``dtype`` entry can be missing from variable encodings, which previously
    broke hash_geometry.
    """
    # Two consecutive daily files combined into one multi-file dataset.
    dataset = xarray.open_mfdataset(
        [
            datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-01.nc',
            datasets / 'multifile_datasets/ugrid_mesh2d/ugrid_mesh2d_2024-01-02.nc',
        ],
        data_vars=['values'],
    )

    hasher = hashlib.sha1()
    dataset.ems.hash_geometry(hasher)

    assert hasher.hexdigest() == cache_key_hash_multifile_ugrid_mesh2d_sha1


def test_cache_key_with_multifile_dataset_cfgrid2d(datasets: pathlib.Path):
    """Geometry hash of a multi-file CF grid 2D dataset matches the known key.

    Regression test: ``open_mfdataset`` can leave ``dtype`` out of variable
    encodings, which previously broke hash_geometry.
    """
    # Two consecutive daily files combined into one multi-file dataset.
    dataset = xarray.open_mfdataset(
        [
            datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-01.nc',
            datasets / 'multifile_datasets/cfgrid2d/cfgrid2d_2024-01-02.nc',
        ],
        data_vars=['values'],
    )

    hasher = hashlib.sha1()
    dataset.ems.hash_geometry(hasher)

    assert hasher.hexdigest() == cache_key_hash_multifile_cf2d_sha1
Loading