-
-
Notifications
You must be signed in to change notification settings - Fork 1.2k
correctly encode/decode _FillValues/missing_values/dtypes for packed data #8713
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 6 commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
477cd58
correctly encode/decode _FillValues
kmuehlbauer 792e942
fix mypy
kmuehlbauer f743fd1
fix CFMaskCode test
kmuehlbauer 551719a
fix scale/offset
kmuehlbauer 85730e2
avert zarr issue
kmuehlbauer 5fe874f
add whats-new.rst entry
kmuehlbauer f848872
refactor fillvalue/missing value check to catch non-conforming values…
kmuehlbauer 672e84e
fix typing
kmuehlbauer bff3a5d
suppress warning in doc
kmuehlbauer 45b5b8c
Merge branch 'main' into fix-7691
kmuehlbauer ed79eb7
Merge branch 'main' into fix-7691
kmuehlbauer b89ce93
Merge branch 'main' into fix-7691
dcherian f90e7e0
FIX: silence SerializationWarnings
kmuehlbauer 554a61e
Merge branch 'main' into fix-7691
kmuehlbauer da2222e
FIX: silence mypy by casting to string early
kmuehlbauer f6fe9cb
Update xarray/tests/test_conventions.py
kmuehlbauer 6b30298
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 1d1b1a0
Shorten test, add comment checking for two warnings
kmuehlbauer 30985fa
make test even shorter
kmuehlbauer File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -280,8 +280,10 @@ def encode(self, variable: Variable, name: T_Name = None): | |
data = duck_array_ops.fillna(data, fill_value) | ||
|
||
if mv_exists: | ||
# Use _FillValue if available to align missing_value to prevent issues | ||
# when decoding | ||
# Ensure missing_value is cast to same dtype as data's | ||
encoding["missing_value"] = dtype.type(mv) | ||
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv)) | ||
fill_value = pop_to(encoding, attrs, "missing_value", name=name) | ||
if not pd.isnull(fill_value) and not fv_exists: | ||
if is_time_like and data.dtype.kind in "iu": | ||
|
@@ -322,7 +324,13 @@ def decode(self, variable: Variable, name: T_Name = None): | |
if _is_time_like(attrs.get("units")) and data.dtype.kind in "iu": | ||
dtype, decoded_fill_value = np.int64, np.iinfo(np.int64).min | ||
else: | ||
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) | ||
if "scale_factor" not in attrs and "add_offset" not in attrs: | ||
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype) | ||
else: | ||
dtype, decoded_fill_value = ( | ||
_choose_float_dtype(data.dtype, attrs), | ||
np.nan, | ||
) | ||
|
||
if encoded_fill_values: | ||
transform = partial( | ||
|
@@ -347,20 +355,51 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype: np.typing.DTyp | |
return data | ||
|
||
|
||
def _choose_float_dtype(dtype: np.dtype, has_offset: bool) -> type[np.floating[Any]]: | ||
def _choose_float_dtype( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function checks for the most appropriate dtype to use when encoding/decoding in |
||
dtype: np.dtype, mapping: MutableMapping | ||
) -> type[np.floating[Any]]: | ||
"""Return a float dtype that can losslessly represent `dtype` values.""" | ||
# Keep float32 as-is. Upcast half-precision to single-precision, | ||
# check scale/offset first to derive wanted float dtype | ||
# see https://github.com/pydata/xarray/issues/5597#issuecomment-879561954 | ||
scale_factor = mapping.get("scale_factor") | ||
add_offset = mapping.get("add_offset") | ||
if scale_factor is not None or add_offset is not None: | ||
# get the type from scale_factor/add_offset to determine | ||
# the needed floating point type | ||
if scale_factor is not None: | ||
scale_type = type(scale_factor) | ||
if add_offset is not None: | ||
offset_type = type(add_offset) | ||
# CF conforming, both scale_factor and add-offset are given and | ||
# of same floating point type (float32/64) | ||
if ( | ||
add_offset is not None | ||
and scale_factor is not None | ||
and offset_type == scale_type | ||
and scale_type in [np.float32, np.float64] | ||
kmuehlbauer marked this conversation as resolved.
Show resolved
Hide resolved
|
||
): | ||
# in case of int32 -> we need upcast to float64 | ||
# due to precision issues | ||
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer): | ||
return np.float64 | ||
return np.dtype(scale_type).type | ||
# Not CF conforming and add_offset given: | ||
# A scale factor is entirely safe (vanishing into the mantissa), | ||
# but a large integer offset could lead to loss of precision. | ||
# Sensitivity analysis can be tricky, so we just use a float64 | ||
# if there's any offset at all - better unoptimised than wrong! | ||
if add_offset is not None: | ||
return np.float64 | ||
# return dtype depending on given scale_factor | ||
return np.dtype(scale_type).type | ||
# If no scale_factor or add_offset is given, use some general rules. | ||
# Keep float32 as-is. Upcast half-precision to single-precision, | ||
# because float16 is "intended for storage but not computation" | ||
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating): | ||
return np.float32 | ||
# float32 can exactly represent all integers up to 24 bits | ||
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer): | ||
# A scale factor is entirely safe (vanishing into the mantissa), | ||
# but a large integer offset could lead to loss of precision. | ||
# Sensitivity analysis can be tricky, so we just use a float64 | ||
# if there's any offset at all - better unoptimised than wrong! | ||
if not has_offset: | ||
return np.float32 | ||
return np.float32 | ||
# For all other types and circumstances, we just use float64. | ||
# (safe because eg. complex numbers are not supported in NetCDF) | ||
return np.float64 | ||
|
@@ -377,7 +416,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: | |
dims, data, attrs, encoding = unpack_for_encoding(variable) | ||
|
||
if "scale_factor" in encoding or "add_offset" in encoding: | ||
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) | ||
# if we have a _FillValue/masked_value we do not want to cast now | ||
# but leave that to CFMaskCoder | ||
dtype = data.dtype | ||
if "_FillValue" not in encoding and "missing_value" not in encoding: | ||
dtype = _choose_float_dtype(data.dtype, encoding) | ||
# but still we need a copy prevent changing original data | ||
data = duck_array_ops.astype(data, dtype=dtype, copy=True) | ||
if "add_offset" in encoding: | ||
data -= pop_to(encoding, attrs, "add_offset", name=name) | ||
|
@@ -393,11 +437,17 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: | |
|
||
scale_factor = pop_to(attrs, encoding, "scale_factor", name=name) | ||
add_offset = pop_to(attrs, encoding, "add_offset", name=name) | ||
dtype = _choose_float_dtype(data.dtype, "add_offset" in encoding) | ||
if np.ndim(scale_factor) > 0: | ||
scale_factor = np.asarray(scale_factor).item() | ||
if np.ndim(add_offset) > 0: | ||
add_offset = np.asarray(add_offset).item() | ||
# if we have a _FillValue/masked_value we already have the wanted | ||
# floating point dtype here (via CFMaskCoder), so no check is necessary | ||
# only check in other cases | ||
dtype = data.dtype | ||
if "_FillValue" not in encoding and "missing_value" not in encoding: | ||
dtype = _choose_float_dtype(dtype, encoding) | ||
|
||
transform = partial( | ||
_scale_offset_decoding, | ||
scale_factor=scale_factor, | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.