Skip to content

Commit 51f7515

Browse files
committed
apply astype in the right order to prevent loss of precision, more fillvalue fixes
1 parent 986ffc6 commit 51f7515

File tree

1 file changed

+51
-26
lines changed

1 file changed

+51
-26
lines changed

xarray/coding/variables.py

Lines changed: 51 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -220,34 +220,38 @@ class CFMaskCoder(VariableCoder):
220220
def encode(self, variable: Variable, name: T_Name = None):
221221
dims, data, attrs, encoding = unpack_for_encoding(variable)
222222

223+
# get dtype from encoding if available, otherwise use data.dtype
223224
dtype = np.dtype(encoding.get("dtype", data.dtype))
224225
fv = encoding.get("_FillValue")
225226
mv = encoding.get("missing_value")
226227

227-
if (
228-
fv is not None
229-
and mv is not None
230-
and not duck_array_ops.allclose_or_equiv(fv, mv)
231-
):
232-
raise ValueError(
233-
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
234-
)
235-
236-
if fv is not None:
237-
# Ensure _FillValue is cast to same dtype as data's
238-
encoding["_FillValue"] = dtype.type(fv)
239-
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
240-
if not pd.isnull(fill_value):
241-
data = duck_array_ops.fillna(data, fill_value)
242-
243-
if mv is not None:
244-
# Ensure missing_value is cast to same dtype as data's
245-
encoding["missing_value"] = dtype.type(mv)
246-
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
247-
if not pd.isnull(fill_value) and fv is None:
248-
data = duck_array_ops.fillna(data, fill_value)
228+
if fv is not None or mv is not None:
229+
if (
230+
fv is not None
231+
and mv is not None
232+
and not duck_array_ops.allclose_or_equiv(fv, mv)
233+
):
234+
raise ValueError(
235+
f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data."
236+
)
249237

250-
return Variable(dims, data, attrs, encoding, fastpath=True)
238+
if fv is not None:
239+
# Ensure _FillValue is cast to same dtype as data's
240+
encoding["_FillValue"] = dtype.type(fv)
241+
fill_value = pop_to(encoding, attrs, "_FillValue", name=name)
242+
if not pd.isnull(fill_value):
243+
data = duck_array_ops.fillna(data, fill_value)
244+
245+
if mv is not None:
246+
# Only use mv if _FillValue isn't available
247+
# Ensure missing_value is cast to same dtype as data's
248+
encoding["missing_value"] = attrs.get("_FillValue", dtype.type(mv))
249+
fill_value = pop_to(encoding, attrs, "missing_value", name=name)
250+
if not pd.isnull(fill_value) and fv is None:
251+
data = duck_array_ops.fillna(data, fill_value)
252+
return Variable(dims, data, attrs, encoding, fastpath=True)
253+
else:
254+
return variable
251255

252256
def decode(self, variable: Variable, name: T_Name = None):
253257
dims, data, attrs, encoding = unpack_for_decoding(variable)
@@ -272,7 +276,13 @@ def decode(self, variable: Variable, name: T_Name = None):
272276
stacklevel=3,
273277
)
274278

275-
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
279+
if "scale_factor" not in attrs and "add_offset" not in attrs:
280+
dtype, decoded_fill_value = dtypes.maybe_promote(data.dtype)
281+
else:
282+
dtype, decoded_fill_value = (
283+
_choose_float_dtype(data.dtype, attrs),
284+
np.nan,
285+
)
276286

277287
if encoded_fill_values:
278288
transform = partial(
@@ -319,6 +329,10 @@ def _choose_float_dtype(
319329
and offset_type == scale_type
320330
and scale_type in [np.float32, np.float64]
321331
):
332+
# in case of int32 -> we need to upcast to float64
333+
# due to precision issues
334+
if dtype.itemsize == 4 and np.issubdtype(dtype, np.integer):
335+
return np.float64
322336
return np.dtype(scale_type).type
323337
# Not CF conforming and add_offset given:
324338
# A scale factor is entirely safe (vanishing into the mantissa),
@@ -354,7 +368,12 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable:
354368
scale_factor = pop_to(encoding, attrs, "scale_factor", name=name)
355369
add_offset = pop_to(encoding, attrs, "add_offset", name=name)
356370
if scale_factor or add_offset:
357-
dtype = _choose_float_dtype(data.dtype, attrs)
371+
# if we have a _FillValue/missing_value we do not want to cast now
372+
# but leave that to CFMaskCoder
373+
dtype = data.dtype
374+
if "_FillValue" not in encoding and "missing_value" not in encoding:
375+
dtype = _choose_float_dtype(data.dtype, attrs)
376+
# but we still need a copy to prevent changing the original data
358377
data = data.astype(dtype=dtype, copy=True)
359378
if add_offset:
360379
data -= add_offset
@@ -373,7 +392,13 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable:
373392
scale_factor = np.asarray(scale_factor).item()
374393
if np.ndim(add_offset) > 0:
375394
add_offset = np.asarray(add_offset).item()
376-
dtype = _choose_float_dtype(data.dtype, encoding)
395+
# if we have a _FillValue/missing_value we already have the wanted
396+
# floating point dtype here (via CFMaskCoder), so no check is necessary
397+
# only check in other cases
398+
dtype = data.dtype
399+
if "_FillValue" not in encoding and "missing_value" not in encoding:
400+
dtype = _choose_float_dtype(dtype, encoding)
401+
377402
transform = partial(
378403
_scale_offset_decoding,
379404
scale_factor=scale_factor,

0 commit comments

Comments
 (0)