Skip to content

Commit 5fe4e6b

Browse files
authored
Merge pull request #71 from csiro-coasts/timedelta-fill-value
Timedelta fill value
2 parents 78c67a7 + ba6f6b6 commit 5fe4e6b

File tree

11 files changed

+136
-50
lines changed

11 files changed

+136
-50
lines changed

docs/releases/development.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@
22
Next release (in development)
33
=============================
44

5-
* ...
5+
* Fixed an issue with ``_FillValue`` / ``missing_value``
6+
and variables with non-float types such as ``timedelta64``
7+
(:pr:`71`)

src/emsarray/masking.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import numpy as np
1616
import xarray as xr
17+
from xarray.core.dtypes import maybe_promote
1718

1819
from emsarray import utils
1920
from emsarray.types import Pathish
@@ -71,7 +72,6 @@ def mask_grid_dataset(
7172
# file system, at the added expense of having to recombine the dataset
7273
# afterwards.
7374
for key, data_array in dataset.data_vars.items():
74-
logger.debug("DataArray %s", key)
7575
masked_data_array = mask_grid_data_array(mask, data_array)
7676
variable_path = work_path / f"{key}.nc"
7777
mfdataset_names.append(variable_path)
@@ -130,19 +130,28 @@ def mask_grid_data_array(mask: xr.Dataset, data_array: xr.DataArray) -> xr.DataA
130130
try:
131131
fill_value = find_fill_value(data_array)
132132
except ValueError:
133+
logger.debug(
134+
"Data array %r has no valid fill value, leaving as is",
135+
data_array.name)
133136
return data_array
134137

135138
# Loop through each possible mask
136139
for mask_name, mask_data_array in mask.data_vars.items():
137140
# If every dimension of this mask exists in the data array, apply it
138141
if dimensions >= set(mask_data_array.dims):
142+
logger.debug(
143+
"Masking data array %r with mask %r",
144+
data_array.name, mask_name)
139145
new_data_array = cast(xr.DataArray, data_array.where(mask_data_array, other=fill_value))
140146
new_data_array.attrs = data_array.attrs
141147
new_data_array.encoding = data_array.encoding
142148
return new_data_array
143149

144150
# Fallback, no appropriate mask was found, so don't apply any.
145151
# This generally happens for data arrays such as time, record, x_grid, etc.
152+
logger.debug(
153+
"Data array %r had no relevant mask, leaving as is",
154+
data_array.name)
146155
return data_array
147156

148157

@@ -182,24 +191,16 @@ def find_fill_value(data_array: xr.DataArray) -> Any:
182191
# constructed a dataset using one...
183192
return np.ma.masked
184193

185-
if '_FillValue' in data_array.encoding:
186-
# The dataset was opened with mask_and_scale=True and a mask has been
187-
# applied. Masked values are now represented as np.nan, not _FillValue.
188-
return np.nan
189-
190-
if '_FillValue' in data_array.attrs:
191-
# The dataset was opened with mask_and_scale=False and a mask has not
192-
# been applied. Masked values should be represented using _FillValue.
193-
return data_array.attrs['_FillValue']
194-
195-
if issubclass(data_array.dtype.type, np.floating):
196-
# NaN is a useful fallback for a _FillValue, but only if the dtype
197-
# is some sort of float. We won't actually _set_ a _FillValue
198-
attribute though, as that can play havoc when trying to save
199-
# existing datasets. xarray gets real grumpy when you have
200-
# a _FillValue and a missing_value, and some existing datasets play
201-
# fast and loose with mixing the two.
202-
return np.nan
194+
attrs = ['_FillValue', 'missing_value']
195+
for attr in attrs:
196+
if attr in data_array.attrs:
197+
# The dataset was opened with mask_and_scale=False and a mask has not
198+
# been applied. Masked values should be represented using _FillValue/missing_value.
199+
return data_array.attrs[attr]
200+
201+
promoted_dtype, fill_value = maybe_promote(data_array.dtype)
202+
if promoted_dtype == data_array.dtype:
203+
return fill_value
203204

204205
raise ValueError("No appropriate fill value found")
205206

src/emsarray/utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from packaging.version import Version
3232
from xarray.coding import times
3333
from xarray.core.common import contains_cftime_datetimes
34+
from xarray.core.dtypes import maybe_promote
3435

3536
from emsarray.types import Pathish
3637

@@ -233,8 +234,10 @@ def disable_default_fill_value(dataset_or_array: Union[xr.Dataset, xr.DataArray]
233234
The :class:`xarray.Dataset` or :class:`xarray.DataArray` to update
234235
"""
235236
for variable in _get_variables(dataset_or_array):
237+
current_dtype = variable.dtype
238+
promoted_dtype, fill_value = maybe_promote(current_dtype)
236239
if (
237-
issubclass(variable.dtype.type, np.floating)
240+
current_dtype == promoted_dtype
238241
and "_FillValue" not in variable.encoding
239242
and "_FillValue" not in variable.attrs
240243
):
0 Bytes
Binary file not shown.
Binary file not shown.
Binary file not shown.

tests/datasets/masking/find_fill_value/make_datasets.py

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,19 @@
11
#!/usr/bin/env python3
22

3+
"""
4+
Make some datasets for testing fill values.
5+
Because of how xarray preprocesses variables to apply masks,
6+
it is easier to construct these datasets using the plain netCDF4 library,
7+
save the datasets to disk, and then load them using xarray.
8+
This guarantees that the behaviour in the tests will replicate real-world use.
9+
10+
Running this script will overwrite any datasets already constructed in this directory.
11+
This operation should result in byte-for-byte identical datasets each time it is run.
12+
However, each netCDF4 dataset will encode the versions of the
13+
netCDF4, hdf5 and other relevant libraries used to construct the dataset.
14+
If those library versions have changed, the regenerated files will differ and git will report them as modified.
15+
"""
16+
317
import pathlib
418

519
import netCDF4
@@ -8,8 +22,10 @@
822
here = pathlib.Path(__file__).parent
923

1024

11-
def make_float_with_fill_value() -> None:
12-
ds = netCDF4.Dataset(here / "float_with_fill_value.nc", "w", "NETCDF4")
25+
def make_float_with_fill_value(
26+
output_path: pathlib.Path = here / "float_with_fill_value.nc"
27+
) -> None:
28+
ds = netCDF4.Dataset(output_path, "w", "NETCDF4")
1329
ds.createDimension("x", 2)
1430
ds.createDimension("y", 2)
1531

@@ -20,8 +36,10 @@ def make_float_with_fill_value() -> None:
2036
ds.close()
2137

2238

23-
def make_float_with_fill_value_and_offset() -> None:
24-
ds = netCDF4.Dataset(here / "float_with_fill_value_and_offset.nc", "w", "NETCDF4")
39+
def make_float_with_fill_value_and_offset(
40+
output_path: pathlib.Path = here / "float_with_fill_value_and_offset.nc",
41+
) -> None:
42+
ds = netCDF4.Dataset(output_path, "w", "NETCDF4")
2543
ds.createDimension("x", 2)
2644
ds.createDimension("y", 2)
2745

@@ -34,8 +52,27 @@ def make_float_with_fill_value_and_offset() -> None:
3452
ds.close()
3553

3654

37-
def make_int_with_fill_value_and_offset() -> None:
38-
ds = netCDF4.Dataset(here / "int_with_fill_value_and_offset.nc", "w", "NETCDF4")
55+
def make_timedelta_with_missing_value(
56+
output_path: pathlib.Path = here / "timedelta_with_missing_value.nc",
57+
) -> None:
58+
ds = netCDF4.Dataset(output_path, "w", "NETCDF4")
59+
ds.createDimension("x", 2)
60+
ds.createDimension("y", 2)
61+
62+
missing_value = np.float32(1.e+35)
63+
var = ds.createVariable("var", "f4", ["y", "x"], fill_value=False)
64+
var.missing_value = missing_value
65+
var.units = "days"
66+
var[:] = np.arange(4).reshape((2, 2))
67+
var[1, 1] = missing_value
68+
69+
ds.close()
70+
71+
72+
def make_int_with_fill_value_and_offset(
73+
output_path: pathlib.Path = here / "int_with_fill_value_and_offset.nc",
74+
) -> None:
75+
ds = netCDF4.Dataset(output_path, "w", "NETCDF4")
3976
ds.createDimension("x", 2)
4077
ds.createDimension("y", 2)
4178

@@ -51,4 +88,5 @@ def make_int_with_fill_value_and_offset() -> None:
5188
if __name__ == '__main__':
5289
make_float_with_fill_value()
5390
make_float_with_fill_value_and_offset()
91+
make_timedelta_with_missing_value()
5492
make_int_with_fill_value_and_offset()
Binary file not shown.

tests/masking/test_mask_dataset.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -155,10 +155,10 @@ def test_mask_dataset(tmp_path: pathlib.Path):
155155
data=np.random.normal(0, 0.2, (records, j_size, i_size)),
156156
dims=["record", "j_centre", "i_centre"],
157157
attrs={
158-
"units": "metre",
159-
"long_name": "Surface elevation",
160-
"standard_name": "sea_surface_height_above_geoid",
161-
}
158+
"units": "metre",
159+
"long_name": "Surface elevation",
160+
"standard_name": "sea_surface_height_above_geoid",
161+
}
162162
)
163163
temp = xr.DataArray(
164164
data=np.random.normal(12, 0.5, (records, k_size, j_size, i_size)),
@@ -262,7 +262,7 @@ def test_mask_dataset(tmp_path: pathlib.Path):
262262
assert nc_flag2.shape == (k_size, 4, 3)
263263
flag2_mask = np.stack([np.array([
264264
[0, 0, 0], [0, 0, 0], [0, 0, 1], [0, 1, 1]
265-
])]*k_size).astype(bool)
265+
])] * k_size).astype(bool)
266266
expected: np.ndarray = np.ma.masked_array(
267267
flag2.values[:, 1:5, 1:4].copy(),
268268
mask=flag2_mask,

tests/masking/test_utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from numpy.testing import assert_equal
1010

1111
from emsarray import masking
12+
from emsarray.utils import to_netcdf_with_fixes
1213
from tests.utils import mask_from_strings
1314

1415

@@ -102,6 +103,26 @@ def test_find_fill_value_masked_and_scaled_int(datasets):
102103
assert_dtype_equal(masking.find_fill_value(data_array), np.int8(-1))
103104

104105

106+
def test_find_fill_value_timedelta_with_missing_value(
107+
datasets: pathlib.Path,
108+
tmp_path: pathlib.Path,
109+
) -> None:
110+
dataset_path = datasets / 'masking/find_fill_value/timedelta_with_missing_value.nc'
111+
112+
missing_value = np.float32(1.e35)
113+
assert_raw_values(
114+
dataset_path, 'var',
115+
np.array([[0, 1], [2, missing_value]], dtype=np.float32))
116+
117+
with xr.open_dataset(dataset_path) as dataset:
118+
data_array = dataset['var']
119+
assert dataset['var'].dtype == np.dtype('timedelta64[ns]')
120+
fill_value = masking.find_fill_value(data_array)
121+
assert np.isnat(fill_value)
122+
123+
to_netcdf_with_fixes(dataset, tmp_path / 'dataset.nc')
124+
125+
105126
def test_calculate_mask_bounds():
106127
mask = xr.Dataset(
107128
data_vars={

0 commit comments

Comments
 (0)