Skip to content

Commit 850156c

Browse files
jhamman and keewis authored
add reset_encoding to dataset/dataarray/variable (#7689)
* add reset_encoding to dataset/dataarray/variable
* fix bad return type
* update io docs
---------
Co-authored-by: Justus Magin <[email protected]>
1 parent a61d73b commit 850156c

File tree

10 files changed

+89
-21
lines changed

10 files changed

+89
-21
lines changed

doc/api-hidden.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@
234234
Variable.dims
235235
Variable.dtype
236236
Variable.encoding
237+
Variable.reset_encoding
237238
Variable.imag
238239
Variable.nbytes
239240
Variable.ndim

doc/api.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ Dataset contents
112112
Dataset.drop_dims
113113
Dataset.set_coords
114114
Dataset.reset_coords
115+
Dataset.reset_encoding
115116
Dataset.convert_calendar
116117
Dataset.interp_calendar
117118
Dataset.get_index
@@ -303,6 +304,7 @@ DataArray contents
303304
DataArray.drop_indexes
304305
DataArray.drop_duplicates
305306
DataArray.reset_coords
307+
DataArray.reset_encoding
306308
DataArray.copy
307309
DataArray.convert_calendar
308310
DataArray.interp_calendar

doc/user-guide/io.rst

Lines changed: 11 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -254,31 +254,22 @@ You can view this encoding information (among others) in the
254254
:py:attr:`DataArray.encoding` and
255255
:py:attr:`Dataset.encoding` attributes:
256256

257-
.. ipython::
258-
:verbatim:
257+
.. ipython:: python
259258
260-
In [1]: ds_disk["y"].encoding
261-
Out[1]:
262-
{'zlib': False,
263-
'shuffle': False,
264-
'complevel': 0,
265-
'fletcher32': False,
266-
'contiguous': True,
267-
'chunksizes': None,
268-
'source': 'saved_on_disk.nc',
269-
'original_shape': (5,),
270-
'dtype': dtype('int64'),
271-
'units': 'days since 2000-01-01 00:00:00',
272-
'calendar': 'proleptic_gregorian'}
273-
274-
In [9]: ds_disk.encoding
275-
Out[9]:
276-
{'unlimited_dims': set(),
277-
'source': 'saved_on_disk.nc'}
259+
ds_disk["y"].encoding
260+
ds_disk.encoding
278261
279262
Note that all operations that manipulate variables other than indexing
280263
will remove encoding information.
281264

265+
In some cases it is useful to intentionally reset a dataset's original encoding values.
266+
This can be done with either the :py:meth:`Dataset.reset_encoding` or
267+
:py:meth:`DataArray.reset_encoding` methods.
268+
269+
.. ipython:: python
270+
271+
ds_no_encoding = ds_disk.reset_encoding()
272+
ds_no_encoding.encoding
282273
283274
.. _combining multiple files:
284275

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ v2023.04.0 (unreleased)
2222

2323
New Features
2424
~~~~~~~~~~~~
25+
- New methods to reset an object's encoding (:py:meth:`Dataset.reset_encoding`, :py:meth:`DataArray.reset_encoding`)
26+
(:issue:`7686`, :pull:`7689`).
27+
By `Joe Hamman <https://github.com/jhamman>`_.
2528

2629

2730
Breaking changes

xarray/core/dataarray.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,6 +877,12 @@ def encoding(self) -> dict[Any, Any]:
877877
def encoding(self, value: Mapping[Any, Any]) -> None:
878878
self.variable.encoding = dict(value)
879879

880+
def reset_encoding(self: T_DataArray) -> T_DataArray:
881+
"""Return a new DataArray without encoding on the array or any attached
882+
coords."""
883+
ds = self._to_temp_dataset().reset_encoding()
884+
return self._from_temp_dataset(ds)
885+
880886
@property
881887
def indexes(self) -> Indexes:
882888
"""Mapping of pandas.Index objects used for label based indexing.

xarray/core/dataset.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -666,6 +666,12 @@ def encoding(self) -> dict[Any, Any]:
666666
def encoding(self, value: Mapping[Any, Any]) -> None:
667667
self._encoding = dict(value)
668668

669+
def reset_encoding(self: T_Dataset) -> T_Dataset:
670+
"""Return a new Dataset without encoding on the dataset or any of its
671+
variables/coords."""
672+
variables = {k: v.reset_encoding() for k, v in self.variables.items()}
673+
return self._replace(variables=variables, encoding={})
674+
669675
@property
670676
def dims(self) -> Frozen[Hashable, int]:
671677
"""Mapping from dimension names to lengths.

xarray/core/variable.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,10 @@ def encoding(self, value):
977977
except ValueError:
978978
raise ValueError("encoding must be castable to a dictionary")
979979

980+
def reset_encoding(self: T_Variable) -> T_Variable:
981+
"""Return a new Variable without encoding."""
982+
return self._replace(encoding={})
983+
980984
def copy(
981985
self: T_Variable, deep: bool = True, data: ArrayLike | None = None
982986
) -> T_Variable:

xarray/tests/test_dataarray.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,25 @@ def test_encoding(self) -> None:
278278
self.dv.encoding = expected2
279279
assert expected2 is not self.dv.encoding
280280

281+
def test_reset_encoding(self) -> None:
282+
array = self.mda
283+
encoding = {"scale_factor": 10}
284+
array.encoding = encoding
285+
array["x"].encoding = encoding
286+
287+
assert array.encoding == encoding
288+
assert array["x"].encoding == encoding
289+
290+
actual = array.reset_encoding()
291+
292+
# did not modify in place
293+
assert array.encoding == encoding
294+
assert array["x"].encoding == encoding
295+
296+
# variable and coord encoding is empty
297+
assert actual.encoding == {}
298+
assert actual["x"].encoding == {}
299+
281300
def test_constructor(self) -> None:
282301
data = np.random.random((2, 3))
283302

xarray/tests/test_dataset.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2827,6 +2827,21 @@ def test_copy_with_data_errors(self) -> None:
28272827
with pytest.raises(ValueError, match=r"contain all variables in original"):
28282828
orig.copy(data={"var1": new_var1})
28292829

2830+
def test_reset_encoding(self) -> None:
2831+
orig = create_test_data()
2832+
vencoding = {"scale_factor": 10}
2833+
orig.encoding = {"foo": "bar"}
2834+
2835+
for k, v in orig.variables.items():
2836+
orig[k].encoding = vencoding
2837+
2838+
actual = orig.reset_encoding()
2839+
assert actual.encoding == {}
2840+
for k, v in actual.variables.items():
2841+
assert v.encoding == {}
2842+
2843+
assert_equal(actual, orig)
2844+
28302845
def test_rename(self) -> None:
28312846
data = create_test_data()
28322847
newnames = {

xarray/tests/test_variable.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,23 @@ def test_encoding_preserved(self):
455455
assert_identical(expected.to_base_variable(), actual.to_base_variable())
456456
assert expected.encoding == actual.encoding
457457

458+
def test_reset_encoding(self) -> None:
459+
encoding1 = {"scale_factor": 1}
460+
# encoding set via cls constructor
461+
v1 = self.cls(["a"], [0, 1, 2], encoding=encoding1)
462+
assert v1.encoding == encoding1
463+
v2 = v1.reset_encoding()
464+
assert v1.encoding == encoding1
465+
assert v2.encoding == {}
466+
467+
# encoding set via setter
468+
encoding3 = {"scale_factor": 10}
469+
v3 = self.cls(["a"], [0, 1, 2], encoding=encoding3)
470+
assert v3.encoding == encoding3
471+
v4 = v3.reset_encoding()
472+
assert v3.encoding == encoding3
473+
assert v4.encoding == {}
474+
458475
def test_concat(self):
459476
x = np.arange(5)
460477
y = np.arange(5, 10)
@@ -2201,9 +2218,13 @@ def test_coarsen_keep_attrs(self, operation="mean"):
22012218
assert new.attrs == _attrs
22022219

22032220

2221+
def _init_dask_variable(*args, **kwargs):
2222+
return Variable(*args, **kwargs).chunk()
2223+
2224+
22042225
@requires_dask
22052226
class TestVariableWithDask(VariableSubclassobjects):
2206-
cls = staticmethod(lambda *args: Variable(*args).chunk())
2227+
cls = staticmethod(_init_dask_variable)
22072228

22082229
def test_chunk(self):
22092230
unblocked = Variable(["dim_0", "dim_1"], np.ones((3, 4)))

0 commit comments

Comments
 (0)