Skip to content

Commit 55dbce7

Browse files
authored
Merge pull request #390 from davidhassell/persist-environment
Persist data after computation
2 parents d8da40b + 6a73eec commit 55dbce7

File tree

10 files changed

+288
-35
lines changed

10 files changed

+288
-35
lines changed

Changelog.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ Version NEXTVERSION
33

44
**2026-??-??**
55

6+
* New keyword parameter to `cfdm.Data.compute`: ``persist``
7+
(https://github.com/NCAS-CMS/cfdm/issues/389)
8+
* New function to control the persistence of computed data:
9+
`cfdm.persist_data` (https://github.com/NCAS-CMS/cfdm/issues/389)
610
* Support for HEALPix grids
711
(https://github.com/NCAS-CMS/cfdm/issues/370)
812
* New default backend for netCDF-4 in `cfdm.write`: ``h5netcdf-h5py``,

cfdm/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
integer_dtype,
6666
log_level,
6767
parse_indices,
68+
persist_data,
6869
rtol,
6970
unique_constructs,
7071
_disable_logging,

cfdm/core/data/data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@ def source(self, default=ValueError()):
834834
>>> f = {{package}}.read('file.nc')[0]
835835
>>> d = f.data
836836
>>> d.source()
837-
<{{repr}}NetCDF4Array(149, 182): file=file.nc variable=latitude>
837+
<{{repr}}PyfiveArray(149, 182): file.nc latitude(149, 182)>
838838
839839
"""
840840
return self._get_component("array", default=default)

cfdm/data/data.py

Lines changed: 67 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
display_data,
2323
is_log_level_info,
2424
parse_indices,
25+
persist_data,
2526
)
2627
from ..mixin.container import Container
2728
from ..mixin.files import Files
@@ -140,6 +141,9 @@ def __new__(cls, *args, **kwargs):
140141
# Function for determining whether or not to display data
141142
# elements during `__str__`
142143
instance._display_data = display_data
144+
# Function for determining whether or not to persist computed
145+
# data
146+
instance._persist_data = persist_data
143147
return instance
144148

145149
def __init__(
@@ -2721,11 +2725,17 @@ def array(self):
27212725
"""A numpy array copy of the data.
27222726
27232727
In-place changes to the returned numpy array do not affect the
2724-
underlying dask array.
2728+
underlying Dask array.
27252729
27262730
The returned numpy array has the same mask hardness and fill
27272731
values as the data.
27282732
2733+
.. note:: If the `{{package}}.persist_data` function returns
2734+
True then calling `array` will persist the
2735+
underlying lazy Dask array into an equivalent
2736+
chunked Dask array, but now with the results fully
2737+
computed and cached memory.
2738+
27292739
Compare with `compute`.
27302740
27312741
**Performance**
@@ -2734,7 +2744,8 @@ def array(self):
27342744
returned `numpy` array is a deep copy of that returned by
27352745
created `compute`.
27362746
2737-
.. seealso:: `datetime_array`, `compute`, `persist`
2747+
.. seealso:: `datetime_array`, `compute`, `persist`,
2748+
`{{package}}.persist_data`
27382749
27392750
**Examples**
27402751
@@ -4102,7 +4113,9 @@ def compressed(self, inplace=False):
41024113
d._set_dask(dx, clear=self._ALL, in_memory=True)
41034114
return d
41044115

4105-
def compute(self, _force_to_memory=True, _cache_elements=True):
4116+
def compute(
4117+
self, persist=None, _force_to_memory=True, _cache_elements=True
4118+
):
41064119
"""A view of the computed data.
41074120
41084121
In-place changes to the returned array *might* affect the
@@ -4120,11 +4133,30 @@ def compute(self, _force_to_memory=True, _cache_elements=True):
41204133
41214134
.. versionadded:: (cfdm) 1.11.2.0
41224135
4123-
.. seealso:: `persist`, `array`, `datetime_array`,
4124-
`sparse_array`
4136+
.. seealso:: `array`, `datetime_array`, `sparse_array`,
4137+
`persist`, `{{package}}.persist_data`
41254138
41264139
:Parameters:
41274140
4141+
persist: `None` or `bool`, optional
4142+
Control the persistence of computed data. Persisting
4143+
turns the underlying lazy dask array into an
4144+
equivalent chunked dask array, but now with the
4145+
results fully computed and cached memory. This can
4146+
avoid the expense of re-reading the data from disk, or
4147+
re-computing it, when the data is accessed on multiple
4148+
occasions.
4149+
4150+
If *persist* is `None` (the default) then the value of
4151+
*persist* will be taken from the
4152+
`{{package}}.persist_data` function. If *persist* is
4153+
True then the data is persisted, regardless of value
4154+
returned by `{{package}}.persist_data`. If *persist*
4155+
is False then the data is not persisted, regardless of
4156+
value returned by `{{package}}.persist_data`.
4157+
4158+
.. versionadded:: (cfdm) NEXTVERSION
4159+
41284160
_force_to_memory: `bool`, optional
41294161
If True (the default) then force the data resulting
41304162
from computing the returned Dask graph to be in
@@ -4172,12 +4204,28 @@ def compute(self, _force_to_memory=True, _cache_elements=True):
41724204
[0.029 0.059 0.039 0.07 0.058 0.072 0.009 0.017]
41734205
[0.006 0.036 0.019 0.035 0.018 0.037 0.034 0.013]]
41744206
>>> f.data.compute(_force_to_memory=False)
4175-
<{{repr}}NetCDF4Array(5, 8): file.nc, q(5, 8)>
4207+
<{{repr}}PyfiveArray(5, 8): file.nc, q(5, 8)>
41764208
41774209
"""
41784210
dx = self.to_dask_array(
41794211
_force_mask_hardness=False, _force_to_memory=_force_to_memory
41804212
)
4213+
4214+
if persist is None:
4215+
persist = self._persist_data()
4216+
4217+
if persist:
4218+
dx = dx.persist()
4219+
4220+
# Note to developers: If the following `_set_dask` call is
4221+
# changed, consider making the same
4222+
# changes in `persist`.
4223+
self._set_dask(
4224+
dx,
4225+
clear=self._ALL ^ self._ARRAY ^ self._CACHE,
4226+
in_memory=True,
4227+
)
4228+
41814229
a = dx.compute()
41824230

41834231
if np.ma.isMA(a) and a is not np.ma.masked:
@@ -4189,11 +4237,16 @@ def compute(self, _force_to_memory=True, _cache_elements=True):
41894237
a.set_fill_value(self.get_fill_value(None))
41904238

41914239
if _cache_elements:
4192-
from scipy.sparse import issparse
4240+
ok = False
4241+
if isinstance(a, (np.ndarray, int, float, bool, str)):
4242+
ok = True
4243+
else:
4244+
from scipy.sparse import issparse
41934245

4194-
if isinstance(a, (np.ndarray, int, float, bool, str)) or issparse(
4195-
a
4196-
):
4246+
if issparse(a):
4247+
ok = True
4248+
4249+
if ok:
41974250
self.cache_elements(_array=a)
41984251

41994252
return a
@@ -6584,6 +6637,10 @@ def persist(self, inplace=False):
65846637
_force_mask_hardness=False, _force_to_memory=True
65856638
)
65866639
dx = dx.persist()
6640+
6641+
# Note to developers: If the following `_set_dask` call is
6642+
# changed, consider making the same
6643+
# changes in `compute`.
65876644
d._set_dask(
65886645
dx, clear=self._ALL ^ self._ARRAY ^ self._CACHE, in_memory=True
65896646
)

cfdm/docstring/docstring.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -834,7 +834,7 @@
834834
# persist
835835
"{{persist description}}": """Persisting turns an underlying lazy dask array into an
836836
equivalent chunked dask array, but now with the results fully
837-
computed and in memory. This can avoid the expense of
837+
computed and cached in memory. This can avoid the expense of
838838
re-reading the data from disk, or re-computing it, when the
839839
data is accessed on multiple occasions.""",
840840
# ----------------------------------------------------------------

0 commit comments

Comments
 (0)