-
Notifications
You must be signed in to change notification settings - Fork 71
keep_longest_run is ressource-heavy #2131
Copy link
Copy link
Open
Labels
enhancementNew feature or requestNew feature or request
Description
Generic Issue
- xclim version: 0.54.0
- Python version:
- Operating System:
Description
xc.indices.run_length.keep_longest_run is slow and expensive in memory. With no resampling, for a 200 MB boolean dataset, it runs in 50 seconds and temporarily consumes 6 GB of memory.
What I Did
>>> season.nbytes / 1e6 # MB
206.7168
>>> run_season_longest_native = xc.indices.run_length.keep_longest_run(season)
CPU times: user 27.9 s, sys: 24.7 s, total: 52.6 s
Wall time: 52.9 s
>>> %memit keep_longest_run(season)
peak memory: 14061.95 MiB, increment: 6505.66 MiB
Memory profile information
Filename: /exec/sgammon/.conda/envs/fwi/lib/python3.10/site-packages/xclim/indices/run_length.py
Line # Mem usage Increment Occurrences Line Contents
=============================================================
178 3542.7 MiB 3542.7 MiB 1 def rle(
179 da: xr.DataArray,
180 dim: str = "time",
181 index: str = "first",
182 ) -> xr.DataArray:
183 """
184 Run length.
185
186 Despite its name, this is not an actual run-length encoder : it returns an array of the same shape
187 as the input with 0 where the input was <= 0, nan where the input was > 0, except on the first (or last) element
188 of each run of consecutive > 0 values, where it is set to the sum of the elements within the run.
189 For an actual run length encoder, see :py:func:`rle_1d`.
190
191 Usually, the input would be a boolean mask and the first element of each run would then be set to the run's length (thus the name).
192 But the function also accepts int and float inputs.
193
194 Parameters
195 ----------
196 da : xr.DataArray
197 Input array.
198 dim : str
199 Dimension name.
200 index : {'first', 'last'}
201 If 'first' (default), the run length is indexed with the first element in the run.
202 If 'last', with the last element in the run.
203
204 Returns
205 -------
206 xr.DataArray
207 The run length array.
208 """
209 3542.7 MiB 0.0 MiB 1 if da.dtype == bool:
210 5119.8 MiB 1577.1 MiB 1 da = da.astype(int)
211
212 # "first" case: Algorithm is applied on inverted array and output is inverted back
213 5119.8 MiB 0.0 MiB 1 if index == "first":
214 5119.8 MiB 0.0 MiB 1 da = da[{dim: slice(None, None, -1)}]
215
216 # Get cumulative sum for each series of 1, e.g. da == 100110111 -> cs_s == 100120123
217 6697.0 MiB 1577.2 MiB 1 cs_s = _cumsum_reset(da, dim)
218
219 # Keep total length of each series (and also keep 0's), e.g. 100120123 -> 100N20NN3
220 # Keep numbers with a 0 to the right and also the last number
221 6697.0 MiB 0.0 MiB 1 cs_s = cs_s.where(da.shift({dim: -1}, fill_value=0) == 0)
222 8274.1 MiB 1577.1 MiB 1 out = cs_s.where(da > 0, 0) # Reinsert 0's at their original place
223
224 # Inverting back if needed e.g. 100N20NN3 -> 3NN02N001. This is the output of
225 # `rle` for 111011001 with index == "first"
226 8274.1 MiB 0.0 MiB 1 if index == "first":
227 8274.1 MiB 0.0 MiB 1 out = out[{dim: slice(None, None, -1)}]
228
229 8274.1 MiB 0.0 MiB 1 return out
Filename: /exec/sgammon/.conda/envs/fwi/lib/python3.10/site-packages/xclim/indices/run_length.py
Line # Mem usage Increment Occurrences Line Contents
=============================================================
757 3542.7 MiB 3542.7 MiB 1 def keep_longest_run(
758 da: xr.DataArray, dim: str = "time", freq: str | None = None
759 ) -> xr.DataArray:
760 """
761 Keep the longest run along a dimension.
762
763 Parameters
764 ----------
765 da : xr.DataArray
766 Boolean array.
767 dim : str
768 Dimension along which to check for the longest run.
769 freq : str
770 Resampling frequency.
771
772 Returns
773 -------
774 xr.DataArray, [bool]
775 Boolean array similar to da but with only one run, the (first) longest.
776 """
777 # Get run lengths
778 5119.9 MiB 5119.9 MiB 1 rls = rle(da, dim)
779
780 5119.9 MiB 0.0 MiB 2 def _get_out(_rls): # numpydoc ignore=GL08
781 6697.1 MiB -197.1 MiB 2 _out = xr.where(
782 # Construct an integer array and find the max
783 5317.1 MiB 197.3 MiB 1 _rls[dim].copy(data=np.arange(_rls[dim].size)) == _rls.argmax(dim),
784 6894.3 MiB 1577.1 MiB 1 _rls + 1, # Add one to the First longest run
785 6894.3 MiB 0.0 MiB 1 _rls,
786 )
787 5321.3 MiB -1375.8 MiB 1 _out = _out.ffill(dim) == _out.max(dim)
788 5321.3 MiB 0.0 MiB 1 return _out
789
790 5119.9 MiB 0.0 MiB 1 if freq is not None:
791 out = resample_map(rls, dim, freq, _get_out)
792 else:
793 5321.3 MiB 0.0 MiB 1 out = _get_out(rls)
794
795 5321.3 MiB 0.0 MiB 1 return da.copy(data=out.transpose(*da.dims).data)
Time profile information
Timer unit: 1e-09 s
Total time: 32.9214 s
File: /exec/sgammon/.conda/envs/fwi/lib/python3.10/site-packages/xclim/indices/run_length.py
Function: rle at line 178
Line # Hits Time Per Hit % Time Line Contents
==============================================================
178 def rle(
179 da: xr.DataArray,
180 dim: str = "time",
181 index: str = "first",
182 ) -> xr.DataArray:
183 """
184 Run length.
185
186 Despite its name, this is not an actual run-length encoder : it returns an array of the same shape
187 as the input with 0 where the input was <= 0, nan where the input was > 0, except on the first (or last) element
188 of each run of consecutive > 0 values, where it is set to the sum of the elements within the run.
189 For an actual run length encoder, see :py:func:`rle_1d`.
190
191 Usually, the input would be a boolean mask and the first element of each run would then be set to the run's length (thus the name).
192 But the function also accepts int and float inputs.
193
194 Parameters
195 ----------
196 da : xr.DataArray
197 Input array.
198 dim : str
199 Dimension name.
200 index : {'first', 'last'}
201 If 'first' (default), the run length is indexed with the first element in the run.
202 If 'last', with the last element in the run.
203
204 Returns
205 -------
206 xr.DataArray
207 The run length array.
208 """
209 1 32926.0 32926.0 0.0 if da.dtype == bool:
210 1 1752066664.0 2e+09 5.3 da = da.astype(int)
211
212 # "first" case: Algorithm is applied on inverted array and output is inverted back
213 1 868.0 868.0 0.0 if index == "first":
214 1 832655.0 832655.0 0.0 da = da[{dim: slice(None, None, -1)}]
215
216 # Get cumulative sum for each series of 1, e.g. da == 100110111 -> cs_s == 100120123
217 1 2e+10 2e+10 56.9 cs_s = _cumsum_reset(da, dim)
218
219 # Keep total length of each series (and also keep 0's), e.g. 100120123 -> 100N20NN3
220 # Keep numbers with a 0 to the right and also the last number
221 1 9642794323.0 1e+10 29.3 cs_s = cs_s.where(da.shift({dim: -1}, fill_value=0) == 0)
222 1 2795720305.0 3e+09 8.5 out = cs_s.where(da > 0, 0) # Reinsert 0's at their original place
223
224 # Inverting back if needed e.g. 100N20NN3 -> 3NN02N001. This is the output of
225 # `rle` for 111011001 with index == "first"
226 1 3227.0 3227.0 0.0 if index == "first":
227 1 1038014.0 1e+06 0.0 out = out[{dim: slice(None, None, -1)}]
228
229 1 450.0 450.0 0.0 return out
Total time: 47.9595 s
File: /exec/sgammon/.conda/envs/fwi/lib/python3.10/site-packages/xclim/indices/run_length.py
Function: keep_longest_run at line 757
Line # Hits Time Per Hit % Time Line Contents
==============================================================
757 def keep_longest_run(
758 da: xr.DataArray, dim: str = "time", freq: str | None = None
759 ) -> xr.DataArray:
760 """
761 Keep the longest run along a dimension.
762
763 Parameters
764 ----------
765 da : xr.DataArray
766 Boolean array.
767 dim : str
768 Dimension along which to check for the longest run.
769 freq : str
770 Resampling frequency.
771
772 Returns
773 -------
774 xr.DataArray, [bool]
775 Boolean array similar to da but with only one run, the (first) longest.
776 """
777 # Get run lengths
778 1 3e+10 3e+10 69.7 rls = rle(da, dim)
779
780 1 7573.0 7573.0 0.0 def _get_out(_rls): # numpydoc ignore=GL08
781 _out = xr.where(
782 # Construct an integer array and find the max
783 _rls[dim].copy(data=np.arange(_rls[dim].size)) == _rls.argmax(dim),
784 _rls + 1, # Add one to the First longest run
785 _rls,
786 )
787 _out = _out.ffill(dim) == _out.max(dim)
788 return _out
789
790 1 1988.0 1988.0 0.0 if freq is not None:
791 out = resample_map(rls, dim, freq, _get_out)
792 else:
793 1 1e+10 1e+10 30.3 out = _get_out(rls)
794
795 1 6575041.0 7e+06 0.0 return da.copy(data=out.transpose(*da.dims).data)
- I agree to follow this project's Code of Conduct
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
enhancementNew feature or requestNew feature or request