1 change: 1 addition & 0 deletions doc/api/index.rst
@@ -91,6 +91,7 @@ Utilities
.. autosummary::
    :toctree: generated/

    minmax
    maxabs
    variance_to_weights
    grid_to_table
2 changes: 1 addition & 1 deletion verde/__init__.py
@@ -36,7 +36,7 @@
from .scipygridder import Cubic, Linear, ScipyGridder
from .spline import Spline, SplineCV
from .trend import Trend
from .utils import grid_to_table, make_xarray_grid, maxabs, variance_to_weights
from .utils import grid_to_table, make_xarray_grid, maxabs, minmax, variance_to_weights
from .vector import Vector, VectorSpline2D


88 changes: 88 additions & 0 deletions verde/tests/test_utils.py
@@ -26,6 +26,7 @@
    maxabs,
    meshgrid_from_1d,
    meshgrid_to_1d,
    minmax,
    parse_engine,
    partition_by_sum,
)
@@ -337,6 +338,93 @@ def test_check_ndim_easting_northing():
        get_ndim_horizontal_coords(easting, northing)


def test_minmax_nans():
    """
    Test minmax handles nans correctly
    """
    assert tuple(map(float, minmax((-1, 100, 1, 2, np.nan)))) == (-1, 100)
    assert tuple(map(float, minmax((np.nan, -3.2, -1, -2, 3.1)))) == (-3.2, 3.1)
    assert np.all(np.isnan(minmax((np.nan, -3, -1, 3), nan=False)))


def test_minmax_percentile():
    """
    Test minmax with percentile option
    """
    data = np.arange(0, 101)

    # test generic functionality
    result = tuple(map(float, minmax(data, min_percentile=0, max_percentile=100)))
    assert result == (0, 100)
    result = tuple(map(float, minmax(data, min_percentile=0.0, max_percentile=100.0)))
    assert result == (0, 100)
    result = tuple(map(float, minmax(data, min_percentile=10, max_percentile=90)))
    assert pytest.approx(result, 0.1) == (10, 90)
    result = tuple(map(float, minmax(data, min_percentile=10.0, max_percentile=90.0)))
    assert pytest.approx(result, 0.1) == (10, 90)

    # test with nans
    data_with_nans = np.append(data, np.nan)
    result = tuple(
        map(float, minmax(data_with_nans, min_percentile=0, max_percentile=100))
    )
    assert result == (0, 100)
    result = tuple(
        map(float, minmax(data_with_nans, min_percentile=10, max_percentile=90))
    )
    assert pytest.approx(result, 0.1) == (10, 90)
    result = tuple(
        map(
            float,
            minmax(data_with_nans, min_percentile=10, max_percentile=90, nan=True),
        )
    )
    assert pytest.approx(result, 0.1) == (10, 90)
    result = minmax(data_with_nans, min_percentile=10, max_percentile=90, nan=False)
    assert np.all(np.isnan(result))

    # test with varying array sizes
    result = tuple(
        map(
            float,
            minmax(
                [0, 1, 2, 3, 4], [[-2, 2], [0, 5]], min_percentile=0, max_percentile=100
            ),
        )
    )
    assert result == (-2, 5)
    result = tuple(
        map(
            float,
            minmax(
                [0, 1, 2, 3, 4], [[-2, 2], [0, 5]], min_percentile=1, max_percentile=99
            ),
        )
    )
    assert pytest.approx(result, 0.1) == (-1.84, 4.92)

    # test invalid percentile types
    msg = "Invalid 'min_percentile' of type '"
    with pytest.raises(TypeError, match=msg):
        minmax(data, min_percentile="90")
    with pytest.raises(TypeError, match=msg):
        minmax(data, min_percentile=None)
    msg = "Invalid 'max_percentile' of type '"
    with pytest.raises(TypeError, match=msg):
        minmax(data, max_percentile=[90])

    # test invalid percentile values
    msg = "'min_percentile'"
    with pytest.raises(ValueError, match=msg):
        minmax(data, min_percentile=99, max_percentile=90)
    msg = "Invalid value for 'min_percentile'"
    with pytest.raises(ValueError, match=msg):
        minmax(data, min_percentile=-10)
    msg = "Invalid value for 'max_percentile'"
    with pytest.raises(ValueError, match=msg):
        minmax(data, max_percentile=110)


def test_maxabs_nans():
    """
    Test maxabs handles nans correctly
131 changes: 131 additions & 0 deletions verde/utils.py
@@ -261,6 +261,137 @@ def maxabs(*args, nan=True, percentile=100):
    return nppercentile(combined_array, percentile)


def minmax(*args, nan=True, min_percentile=0, max_percentile=100):
    """
    Calculate the minimum and maximum values (or percentiles) of the given
    array(s).

    Use this to set the limits of your colorbars for non-diverging data.

    Parameters
    ----------
    args
        One or more arrays. If more than one are given, a minimum and maximum
        will be calculated across all arrays.
    nan : bool, optional
        If True, will use the ``nan`` version of numpy functions to ignore
        NaNs.
    min_percentile : float
        Return the supplied percentile (0 to 100) instead of the minimum value
        of the arrays. Defaults to 0, giving the minimum value.
    max_percentile : float
        Return the supplied percentile (0 to 100) instead of the maximum value
        of the arrays. Defaults to 100, giving the maximum value.

    Returns
    -------
    min, max : float
        The minimum and maximum (or percentile) values across all arrays.

    Examples
    --------

    >>> result = minmax((1, -10, 25, 2, 3))
    >>> tuple(map(float, result))
    (-10.0, 25.0)
    >>> result = minmax(
    ...     (1, -10.5, 25, 2), (0.1, 100, -500), (-200, -300, -0.1, -499)
    ... )
    >>> tuple(map(float, result))
    (-500.0, 100.0)

    If the array contains NaNs, we'll use the ``nan`` version of the numpy
    functions by default. You can turn this off through the *nan* argument.

    >>> import numpy as np
    >>> result = minmax((1, -10, 25, 2, 3, np.nan))
    >>> tuple(map(float, result))
    (-10.0, 25.0)
    >>> result = minmax((1, -10, 25, 2, 3, np.nan), nan=False)
    >>> tuple(map(float, result))
    (nan, nan)

    If a more robust statistic is desired, you can use ``min_percentile``
    and/or ``max_percentile`` to get the values at given percentiles instead
    of the minimum and maximum.

    >>> import numpy as np
    >>> result = minmax(
    ...     (1, -10, 25, 2, 3), min_percentile=2, max_percentile=98
    ... )
    >>> tuple(map(float, result))
    (-9.12, 23.24)
    >>> result = minmax(
    ...     (1, -10, 25, 2, 3), min_percentile=0, max_percentile=100
    ... )
    >>> tuple(map(float, result))
    (-10.0, 25.0)

    """
    arrays = [np.atleast_1d(i) for i in args]

    # checks
    if not isinstance(min_percentile, (float, int)):
        raise TypeError(
            f"Invalid 'min_percentile' of type '{type(min_percentile).__name__}'. "
            "Must be a float or int."
        )
    if not isinstance(max_percentile, (float, int)):
        raise TypeError(
            f"Invalid 'max_percentile' of type '{type(max_percentile).__name__}'. "
            "Must be a float or int."
        )
    if not min_percentile <= max_percentile:
        raise ValueError(
            f"'min_percentile' ({min_percentile}) must not be larger than "
            f"'max_percentile' ({max_percentile})."
        )
    if min_percentile < 0 or min_percentile > 100:
        raise ValueError(
            f"Invalid value for 'min_percentile' ({min_percentile}). "
            "Must be between 0 and 100."
        )
    if max_percentile < 0 or max_percentile > 100:
        raise ValueError(
            f"Invalid value for 'max_percentile' ({max_percentile}). "
            "Must be between 0 and 100."
        )
    # validate 'nan' before dispatching to the nan-aware numpy functions
    if not isinstance(nan, bool):
        raise TypeError(f"The 'nan' parameter ({nan}) must be a boolean.")

    # determine which functions to use
    if nan:
        npmin = np.nanmin
        npmax = np.nanmax
        nppercentile = np.nanpercentile
    else:
        npmin = np.min
        npmax = np.max
        nppercentile = np.percentile

    # get min value
    if min_percentile == 0:
        min_ = npmin([npmin(i) for i in arrays])

    # get max value
    if max_percentile == 100:
        max_ = npmax([npmax(i) for i in arrays])

    # calculate min, max or both percentiles
    if min_percentile != 0 or max_percentile != 100:
        # concatenate values of all arrays
        combined_array = np.concatenate([a.ravel() for a in arrays])

        # if neither percentile is the default, calculate both together
        if min_percentile != 0 and max_percentile != 100:
            min_, max_ = nppercentile(combined_array, [min_percentile, max_percentile])

        # only calculate min percentile
        elif min_percentile != 0:
            min_ = nppercentile(combined_array, min_percentile)

        # only calculate max percentile
        if max_percentile != 100:
            max_ = nppercentile(combined_array, max_percentile)

    return min_, max_
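
For context, the colorbar use case mentioned in the docstring might look like the following sketch (not part of this diff; the random grid and percentile choices are illustrative):

import matplotlib.pyplot as plt
import numpy as np
import verde as vd

# hypothetical 2D data; any array-like input would work the same way
grid = np.random.default_rng(42).normal(loc=10, scale=2, size=(50, 50))

# robust colorbar limits that ignore the most extreme 2% of values
vmin, vmax = vd.minmax(grid, min_percentile=1, max_percentile=99)
plt.pcolormesh(grid, vmin=vmin, vmax=vmax)
plt.colorbar()
plt.show()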
Comment on lines +367 to +392

Member
I think we could simplify this logic a bit. What about this:

Suggested change (replacing lines +367 to +392 quoted above):

    if min_percentile == 0 and max_percentile == 100:
        min_ = npmin([npmin(i) for i in arrays])
        max_ = npmax([npmax(i) for i in arrays])
        return min_, max_
    # concatenate values of all arrays
    combined_array = np.concatenate([a.ravel() for a in arrays])
    min_, max_ = nppercentile(combined_array, [min_percentile, max_percentile])
    return min_, max_

If either min_percentile or max_percentile differs from its default, then we'll use nppercentile anyway. And min_percentile=0 will always give the minimum value. Moreover, I suspect there's no significant computational hit in passing two percentiles, since I guess Numpy computes each one while looping over the elements of the array once.

So, we would only use npmin and npmax if min_percentile==0 and max_percentile==100.
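
For example, a minimal check of that equivalence (an illustrative sketch, assuming numpy's default linear interpolation):

import numpy as np

a = np.array([1.0, -10.0, 25.0, 2.0, 3.0])
# the 0th and 100th percentiles coincide with min and max
assert np.percentile(a, [0, 100]).tolist() == [a.min(), a.max()]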

What do you think?

Member Author

There is a bit of time savings from only calculating one percentile and using min/max for the other. But maybe the savings aren't worth the extra code? At least that's how I interpreted the benchmark results below. The first call uses min and max with no percentiles, the second uses the 1st and 99th percentiles, and the third uses min and the 99th percentile. As you can see, one percentile plus one min/max calculation is a bit faster than two percentile calculations.

What do you think?

%timeit vd.minmax(a, min_percentile=0, max_percentile=100, nan=True)
%timeit vd.minmax(a, min_percentile=1, max_percentile=99, nan=True)
%timeit vd.minmax(a, min_percentile=0, max_percentile=99, nan=True)

50.5 μs ± 2.56 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
289 μs ± 26.5 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
179 μs ± 3.93 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)

Member
Hmm, I would have guessed that passing multiple percentiles would not impact the computation time that much. Nonetheless, those differences are not significant, so I think I would keep the code simpler. Do you agree?

Even for a large array with 100 million elements, the difference is not very noticeable to users:

import numpy as np

a = np.random.default_rng().uniform(size=100_000_000)

# benchmark np.min and np.percentile with a single value
%timeit np.min(a)
%timeit np.percentile(a, 90)
38.1 ms ± 351 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
862 ms ± 3.75 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# benchmark percentile with two values
%timeit np.percentile(a, [0, 90])
1.01 s ± 5.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)



def make_xarray_grid(
    coordinates,
    data,