Skip to content

Commit 9d40d05

Browse files
committed
Handle floating point boundaries
1 parent 642d244 commit 9d40d05

File tree

4 files changed

+33
-26
lines changed

4 files changed

+33
-26
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -615,6 +615,7 @@ Groupby/resample/rolling
615615

616616
Reshaping
617617
^^^^^^^^^
618+
- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
618619
- Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
619620
- Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
620621
- Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtensionDtype` (:issue:`59123`)

pandas/core/array_algos/quantile.py

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -94,9 +94,9 @@ def quantile_with_mask(
9494
flat = np.array([fill_value] * len(qs))
9595
result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
9696
else:
97-
result = _nanpercentile(
97+
result = _nanquantile(
9898
values,
99-
qs * 100.0,
99+
qs,
100100
na_value=fill_value,
101101
mask=mask,
102102
interpolation=interpolation,
@@ -108,15 +108,15 @@ def quantile_with_mask(
108108
return result
109109

110110

111-
def _nanpercentile_1d(
111+
def _nanquantile_1d(
112112
values: np.ndarray,
113113
mask: npt.NDArray[np.bool_],
114114
qs: npt.NDArray[np.float64],
115115
na_value: Scalar,
116116
interpolation: str,
117117
) -> Scalar | np.ndarray:
118118
"""
119-
Wrapper for np.percentile that skips missing values, specialized to
119+
Wrapper for np.quantile that skips missing values, specialized to
120120
1-dimensional case.
121121
122122
Parameters
@@ -142,17 +142,10 @@ def _nanpercentile_1d(
142142
# equiv: 'np.array([na_value] * len(qs))' but much faster
143143
return np.full(len(qs), na_value)
144144

145-
return np.percentile(
146-
values,
147-
qs,
148-
# error: No overload variant of "percentile" matches argument
149-
# types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
150-
# , "Dict[str, str]" [call-overload]
151-
method=interpolation, # type: ignore[call-overload]
152-
)
145+
return np.quantile(values, qs, method=interpolation)
153146

154147

155-
def _nanpercentile(
148+
def _nanquantile(
156149
values: np.ndarray,
157150
qs: npt.NDArray[np.float64],
158151
*,
@@ -161,7 +154,7 @@ def _nanpercentile(
161154
interpolation: str,
162155
):
163156
"""
164-
Wrapper for np.percentile that skips missing values.
157+
Wrapper for np.quantile that skips missing values.
165158
166159
Parameters
167160
----------
@@ -180,7 +173,7 @@ def _nanpercentile(
180173

181174
if values.dtype.kind in "mM":
182175
# need to cast to integer to avoid rounding errors in numpy
183-
result = _nanpercentile(
176+
result = _nanquantile(
184177
values.view("i8"),
185178
qs=qs,
186179
na_value=na_value.view("i8"),
@@ -196,7 +189,7 @@ def _nanpercentile(
196189
# Caller is responsible for ensuring mask shape match
197190
assert mask.shape == values.shape
198191
result = [
199-
_nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
192+
_nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
200193
for (val, m) in zip(list(values), list(mask))
201194
]
202195
if values.dtype.kind == "f":
@@ -215,12 +208,4 @@ def _nanpercentile(
215208
result = result.astype(values.dtype, copy=False)
216209
return result
217210
else:
218-
return np.percentile(
219-
values,
220-
qs,
221-
axis=1,
222-
# error: No overload variant of "percentile" matches argument types
223-
# "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
224-
# "int", "Dict[str, str]" [call-overload]
225-
method=interpolation, # type: ignore[call-overload]
226-
)
211+
return np.quantile(values, qs, axis=1, method=interpolation)

pandas/core/reshape/tile.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,16 @@ def qcut(
358358
x_idx = _preprocess_for_cut(x)
359359
x_idx, _ = _coerce_to_type(x_idx)
360360

361-
quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
361+
if is_integer(q):
362+
quantiles = np.linspace(0, 1, q + 1)
363+
# Round up rather than to nearest if not representable in base 2
364+
np.putmask(
365+
quantiles,
366+
q * quantiles != np.arange(q + 1),
367+
np.nextafter(quantiles, 1),
368+
)
369+
else:
370+
quantiles = q
362371

363372
bins = x_idx.to_series().dropna().quantile(quantiles)
364373

pandas/tests/reshape/test_qcut.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
307307
expected = qcut(arr.astype(float), q)
308308

309309
tm.assert_categorical_equal(result, expected)
310+
311+
312+
@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
313+
@pytest.mark.parametrize("q", [3, 7, 9])
314+
@pytest.mark.parametrize("precision", [1, 3, 16])
315+
def test_qcut_contains(scale, q, precision):
316+
# GH-59355
317+
arr = (scale * np.arange(q + 1)).round(precision)
318+
result = qcut(arr, q, precision=precision)
319+
320+
for value, bucket in zip(arr, result):
321+
assert value in bucket

0 commit comments

Comments
 (0)