Handle floating point boundaries

rob-sil · rob-sil · commit 9d40d0572426 · 2024-08-04T09:37:24.000-07:00
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -615,6 +615,7 @@ Groupby/resample/rolling
 
 Reshaping
 ^^^^^^^^^
+- Bug in :func:`qcut` where values at the quantile boundaries could be incorrectly assigned (:issue:`59355`)
 - Bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when ``sort=False`` (:issue:`54987`, :issue:`55516`)
 - Bug in :meth:`DataFrame.unstack` producing incorrect results when manipulating empty :class:`DataFrame` with an :class:`ExtentionDtype` (:issue:`59123`)
diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py
@@ -94,9 +94,9 @@ def quantile_with_mask(
         flat = np.array([fill_value] * len(qs))
         result = np.repeat(flat, len(values)).reshape(len(values), len(qs))
     else:
-        result = _nanpercentile(
+        result = _nanquantile(
             values,
-            qs * 100.0,
+            qs,
             na_value=fill_value,
             mask=mask,
             interpolation=interpolation,
@@ -108,15 +108,15 @@ def quantile_with_mask(
     return result
 
 
-def _nanpercentile_1d(
+def _nanquantile_1d(
     values: np.ndarray,
     mask: npt.NDArray[np.bool_],
     qs: npt.NDArray[np.float64],
     na_value: Scalar,
     interpolation: str,
 ) -> Scalar | np.ndarray:
     """
-    Wrapper for np.percentile that skips missing values, specialized to
+    Wrapper for np.quantile that skips missing values, specialized to
     1-dimensional case.
 
     Parameters
@@ -142,17 +142,10 @@ def _nanpercentile_1d(
         # equiv: 'np.array([na_value] * len(qs))' but much faster
         return np.full(len(qs), na_value)
 
-    return np.percentile(
-        values,
-        qs,
-        # error: No overload variant of "percentile" matches argument
-        # types "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]"
-        # , "Dict[str, str]"  [call-overload]
-        method=interpolation,  # type: ignore[call-overload]
-    )
+    return np.quantile(values, qs, method=interpolation)
 
 
-def _nanpercentile(
+def _nanquantile(
     values: np.ndarray,
     qs: npt.NDArray[np.float64],
     *,
@@ -161,7 +154,7 @@ def _nanpercentile(
     interpolation: str,
 ):
     """
-    Wrapper for np.percentile that skips missing values.
+    Wrapper for np.quantile that skips missing values.
 
     Parameters
     ----------
@@ -180,7 +173,7 @@ def _nanpercentile(
 
     if values.dtype.kind in "mM":
         # need to cast to integer to avoid rounding errors in numpy
-        result = _nanpercentile(
+        result = _nanquantile(
             values.view("i8"),
             qs=qs,
             na_value=na_value.view("i8"),
@@ -196,7 +189,7 @@ def _nanpercentile(
         # Caller is responsible for ensuring mask shape match
         assert mask.shape == values.shape
         result = [
-            _nanpercentile_1d(val, m, qs, na_value, interpolation=interpolation)
+            _nanquantile_1d(val, m, qs, na_value, interpolation=interpolation)
             for (val, m) in zip(list(values), list(mask))
         ]
         if values.dtype.kind == "f":
@@ -215,12 +208,4 @@ def _nanpercentile(
                 result = result.astype(values.dtype, copy=False)
         return result
     else:
-        return np.percentile(
-            values,
-            qs,
-            axis=1,
-            # error: No overload variant of "percentile" matches argument types
-            # "ndarray[Any, Any]", "ndarray[Any, dtype[floating[_64Bit]]]",
-            # "int", "Dict[str, str]"  [call-overload]
-            method=interpolation,  # type: ignore[call-overload]
-        )
+        return np.quantile(values, qs, axis=1, method=interpolation)
diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py
@@ -358,7 +358,16 @@ def qcut(
     x_idx = _preprocess_for_cut(x)
     x_idx, _ = _coerce_to_type(x_idx)
 
-    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q
+    if is_integer(q):
+        quantiles = np.linspace(0, 1, q + 1)
+        # Round up rather than to nearest if not representable in base 2
+        np.putmask(
+            quantiles,
+            q * quantiles != np.arange(q + 1),
+            np.nextafter(quantiles, 1),
+        )
+    else:
+        quantiles = q
 
     bins = x_idx.to_series().dropna().quantile(quantiles)
 
diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py
@@ -307,3 +307,15 @@ def test_qcut_nullable_integer(q, any_numeric_ea_dtype):
     expected = qcut(arr.astype(float), q)
 
     tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize("scale", [1.0, 1 / 3, 17.0])
+@pytest.mark.parametrize("q", [3, 7, 9])
+@pytest.mark.parametrize("precision", [1, 3, 16])
+def test_qcut_contains(scale, q, precision):
+    # GH-59355
+    arr = (scale * np.arange(q + 1)).round(precision)
+    result = qcut(arr, q, precision=precision)
+
+    for value, bucket in zip(arr, result):
+        assert value in bucket