SNOW-2396047: Support np.percentile(DataFrame | Series) (#3940)

sfc-gh-helmeleegy · web-flow · commit 4ef6ebbd1bb1 · 2025-10-23T15:35:51.000-07:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -93,6 +93,7 @@
 - Added support for `nunique` in `df.pivot_table`, `df.agg` and other places where aggregate functions can be used.
 - Added support for `DataFrame.interpolate` and `Series.interpolate` with the "linear", "ffill"/"pad", and "backfill"/bfill" methods. These use the SQL `INTERPOLATE_LINEAR`, `INTERPOLATE_FFILL`, and `INTERPOLATE_BFILL` functions (PuPr).
 - Added support for `Dataframe.groupby.rolling()`.
+- Added support for mapping `np.percentile` with DataFrame and Series inputs to `Series.quantile`.
 
 #### Improvements
 
diff --git a/docs/source/modin/numpy.rst b/docs/source/modin/numpy.rst
@@ -37,6 +37,12 @@ NumPy ufuncs called with Snowpark pandas arguments will ignore kwargs.
 | ``np.full_like``            | Mapped to pd.DataFrame(value, index=range(height), |
 |                             |                        columns=range(width))       |
 +-----------------------------+----------------------------------------------------+
+| ``np.percentile``           | Mapped to Series.quantile. A DataFrame is stacked  |
+|                             | into a Series first. Always returns an ndarray or  |
+|                             | scalar, like np.percentile. Does not implement any |
+|                             | arguments other than the input array and           |
+|                             | percentage(s).                                     |
++-----------------------------+----------------------------------------------------+
 | ``np.may_share_memory``     | Returns False                                      |
 +-----------------------------+----------------------------------------------------+
 | ``np.abs``                  | Mapped to df.abs()                                 |
diff --git a/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py b/src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py
@@ -223,6 +223,77 @@ def full_like_mapper(
     return NotImplemented
 
 
+def percentile_mapper(
+    a: Union[pd.DataFrame, pd.Series],
+    q: Union[float, list[float]],
+    axis: Union[int, tuple[int], None] = None,
+    out: Optional[np.ndarray] = None,
+    overwrite_input: Optional[bool] = False,
+    method: Optional[str] = "linear",
+    keepdims: Optional[bool] = False,
+    *,
+    weights: Optional[list[float]] = None,
+    interpolation: Optional[str] = None,
+) -> Union[np.ndarray, float]:
+    """
+    Maps and executes the numpy percentile signature to the pandas quantile signature
+    if it can be handled, otherwise returns NotImplemented. No parameters
+    are supported beyond the input a and q parameters.
+
+    Numpy np.percentile signature:
+    Return the q-th percentile(s) of an array, Series, or DataFrame (a).
+
+    Pandas Series.quantile signature:
+    Return the q-th quantile(s) of a Series or DataFrame.
+
+    A DataFrame input is stacked into a single Series before the quantile is
+    computed, and percentages (0-100) are rescaled to quantiles (0-1).
+
+    Parameters
+    ----------
+    a : A modin pandas DataFrame or Series
+    q : float or array_like of float
+        Percentage(s) to compute. Python and numpy scalars, lists and 1-d
+        arrays are accepted; inputs with more than one dimension are not.
+    axis, out, overwrite_input, method, keepdims, weights, interpolation :
+        Not supported. Passing a non-default value returns NotImplemented.
+
+    Returns
+    -------
+    A scalar for a scalar q, an ndarray for a list-like q, or NotImplemented
+    when the call cannot be mapped.
+    """
+    unsupported_argument_used = (
+        axis is not None
+        or out is not None
+        or overwrite_input
+        or method != "linear"
+        or keepdims
+        or weights is not None
+        or interpolation is not None
+    )
+    if unsupported_argument_used:
+        return NotImplemented
+
+    # np.asarray handles numpy integer scalars (e.g. np.int64) and 0-d arrays,
+    # which are neither instances of int/float nor iterable.
+    quantiles = np.asarray(q, dtype=float) / 100
+    if quantiles.ndim > 1:
+        return NotImplemented
+    # tolist() yields a Python float for a scalar q and a list of floats otherwise.
+    q = quantiles.tolist()
+
+    # We stack any input dataframe into a Series to match numpy behavior.
+    input_values = a
+    if isinstance(input_values, pd.DataFrame):
+        input_values = input_values.stack().reset_index(drop=True)
+
+    result = input_values.quantile(q)
+    if isinstance(result, pd.Series):
+        result = result.values
+    return result
+
+
 # We also need to convert everything to booleans, since numpy will
 # do this implicitly on logical operators and pandas does not.
 def map_to_bools(inputs: Any) -> Any:
@@ -238,6 +309,7 @@ def map_to_bools(inputs: Any) -> Any:
     "unique": unique_mapper,
     "may_share_memory": may_share_memory_mapper,
     "full_like": full_like_mapper,
+    "percentile": percentile_mapper,
 }
 
 # Map that associates a numpy universal function name that operates on
diff --git a/tests/integ/modin/test_numpy.py b/tests/integ/modin/test_numpy.py
@@ -154,6 +154,46 @@ def test_full_like():
         np.full_like(snow_df, 1234, dtype=int)
 
 
+class TestPercentile:
+    @pytest.mark.parametrize("q", [50, [50, 75]])
+    @sql_count_checker(query_count=1)
+    def test_np_percentile(self, q):
+        """np.percentile on a Series matches numpy for scalar and list q."""
+        data = [0, 0, 1, 1]
+        numpy_res = np.percentile(np.array(data), q)
+        snow_res = np.percentile(pd.Series(data), q)
+        np.testing.assert_allclose(snow_res, numpy_res)
+
+    @pytest.mark.parametrize("q", [50, [50, 75]])
+    @sql_count_checker(query_count=1)
+    def test_np_percentile_2d(self, q):
+        """np.percentile on a DataFrame matches numpy on the same 2-d values."""
+        snow_a_2d = pd.DataFrame({"a": [1, 2, 5, 6], "b": [1, 2, 3, 4]})
+        numpy_res = np.percentile(np.array([[1, 2, 5, 6], [1, 2, 3, 4]]), q)
+        snow_res = np.percentile(snow_a_2d, q)
+        np.testing.assert_allclose(snow_res, numpy_res)
+
+    @pytest.mark.parametrize(
+        "kwargs",
+        [
+            {"axis": 1},
+            {"out": np.zeros(1)},
+            {"overwrite_input": True},
+            {"method": "inverted_cdf"},
+            {"keepdims": True},
+            {"weights": [0.25, 0.25, 0.25, 0.25]},
+            {"interpolation": "inverted_cdf"},
+        ],
+    )
+    @sql_count_checker(query_count=0)
+    def test_np_percentile_neg(self, kwargs):
+        """Each unsupported argument makes the mapper return NotImplemented,
+        which numpy reports as a TypeError without running any query."""
+        snow_a_2d = pd.DataFrame({"a": [1, 2, 5, 6], "b": [1, 2, 3, 4]})
+        with pytest.raises(TypeError):
+            np.percentile(snow_a_2d, 50, **kwargs)
+
+
 def test_logical_operators():
     data = {
         "A": [0, 1, 2, 0, 1, 2, 0, 1, 2],