Skip to content

Commit 4ef6ebb

Browse files
SNOW-2396047: Support np.percentile(DataFrame | Series) (#3940)
1 parent 30f43d7 commit 4ef6ebb

File tree

4 files changed

+119
-0
lines changed

4 files changed

+119
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@
9393
- Added support for `nunique` in `df.pivot_table`, `df.agg` and other places where aggregate functions can be used.
9494
- Added support for `DataFrame.interpolate` and `Series.interpolate` with the "linear", "ffill"/"pad", and "backfill"/"bfill" methods. These use the SQL `INTERPOLATE_LINEAR`, `INTERPOLATE_FFILL`, and `INTERPOLATE_BFILL` functions (PuPr).
9595
- Added support for `DataFrame.groupby.rolling()`.
96+
- Added support for mapping `np.percentile` with DataFrame and Series inputs to `Series.quantile`.
9697

9798
#### Improvements
9899

docs/source/modin/numpy.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ NumPy ufuncs called with Snowpark pandas arguments will ignore kwargs.
3737
| ``np.full_like`` | Mapped to pd.DataFrame(value, index=range(height), |
3838
| | columns=range(width)) |
3939
+-----------------------------+----------------------------------------------------+
40+
| ``np.percentile`` | Mapped to Series.quantile, will stack a DataFrame |
41+
| | to convert to Series. Always returns an ndarray or |
42+
| | scalar like np.percentile. Does not implement any |
43+
| | arguments other than the input array and |
44+
| | percentage(s). |
45+
+-----------------------------+----------------------------------------------------+
4046
| ``np.may_share_memory`` | Returns False |
4147
+-----------------------------+----------------------------------------------------+
4248
| ``np.abs`` | Mapped to df.abs() |

src/snowflake/snowpark/modin/plugin/utils/numpy_to_pandas.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,77 @@ def full_like_mapper(
223223
return NotImplemented
224224

225225

226+
def percentile_mapper(
    a: Union[pd.DataFrame, pd.Series],
    q: Union[float, list[float]],
    axis: Union[int, tuple[int], None] = None,
    out: Optional[np.ndarray] = None,
    overwrite_input: Optional[bool] = False,
    method: Optional[str] = "linear",
    keepdims: Optional[bool] = False,
    *,
    weights: Optional[list[float]] = None,
    interpolation: Optional[str] = None,
) -> Union[np.ndarray, float]:
    """
    Map a ``np.percentile`` call onto ``Series.quantile``.

    Only the input ``a`` and the percentage(s) ``q`` are supported. If any
    other argument is set to a non-default value, ``NotImplemented`` is
    returned so the caller can reject the call.

    A DataFrame input is stacked into a single Series first, because
    ``np.percentile`` without ``axis`` works on the flattened input.

    Parameters
    ----------
    a : DataFrame or Series
        Values to compute the percentile(s) of.
    q : scalar or 1-D array_like of float
        Percentage(s) in numpy's 0-100 convention. They are divided by 100
        before being passed to ``quantile``. Python numbers, NumPy scalars
        and 0-d arrays are all treated as a single percentage.
    axis : unsupported; any value other than None returns NotImplemented
    out : unsupported; any value other than None returns NotImplemented
    overwrite_input : unsupported; a truthy value returns NotImplemented
    method : unsupported; any value other than "linear" returns NotImplemented
    keepdims : unsupported; a truthy value returns NotImplemented
    weights : unsupported; any value other than None returns NotImplemented
    interpolation : unsupported; any value other than None returns NotImplemented

    Returns
    -------
    ndarray, scalar, or NotImplemented
        A scalar when ``q`` is a scalar, an ndarray when ``q`` is a sequence,
        and ``NotImplemented`` when the call cannot be handled.
    """
    uses_unsupported_argument = (
        axis is not None
        or out is not None
        or overwrite_input
        or method != "linear"
        or keepdims
        or weights is not None
        or interpolation is not None
    )
    if uses_unsupported_argument:
        return NotImplemented

    # Normalise q through numpy so that NumPy scalars (e.g. np.int64, which is
    # not an ``int`` subclass) and 0-d arrays take the scalar path instead of
    # failing when iterated.
    percentages = np.asarray(q, dtype=float)
    if percentages.ndim > 1:
        # Only scalar and 1-D q map cleanly onto Series.quantile.
        return NotImplemented
    if percentages.ndim == 0:
        quantiles = percentages.item() / 100
    else:
        quantiles = (percentages / 100).tolist()

    # We stack any input dataframe into a Series to match numpy behavior.
    # NOTE(review): pandas' stack() drops NaN by default and quantile skips
    # NaN, whereas np.percentile propagates NaN - confirm this divergence is
    # acceptable for inputs containing missing values.
    if isinstance(a, pd.DataFrame):
        a = a.stack().reset_index(drop=True)

    result = a.quantile(quantiles)
    # A list-like q yields a Series; unwrap it so callers always get an
    # ndarray (or a scalar), like np.percentile.
    if isinstance(result, pd.Series):
        result = result.values

    return result
295+
296+
226297
# We also need to convert everything to booleans, since numpy will
227298
# do this implicitly on logical operators and pandas does not.
228299
def map_to_bools(inputs: Any) -> Any:
@@ -238,6 +309,7 @@ def map_to_bools(inputs: Any) -> Any:
238309
"unique": unique_mapper,
239310
"may_share_memory": may_share_memory_mapper,
240311
"full_like": full_like_mapper,
312+
"percentile": percentile_mapper,
241313
}
242314

243315
# Map that associates a numpy universal function name that operates on

tests/integ/modin/test_numpy.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,46 @@ def test_full_like():
154154
np.full_like(snow_df, 1234, dtype=int)
155155

156156

157+
class TestPercentile:
    """Tests for ``np.percentile`` called with Series and DataFrame arguments."""

    @pytest.mark.parametrize("q", [50, [50, 75]])
    @sql_count_checker(query_count=1)
    def test_np_percentile(self, q):
        """A Series gives the same percentile(s) as the equivalent 1-D ndarray."""
        values = [0, 0, 1, 1]
        numpy_input = np.array(values)
        series_input = pd.Series(values)
        expected = np.percentile(numpy_input, q)
        actual = np.percentile(series_input, q)
        assert (expected == actual).all()

    @pytest.mark.parametrize("q", [50, [50, 75]])
    @sql_count_checker(query_count=1)
    def test_np_percentile_2d(self, q):
        """A DataFrame gives the same percentile(s) as the equivalent 2-D ndarray."""
        columns = {"a": [1, 2, 5, 6], "b": [1, 2, 3, 4]}
        numpy_input = np.array([columns["a"], columns["b"]])
        frame_input = pd.DataFrame(columns)
        expected = np.percentile(numpy_input, q)
        actual = np.percentile(frame_input, q)
        assert (expected == actual).all()

    @pytest.mark.parametrize(
        "kwargs",
        [
            {"axis": 1},
            {"out": np.zeros(1)},
            {"overwrite_input": True},
            {"method": "inverted_cdf"},
            {"keepdims": True},
            {"weights": [0.25, 0.25, 0.25, 0.25]},
            {"interpolation": "inverted_cdf"},
        ],
    )
    @sql_count_checker(query_count=0)
    def test_np_percentile_neg(self, kwargs):
        """Each optional np.percentile argument is rejected with a TypeError."""
        frame_input = pd.DataFrame({"a": [1, 2, 5, 6], "b": [1, 2, 3, 4]})
        # Verify that numpy throws type errors when we return NotImplemented
        # when using optional parameters
        with pytest.raises(TypeError):
            np.percentile(frame_input, 50, **kwargs)
195+
196+
157197
def test_logical_operators():
158198
data = {
159199
"A": [0, 1, 2, 0, 1, 2, 0, 1, 2],

0 commit comments

Comments
 (0)