Skip to content

API: rank with nullable dtypes preserve NA #62043

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ Other enhancements
- :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
- :meth:`Series.map` now accepts an ``engine`` parameter to allow execution with a third-party execution engine (:issue:`61125`)
- :meth:`Series.rank` and :meth:`DataFrame.rank` with numpy-nullable dtypes now preserve ``NA`` values and return a nullable result (``Float64`` for ``method="average"`` or ``pct=True``, ``UInt64`` otherwise) instead of casting ``NA`` to ``NaN`` with ``float64`` dtype (:issue:`62043`)
- :meth:`Series.str.get_dummies` now accepts a ``dtype`` parameter to specify the dtype of the resulting DataFrame (:issue:`47872`)
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)
- :py:class:`frozenset` elements in pandas objects are now natively printed (:issue:`60690`)
Expand Down
44 changes: 44 additions & 0 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np

from pandas._libs import (
algos as libalgos,
lib,
missing as libmissing,
)
Expand Down Expand Up @@ -992,6 +993,49 @@ def copy(self) -> Self:
mask = self._mask.copy()
return self._simple_new(data, mask)

def _rank(
self,
*,
axis: AxisInt = 0,
method: str = "average",
na_option: str = "keep",
ascending: bool = True,
pct: bool = False,
):
# GH#62043 Avoid going through copy-making ensure_data in algorithms.rank
if axis != 0 or self.ndim != 1:
raise NotImplementedError

from pandas.core.arrays import FloatingArray

data = self._data
if data.dtype.kind == "b":
data = data.view("uint8")

result = libalgos.rank_1d(
data,
is_datetimelike=False,
ties_method=method,
ascending=ascending,
na_option=na_option,
pct=pct,
mask=self.isna(),
)
if na_option in ["top", "bottom"]:
mask = np.zeros(self.shape, dtype=bool)
else:
mask = self._mask.copy()

if method != "average" and not pct:
if na_option not in ["top", "bottom"]:
result[self._mask] = 0 # avoid warning on casting
result = result.astype("uint64", copy=False)
from pandas.core.arrays import IntegerArray

return IntegerArray(result, mask=mask)

return FloatingArray(result, mask=mask)

@doc(ExtensionArray.duplicated)
def duplicated(
self, keep: Literal["first", "last", False] = "first"
Expand Down
18 changes: 15 additions & 3 deletions pandas/tests/series/methods/test_rank.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ def expected_dtype(dtype, method, pct=False):
exp_dtype = "double[pyarrow]"
else:
exp_dtype = "uint64[pyarrow]"
elif dtype in ["Float64", "Int64"]:
if method == "average" or pct:
exp_dtype = "Float64"
else:
exp_dtype = "UInt64"

return exp_dtype

Expand Down Expand Up @@ -257,7 +262,7 @@ def test_rank_nullable_integer(self):
exp = Series([None, 2, None, 3, 3, 2, 3, 1], dtype="Int64")
result = exp.rank(na_option="keep")

expected = Series([np.nan, 2.5, np.nan, 5.0, 5.0, 2.5, 5.0, 1.0])
expected = Series([None, 2.5, None, 5.0, 5.0, 2.5, 5.0, 1.0], dtype="Float64")

tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -302,6 +307,12 @@ def test_rank_tie_methods_on_infs_nans(
exp_dtype = "float64[pyarrow]"
else:
exp_dtype = "uint64[pyarrow]"
elif dtype == "Float64":
# GH#62043
if rank_method == "average":
exp_dtype = "Float64"
else:
exp_dtype = "UInt64"
else:
exp_dtype = "float64"

Expand All @@ -327,7 +338,8 @@ def test_rank_tie_methods_on_infs_nans(
result = iseries.rank(
method=rank_method, na_option=na_option, ascending=ascending
)
tm.assert_series_equal(result, Series(expected, dtype=exp_dtype))
exp_ser = Series(expected, dtype=exp_dtype)
tm.assert_series_equal(result, exp_ser)

def test_rank_desc_mix_nans_infs(self):
# GH 19538
Expand Down Expand Up @@ -439,7 +451,7 @@ def test_rank_ea_small_values(self):
dtype="Float64",
)
result = ser.rank(method="min")
expected = Series([4, 1, 3, np.nan, 2])
expected = Series([4, 1, 3, NA, 2], dtype="UInt64")
tm.assert_series_equal(result, expected)


Expand Down
Loading