From 98a70dffd7150c86348804e4ed3a13fad74d1198 Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 12:31:07 -0400 Subject: [PATCH 1/8] using _mgr apply with 2 failing tests --- pandas/core/algorithms.py | 1 - pandas/core/arrays/base.py | 1 + pandas/core/generic.py | 28 ++++++++----------- pandas/tests/frame/methods/test_rank.py | 36 +++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 18 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 533b9b689af0b..9261e361ff392 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1088,7 +1088,6 @@ def rank( ) else: raise TypeError("Array with ndim > 2 are not supported.") - return ranks diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1cd10a9eef9d1..0384515e63770 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2408,6 +2408,7 @@ def _rank( """ See Series.rank.__doc__. """ + if axis != 0: raise NotImplementedError diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 33fcc94f906d5..71be77a08bcf2 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9275,17 +9275,10 @@ def rank( msg = "na_option must be one of 'keep', 'top', or 'bottom'" raise ValueError(msg) - def ranker(data): - if data.ndim == 2: - # i.e. DataFrame, we cast to ndarray - values = data.values - else: - # i.e. Series, can dispatch to EA - values = data._values - - if isinstance(values, ExtensionArray): - ranks = values._rank( - axis=axis_int, + def ranker(blk_values): + if isinstance(blk_values, ExtensionArray) and blk_values.ndim == 1: + ranks = blk_values._rank( + axis=0, method=method, ascending=ascending, na_option=na_option, @@ -9293,16 +9286,14 @@ def ranker(data): ) else: ranks = algos.rank( - values, - axis=axis_int, + blk_values, + axis=1 - axis_int, method=method, ascending=ascending, na_option=na_option, pct=pct, ) - - ranks_obj = self._constructor(ranks, **data._construct_axes_dict()) - return ranks_obj.__finalize__(self, method="rank") + return ranks if numeric_only: if self.ndim == 1 and not is_numeric_dtype(self.dtype): @@ -9315,7 +9306,10 @@ def ranker(data): else: data = self - return ranker(data) + result = data._mgr.apply(ranker) + return self._constructor_from_mgr(result, axes=result.axes).__finalize__( + self, method="rank" + ) @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) def compare( diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 6c6c208ee0c78..ab1c1b3e6c1dd 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -405,6 +405,7 @@ def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): # Shuffle the testing array and expected results in the same way random_order = np.random.default_rng(2).permutation(len(values)) obj = frame_or_series(values[random_order]) + print("TYPE", type(obj)) expected = frame_or_series(exp_order[random_order], dtype="float64") result = obj.rank() tm.assert_equal(result, expected) @@ -498,3 +499,38 @@ def test_rank_string_dtype(self, string_dtype_no_object): exp_dtype = "float64" expected = Series([1, 2, None, 3], dtype=exp_dtype) tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( + "method,og_dtype,expected_dtype", + [ + ("average", "UInt32", "Float64"), + ("average", "Float32", "Float64"), + ("average", "int32[pyarrow]", "double[pyarrow]"), + ("min", "Int32", "Float64"), + ("min", "Float32", "Float64"), + ("min", "int32[pyarrow]", "double[pyarrow]"), + ], + ) + def test_rank_extension_array_dtype(self, method, og_dtype, expected_dtype): + # GH#52829 + result = DataFrame([4, 89, 33], dtype=og_dtype).rank() + if method == "average": + expected = DataFrame([1.0, 3.0, 2.0], dtype=expected_dtype) + else: + expected = DataFrame([1, 3, 2], dtype=expected_dtype) + tm.assert_frame_equal(result, expected) + + def test_rank_mixed_extension_array_dtype(self): + result = DataFrame( + { + "base": Series([4, 5, 6]), + "extension": Series([7, 8, 9], dtype="int32[pyarrow]"), + } + ).rank(method="min") + expected = DataFrame( + { + "base": Series([1.0, 2.0, 3.0], dtype="float64"), + "extension": Series([1, 2, 3], dtype="uint64[pyarrow]"), + } + ) + tm.assert_frame_equal(result, expected) From af9f3de1548a56633e593dd1aba8546a4ced893c Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 12:46:55 -0400 Subject: [PATCH 2/8] transposed blocks to keep axis_int parameter intact --- pandas/core/generic.py | 6 +++--- pandas/tests/frame/methods/test_rank.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 71be77a08bcf2..3a8055818cf36 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9286,13 +9286,13 @@ def ranker(blk_values): ) else: ranks = algos.rank( - blk_values, - axis=1 - axis_int, + blk_values.T, + axis=axis_int, method=method, ascending=ascending, na_option=na_option, pct=pct, - ) + ).T return ranks if numeric_only: diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index ab1c1b3e6c1dd..48fdcb4641624 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -405,7 +405,6 @@ def test_rank_inf_and_nan(self, contents, dtype, frame_or_series): # Shuffle the testing array and expected results in the same way random_order = np.random.default_rng(2).permutation(len(values)) obj = frame_or_series(values[random_order]) - print("TYPE", type(obj)) expected = frame_or_series(exp_order[random_order], dtype="float64") result = obj.rank() tm.assert_equal(result, expected) From 00ef2ea3c4c66f02c9e21c636776e35012d1402b Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 13:03:07 -0400 Subject: [PATCH 3/8] added rst --- doc/source/whatsnew/v3.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 03ad8ed162c95..89a8d23a9934b 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -204,6 +204,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) +- :meth:`DataFrame.rank` now uses internal ``_mgr.apply`` and preserves the dtype for extension arrays (:issue:`52829`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) From f78ffa50c8889f754278b2fc22ce43a1432c14af Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 16:53:21 -0400 Subject: [PATCH 4/8] updated rst --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 89a8d23a9934b..a38996888cf2d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -204,7 +204,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) -- :meth:`DataFrame.rank` now uses internal ``_mgr.apply`` and preserves the dtype for extension arrays (:issue:`52829`) +- :meth:`DataFrame.rank` now preserves the dtype for extension arrays (:issue:`52829`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`) From 178f4e38f3bd825033dbbe33ddf99a97b13c402a Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 22:40:47 -0400 Subject: [PATCH 5/8] dataframe level transpose --- pandas/core/generic.py | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d7c10b466c9a1..170da3b129f7f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9286,14 +9286,24 @@ def ranker(blk_values): pct=pct, ) else: - ranks = algos.rank( - blk_values.T, - axis=axis_int, - method=method, - ascending=ascending, - na_option=na_option, - pct=pct, - ).T + if blk_values.ndim > 1 and axis_int == 0: + ranks = algos.rank( + blk_values.T, + axis=axis_int, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ).T + else: + ranks = algos.rank( + blk_values, + axis=axis_int, + method=method, + ascending=ascending, + na_option=na_option, + pct=pct, + ) return ranks if numeric_only: @@ -9307,10 +9317,16 @@ def ranker(blk_values): else: data = self - result = data._mgr.apply(ranker) - return self._constructor_from_mgr(result, axes=result.axes).__finalize__( - self, method="rank" - ) + should_transpose = data.ndim > 1 and axis_int == 1 + + if should_transpose: + data = data.T + applied = data._mgr.apply(ranker) + result = self._constructor_from_mgr(applied, axes=applied.axes) + if should_transpose: + result = result.T + + return result.__finalize__(self, method="rank") @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"]) def compare( From 94893f0369f4c7d45d3075b823c443290738e83e Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Mon, 25 Aug 2025 22:44:40 -0400 Subject: [PATCH 6/8] removed redundant ndim checks --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 170da3b129f7f..43839c681df29 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9286,7 +9286,7 @@ def ranker(blk_values): pct=pct, ) else: - if blk_values.ndim > 1 and axis_int == 0: + if axis_int == 0: ranks = algos.rank( blk_values.T, axis=axis_int, @@ -9317,7 +9317,7 @@ def ranker(blk_values): else: data = self - should_transpose = data.ndim > 1 and axis_int == 1 + should_transpose = axis_int == 1 if should_transpose: data = data.T From 7ee79ef58c5612b4ae6f5d152119170fe85ffd06 Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Tue, 26 Aug 2025 10:46:42 -0400 Subject: [PATCH 7/8] added pytest skips if no pyarrow module --- pandas/tests/frame/methods/test_rank.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py index 48fdcb4641624..b0b84782dabf5 100644 --- a/pandas/tests/frame/methods/test_rank.py +++ b/pandas/tests/frame/methods/test_rank.py @@ -10,6 +10,7 @@ Infinity, NegInfinity, ) +import pandas.util._test_decorators as td from pandas import ( DataFrame, @@ -504,10 +505,20 @@ def test_rank_string_dtype(self, string_dtype_no_object): [ ("average", "UInt32", "Float64"), ("average", "Float32", "Float64"), - ("average", "int32[pyarrow]", "double[pyarrow]"), + pytest.param( + "average", + "int32[pyarrow]", + "double[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), ("min", "Int32", "Float64"), ("min", "Float32", "Float64"), - ("min", "int32[pyarrow]", "double[pyarrow]"), + pytest.param( + "min", + "int32[pyarrow]", + "double[pyarrow]", + marks=td.skip_if_no("pyarrow"), + ), ], ) def test_rank_extension_array_dtype(self, method, og_dtype, expected_dtype): @@ -520,6 +531,7 @@ def test_rank_extension_array_dtype(self, method, og_dtype, expected_dtype): tm.assert_frame_equal(result, expected) def test_rank_mixed_extension_array_dtype(self): + pytest.importorskip("pyarrow") result = DataFrame( { "base": Series([4, 5, 6]), From 2e7ca27eb246f335bb762a97dea2f7adcd79231a Mon Sep 17 00:00:00 2001 From: sharkipelago Date: Tue, 26 Aug 2025 14:00:33 -0400 Subject: [PATCH 8/8] corrected to dtype_backend --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index a38996888cf2d..a660ac346abbb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -204,7 +204,7 @@ Other enhancements - :meth:`.DataFrameGroupBy.transform`, :meth:`.SeriesGroupBy.transform`, :meth:`.DataFrameGroupBy.agg`, :meth:`.SeriesGroupBy.agg`, :meth:`.SeriesGroupBy.apply`, :meth:`.DataFrameGroupBy.apply` now support ``kurt`` (:issue:`40139`) - :meth:`DataFrame.apply` supports using third-party execution engines like the Bodo.ai JIT compiler (:issue:`60668`) - :meth:`DataFrame.iloc` and :meth:`Series.iloc` now support boolean masks in ``__getitem__`` for more consistent indexing behavior (:issue:`60994`) -- :meth:`DataFrame.rank` now preserves the dtype for extension arrays (:issue:`52829`) +- :meth:`DataFrame.rank` now preserves the ``dtype_backend`` for extension arrays (:issue:`52829`) - :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` now support Python's new-style format strings (e.g., ``"{:.6f}"``) for the ``float_format`` parameter, in addition to old-style ``%`` format strings and callables. This allows for more flexible and modern formatting of floating point numbers when exporting to CSV. (:issue:`49580`) - :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`) - :meth:`Rolling.agg`, :meth:`Expanding.agg` and :meth:`ExponentialMovingWindow.agg` now accept :class:`NamedAgg` aggregations through ``**kwargs`` (:issue:`28333`)