BUG: rank with object dtype and small values #62036 (#62227)

TeeVee99 · Tejaswini V · iabhi4 · commit e5ef6dec0524 · 2025-09-12T18:21:27.000-07:00
Co-authored-by: Tejaswini V <tejaswiniv@Tejaswinis-MacBook-Pro.local>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -950,8 +950,8 @@ Conversion
 
 Strings
 ^^^^^^^
+- Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for ``ArrowDtype(pa.string())``; it is now supported via a ``_str_zfill`` implementation in ``ArrowExtensionArray`` (:issue:`61485`)
 - Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
--
 
 Interval
 ^^^^^^^^
@@ -1158,6 +1158,7 @@ Other
 - Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
 - Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
 - Deprecated the keyword ``check_datetimelike_compat`` in :meth:`testing.assert_frame_equal` and :meth:`testing.assert_series_equal` (:issue:`55638`)
+- Fixed bug in :meth:`Series.rank` with object dtype where distinct, extremely small float values were incorrectly treated as ties (:issue:`62036`)
 - Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
 - Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
 -
diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx
@@ -1,7 +1,6 @@
 cimport cython
 from cython cimport Py_ssize_t
 from libc.math cimport (
-    fabs,
     sqrt,
 )
 from libc.stdlib cimport (
@@ -72,13 +71,6 @@ tiebreakers = {
 }
 
 
-cdef bint are_diff(object left, object right):
-    try:
-        return fabs(left - right) > FP_ERR
-    except TypeError:
-        return left != right
-
-
 class Infinity:
     """
     Provide a positive Infinity comparison method for ranking.
@@ -1135,12 +1127,8 @@ cdef void rank_sorted_1d(
             dups += 1
             sum_ranks += i - grp_start + 1
 
-            if numeric_object_t is object:
-                next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
-                                                   masked_vals[sort_indexer[i+1]])
-            else:
-                next_val_diff = at_end or (masked_vals[sort_indexer[i]]
-                                           != masked_vals[sort_indexer[i+1]])
+            next_val_diff = at_end or (masked_vals[sort_indexer[i]]
+                                       != masked_vals[sort_indexer[i+1]])
 
             # We'll need this check later anyway to determine group size, so just
             # compute it here since shortcircuiting won't help
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2716,6 +2716,13 @@ def _str_wrap(self, width: int, **kwargs) -> Self:
         result = self._apply_elementwise(predicate)
         return self._from_pyarrow_array(pa.chunked_array(result))
 
+    def _str_zfill(self, width: int) -> Self:
+        """Apply ``str.zfill(width)`` to each element via ``_apply_elementwise``."""
+        # TODO: Replace with pc.utf8_zfill when supported by arrow
+        # Arrow ENH - https://github.com/apache/arrow/issues/46683
+        result = self._apply_elementwise(lambda val: val.zfill(width))
+        return self._from_pyarrow_array(pa.chunked_array(result))
+
     @property
     def _dt_days(self) -> Self:
         return self._from_pyarrow_array(
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -1128,3 +1128,6 @@ def _cmp_method(self, other, op):
             return res_arr
 
     _arith_method = _cmp_method
+
+    def _str_zfill(self, width: int) -> Self:
+        return self._str_map(lambda x: x.zfill(width))
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1912,8 +1912,8 @@ def zfill(self, width: int):
         if not is_integer(width):
             msg = f"width must be of integer type, not {type(width).__name__}"
             raise TypeError(msg)
-        f = lambda x: x.zfill(width)
-        result = self._data.array._str_map(f)
+
+        result = self._data.array._str_zfill(width)
         return self._wrap_result(result)
 
     def slice(self, start=None, stop=None, step=None):
diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py
@@ -544,3 +544,6 @@ def f(x):
                 return empty_row
 
         return [f(val) for val in np.asarray(self)]
+
+    def _str_zfill(self, width: int):
+        return self._str_map(lambda x: x.zfill(width))
diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
@@ -110,3 +110,19 @@ def test_string_array_extract(nullable_string_dtype):
 
     result = result.astype(object)
     tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "values, width, expected",
+    [
+        (["a", "ab", "abc", None], 4, ["000a", "00ab", "0abc", None]),
+        (["1", "-1", "+1", None], 4, ["0001", "-001", "+001", None]),
+        (["1234", "-1234"], 3, ["1234", "-1234"]),
+    ],
+)
+def test_string_array_zfill(nullable_string_dtype, values, width, expected):
+    # GH #61485
+    s = Series(values, dtype=nullable_string_dtype)
+    result = s.str.zfill(width)
+    expected = Series(expected, dtype=nullable_string_dtype)
+    tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -1819,6 +1819,18 @@ def test_uint64_overflow(self, dtype):
         s = Series([1, 2**63], dtype=dtype)
         tm.assert_numpy_array_equal(algos.rank(s), exp)
 
+    @pytest.mark.parametrize("method", ["average", "min", "max"])
+    def test_rank_tiny_values(self, method):
+        # GH62036: regression test for ranking with tiny float values
+        exp = np.array([4.0, 1.0, 3.0, np.nan, 2.0], dtype=np.float64)
+        s = Series(
+            [5.4954145e29, -9.791984e-21, 9.3715776e-26, pd.NA, 1.8790257e-28],
+            dtype="Float64",
+        )
+        s = s.astype(object)
+        result = algos.rank(s, method=method)
+        tm.assert_numpy_array_equal(result, exp)
+
     def test_too_many_ndims(self):
         arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
         msg = "Array with ndim > 2 are not supported"