Skip to content

Commit e5ef6de

Browse files
TeeVee99 (Tejaswini V)
authored and committed
BUG: rank with object dtype and small values #62036 (#62227)
Co-authored-by: Tejaswini V <[email protected]>
1 parent 4afee55 commit e5ef6de

File tree

8 files changed: +47 additions, -17 deletions

doc/source/whatsnew/v3.0.0.rst

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -950,8 +950,8 @@ Conversion
950950

951951
Strings
952952
^^^^^^^
953+
- Bug in :meth:`Series.str.zfill` raising ``AttributeError`` for ``ArrowDtype(pa.string())``. Now supported via ``_str_zfill`` implementation in ``ArrowExtensionArray`` (:issue:`61485`)
953954
- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
954-
-
955955

956956
Interval
957957
^^^^^^^^
@@ -1158,6 +1158,7 @@ Other
11581158
- Bug in printing a :class:`DataFrame` with a :class:`DataFrame` stored in :attr:`DataFrame.attrs` raised a ``ValueError`` (:issue:`60455`)
11591159
- Bug in printing a :class:`Series` with a :class:`DataFrame` stored in :attr:`Series.attrs` raised a ``ValueError`` (:issue:`60568`)
11601160
- Deprecated the keyword ``check_datetimelike_compat`` in :meth:`testing.assert_frame_equal` and :meth:`testing.assert_series_equal` (:issue:`55638`)
1161+
- Fixed bug in :meth:`Series.rank` with object dtype where extremely small float values were incorrectly treated as ties (:issue:`62036`)
11611162
- Fixed bug where the :class:`DataFrame` constructor misclassified array-like objects with a ``.name`` attribute as :class:`Series` or :class:`Index` (:issue:`61443`)
11621163
- Fixed regression in :meth:`DataFrame.from_records` not initializing subclasses properly (:issue:`57008`)
11631164
-

pandas/_libs/algos.pyx

Lines changed: 2 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
cimport cython
22
from cython cimport Py_ssize_t
33
from libc.math cimport (
4-
fabs,
54
sqrt,
65
)
76
from libc.stdlib cimport (
@@ -72,13 +71,6 @@ tiebreakers = {
7271
}
7372

7473

75-
cdef bint are_diff(object left, object right):
76-
try:
77-
return fabs(left - right) > FP_ERR
78-
except TypeError:
79-
return left != right
80-
81-
8274
class Infinity:
8375
"""
8476
Provide a positive Infinity comparison method for ranking.
@@ -1135,12 +1127,8 @@ cdef void rank_sorted_1d(
11351127
dups += 1
11361128
sum_ranks += i - grp_start + 1
11371129

1138-
if numeric_object_t is object:
1139-
next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]],
1140-
masked_vals[sort_indexer[i+1]])
1141-
else:
1142-
next_val_diff = at_end or (masked_vals[sort_indexer[i]]
1143-
!= masked_vals[sort_indexer[i+1]])
1130+
next_val_diff = at_end or (masked_vals[sort_indexer[i]]
1131+
!= masked_vals[sort_indexer[i+1]])
11441132

11451133
# We'll need this check later anyway to determine group size, so just
11461134
# compute it here since shortcircuiting won't help

pandas/core/arrays/arrow/array.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2716,6 +2716,13 @@ def _str_wrap(self, width: int, **kwargs) -> Self:
27162716
result = self._apply_elementwise(predicate)
27172717
return self._from_pyarrow_array(pa.chunked_array(result))
27182718

2719+
def _str_zfill(self, width: int) -> Self:
2720+
# TODO: Replace with pc.utf8_zfill when supported by arrow
2721+
# Arrow ENH - https://github.com/apache/arrow/issues/46683
2722+
predicate = lambda val: val.zfill(width)
2723+
result = self._apply_elementwise(predicate)
2724+
return type(self)(pa.chunked_array(result))
2725+
27192726
@property
27202727
def _dt_days(self) -> Self:
27212728
return self._from_pyarrow_array(

pandas/core/arrays/string_.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1128,3 +1128,6 @@ def _cmp_method(self, other, op):
11281128
return res_arr
11291129

11301130
_arith_method = _cmp_method
1131+
1132+
def _str_zfill(self, width: int) -> Self:
1133+
return self._str_map(lambda x: x.zfill(width))

pandas/core/strings/accessor.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1912,8 +1912,8 @@ def zfill(self, width: int):
19121912
if not is_integer(width):
19131913
msg = f"width must be of integer type, not {type(width).__name__}"
19141914
raise TypeError(msg)
1915-
f = lambda x: x.zfill(width)
1916-
result = self._data.array._str_map(f)
1915+
1916+
result = self._data.array._str_zfill(width)
19171917
return self._wrap_result(result)
19181918

19191919
def slice(self, start=None, stop=None, step=None):

pandas/core/strings/object_array.py

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -544,3 +544,6 @@ def f(x):
544544
return empty_row
545545

546546
return [f(val) for val in np.asarray(self)]
547+
548+
def _str_zfill(self, width: int):
549+
return self._str_map(lambda x: x.zfill(width))

pandas/tests/strings/test_string_array.py

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -110,3 +110,19 @@ def test_string_array_extract(nullable_string_dtype):
110110

111111
result = result.astype(object)
112112
tm.assert_equal(result, expected)
113+
114+
115+
@pytest.mark.parametrize(
116+
"values, width, expected",
117+
[
118+
(["a", "ab", "abc", None], 4, ["000a", "00ab", "0abc", None]),
119+
(["1", "-1", "+1", None], 4, ["0001", "-001", "+001", None]),
120+
(["1234", "-1234"], 3, ["1234", "-1234"]),
121+
],
122+
)
123+
def test_string_array_zfill(nullable_string_dtype, values, width, expected):
124+
# GH #61485
125+
s = Series(values, dtype=nullable_string_dtype)
126+
result = s.str.zfill(width)
127+
expected = Series(expected, dtype=nullable_string_dtype)
128+
tm.assert_series_equal(result, expected)

pandas/tests/test_algos.py

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1819,6 +1819,18 @@ def test_uint64_overflow(self, dtype):
18191819
s = Series([1, 2**63], dtype=dtype)
18201820
tm.assert_numpy_array_equal(algos.rank(s), exp)
18211821

1822+
@pytest.mark.parametrize("method", ["average", "min", "max"])
1823+
def test_rank_tiny_values(self, method):
1824+
# GH62036: regression test for ranking with tiny float values
1825+
exp = np.array([4.0, 1.0, 3.0, np.nan, 2.0], dtype=np.float64)
1826+
s = Series(
1827+
[5.4954145e29, -9.791984e-21, 9.3715776e-26, pd.NA, 1.8790257e-28],
1828+
dtype="Float64",
1829+
)
1830+
s = s.astype(object)
1831+
result = algos.rank(s, method=method)
1832+
tm.assert_numpy_array_equal(result, exp)
1833+
18221834
def test_too_many_ndims(self):
18231835
arr = np.array([[[1, 2, 3], [4, 5, 6], [7, 8, 9]]])
18241836
msg = "Array with ndim > 2 are not supported"

0 commit comments

Comments
 (0)