Skip to content

Commit 2c0c5b1

Browse files
Merge remote-tracking branch 'upstream/2.3.x' into string-dtype-2.3.x-downcast-string
2 parents 60ff8f9 + 4c2d6b4 commit 2c0c5b1

File tree

114 files changed

+1058
-710
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

114 files changed

+1058
-710
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ Conversion
107107
Strings
108108
^^^^^^^
109109
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
110+
- Bug in :meth:`Series.replace` with :class:`StringDtype` not upcasting to ``object`` dtype when replacing with a non-string value (:issue:`60282`)
110111
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
111112
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
112113
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the Python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
113-
-
114114

115115
Interval
116116
^^^^^^^^
@@ -119,7 +119,7 @@ Interval
119119

120120
Indexing
121121
^^^^^^^^
122-
-
122+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
123123
-
124124

125125
Missing
@@ -174,7 +174,8 @@ Styler
174174

175175
Other
176176
^^^^^
177-
-
177+
- Fixed usage of ``inspect`` when the optional dependencies ``pyarrow`` or ``jinja2``
178+
are not installed (:issue:`60196`)
178179
-
179180

180181
.. ---------------------------------------------------------------------------

pandas/_libs/index.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
6868
class MaskedUInt8Engine(MaskedIndexEngine): ...
6969
class MaskedBoolEngine(MaskedUInt8Engine): ...
7070

71+
class StringObjectEngine(ObjectEngine):
72+
def __init__(self, values: object, na_value) -> None: ...
73+
7174
class BaseMultiIndexCodesEngine:
7275
levels: list[np.ndarray]
7376
offsets: np.ndarray # ndarray[uint64_t, ndim=1]

pandas/_libs/index.pyx

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine):
532532
return loc
533533

534534

535+
cdef class StringObjectEngine(ObjectEngine):
536+
537+
cdef:
538+
object na_value
539+
bint uses_na
540+
541+
def __init__(self, ndarray values, na_value):
542+
super().__init__(values)
543+
self.na_value = na_value
544+
self.uses_na = na_value is C_NA
545+
546+
cdef bint _checknull(self, object val):
547+
if self.uses_na:
548+
return val is C_NA
549+
else:
550+
return util.is_nan(val)
551+
552+
cdef _check_type(self, object val):
553+
if isinstance(val, str):
554+
return val
555+
elif self._checknull(val):
556+
return self.na_value
557+
else:
558+
raise KeyError(val)
559+
560+
535561
cdef class DatetimeEngine(Int64Engine):
536562

537563
cdef:

pandas/_libs/lib.pyx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2742,7 +2742,13 @@ def maybe_convert_objects(ndarray[object] objects,
27422742
seen.object_ = True
27432743

27442744
elif seen.str_:
2745-
if (
2745+
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
2746+
from pandas.core.arrays.string_ import StringDtype
2747+
2748+
dtype = StringDtype()
2749+
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
2750+
2751+
elif (
27462752
convert_string
27472753
and using_string_dtype()
27482754
and is_string_array(objects, skipna=True)

pandas/_testing/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,8 @@ def shares_memory(left, right) -> bool:
519519
if isinstance(left, MultiIndex):
520520
return shares_memory(left._codes, right)
521521
if isinstance(left, (Index, Series)):
522+
if isinstance(right, (Index, Series)):
523+
return shares_memory(left._values, right._values)
522524
return shares_memory(left._values, right)
523525

524526
if isinstance(left, NDArrayBackedExtensionArray):

pandas/compat/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
pa_version_under14p1,
3434
pa_version_under16p0,
3535
pa_version_under17p0,
36+
pa_version_under18p0,
3637
)
3738

3839
if TYPE_CHECKING:
@@ -191,6 +192,7 @@ def get_bz2_file() -> type[pandas.compat.compressors.BZ2File]:
191192
"pa_version_under14p1",
192193
"pa_version_under16p0",
193194
"pa_version_under17p0",
195+
"pa_version_under18p0",
194196
"HAS_PYARROW",
195197
"IS64",
196198
"ISMUSL",

pandas/compat/pyarrow.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
pa_version_under15p0 = _palv < Version("15.0.0")
1818
pa_version_under16p0 = _palv < Version("16.0.0")
1919
pa_version_under17p0 = _palv < Version("17.0.0")
20+
pa_version_under18p0 = _palv < Version("18.0.0")
2021
HAS_PYARROW = True
2122
except ImportError:
2223
pa_version_under10p1 = True
@@ -28,4 +29,5 @@
2829
pa_version_under15p0 = True
2930
pa_version_under16p0 = True
3031
pa_version_under17p0 = True
32+
pa_version_under18p0 = True  # pyarrow missing: every "under" gate must be True, like the flags above
3133
HAS_PYARROW = False

pandas/conftest.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,7 @@ def multiindex_year_month_day_dataframe_random_data():
548548
"""
549549
tdf = DataFrame(
550550
np.random.default_rng(2).standard_normal((100, 4)),
551-
columns=Index(list("ABCD"), dtype=object),
551+
columns=Index(list("ABCD")),
552552
index=date_range("2000-01-01", periods=100, freq="B"),
553553
)
554554
ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
@@ -743,7 +743,7 @@ def string_series() -> Series:
743743
"""
744744
return Series(
745745
np.arange(30, dtype=np.float64) * 1.1,
746-
index=Index([f"i_{i}" for i in range(30)], dtype=object),
746+
index=Index([f"i_{i}" for i in range(30)]),
747747
name="series",
748748
)
749749

@@ -754,7 +754,7 @@ def object_series() -> Series:
754754
Fixture for Series of dtype object with Index of unique strings
755755
"""
756756
data = [f"foo_{i}" for i in range(30)]
757-
index = Index([f"bar_{i}" for i in range(30)], dtype=object)
757+
index = Index([f"bar_{i}" for i in range(30)])
758758
return Series(data, index=index, name="objects", dtype=object)
759759

760760

@@ -846,8 +846,8 @@ def int_frame() -> DataFrame:
846846
"""
847847
return DataFrame(
848848
np.ones((30, 4), dtype=np.int64),
849-
index=Index([f"foo_{i}" for i in range(30)], dtype=object),
850-
columns=Index(list("ABCD"), dtype=object),
849+
index=Index([f"foo_{i}" for i in range(30)]),
850+
columns=Index(list("ABCD")),
851851
)
852852

853853

pandas/core/arrays/arrow/accessors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def _is_valid_pyarrow_dtype(self, pyarrow_dtype) -> bool:
4646

4747
def _validate(self, data):
4848
dtype = data.dtype
49-
if not isinstance(dtype, ArrowDtype):
49+
if pa_version_under10p1 or not isinstance(dtype, ArrowDtype):
5050
# Raise AttributeError so that inspect can handle non-struct Series.
5151
raise AttributeError(self._validation_msg.format(dtype=dtype))
5252

pandas/core/arrays/arrow/array.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,7 +1134,7 @@ def fillna(
11341134
try:
11351135
fill_value = self._box_pa(value, pa_type=self._pa_array.type)
11361136
except pa.ArrowTypeError as err:
1137-
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
1137+
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
11381138
raise TypeError(msg) from err
11391139

11401140
try:
@@ -1633,7 +1633,11 @@ def _accumulate(
16331633
else:
16341634
data_to_accum = data_to_accum.cast(pa.int64())
16351635

1636-
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1636+
try:
1637+
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1638+
except pa.ArrowNotImplementedError as err:
1639+
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
1640+
raise TypeError(msg) from err
16371641

16381642
if convert_to_int:
16391643
result = result.cast(pa_dtype)
@@ -2126,7 +2130,7 @@ def _maybe_convert_setitem_value(self, value):
21262130
try:
21272131
value = self._box_pa(value, self._pa_array.type)
21282132
except pa.ArrowTypeError as err:
2129-
msg = f"Invalid value '{str(value)}' for dtype {self.dtype}"
2133+
msg = f"Invalid value '{value!s}' for dtype '{self.dtype}'"
21302134
raise TypeError(msg) from err
21312135
return value
21322136

@@ -2285,6 +2289,20 @@ def _groupby_op(
22852289
**kwargs,
22862290
):
22872291
if isinstance(self.dtype, StringDtype):
2292+
if how in [
2293+
"prod",
2294+
"mean",
2295+
"median",
2296+
"cumsum",
2297+
"cumprod",
2298+
"std",
2299+
"sem",
2300+
"var",
2301+
"skew",
2302+
]:
2303+
raise TypeError(
2304+
f"dtype '{self.dtype}' does not support operation '{how}'"
2305+
)
22882306
return super()._groupby_op(
22892307
how=how,
22902308
has_dropped_na=has_dropped_na,

0 commit comments

Comments
 (0)