Skip to content

Commit dd2cdc6

Browse files
Merge remote-tracking branch 'upstream/2.3.x' into backport-60295
2 parents 362d8f7 + eb22bf8 commit dd2cdc6

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+584
-347
lines changed

.circleci/config.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ jobs:
1515
- checkout
1616
- run: .circleci/setup_env.sh
1717
- run: |
18-
sudo apt-get update && sudo apt-get install -y libegl1 libopengl0
1918
PATH=$HOME/miniconda3/envs/pandas-dev/bin:$HOME/miniconda3/condabin:$PATH \
2019
LD_PRELOAD=$HOME/miniconda3/envs/pandas-dev/lib/libgomp.so.1:$LD_PRELOAD \
2120
ci/run_tests.sh

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ Interval
119119

120120
Indexing
121121
^^^^^^^^
122-
-
122+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
123123
-
124124

125125
Missing

pandas/_libs/index.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
6868
class MaskedUInt8Engine(MaskedIndexEngine): ...
6969
class MaskedBoolEngine(MaskedUInt8Engine): ...
7070

71+
class StringObjectEngine(ObjectEngine):
72+
def __init__(self, values: object, na_value) -> None: ...
73+
7174
class BaseMultiIndexCodesEngine:
7275
levels: list[np.ndarray]
7376
offsets: np.ndarray # ndarray[uint64_t, ndim=1]

pandas/_libs/index.pyx

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -532,6 +532,32 @@ cdef class ObjectEngine(IndexEngine):
532532
return loc
533533

534534

535+
cdef class StringObjectEngine(ObjectEngine):
536+
537+
cdef:
538+
object na_value
539+
bint uses_na
540+
541+
def __init__(self, ndarray values, na_value):
542+
super().__init__(values)
543+
self.na_value = na_value
544+
self.uses_na = na_value is C_NA
545+
546+
cdef bint _checknull(self, object val):
547+
if self.uses_na:
548+
return val is C_NA
549+
else:
550+
return util.is_nan(val)
551+
552+
cdef _check_type(self, object val):
553+
if isinstance(val, str):
554+
return val
555+
elif self._checknull(val):
556+
return self.na_value
557+
else:
558+
raise KeyError(val)
559+
560+
535561
cdef class DatetimeEngine(Int64Engine):
536562

537563
cdef:

pandas/_libs/lib.pyi

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def maybe_convert_objects(
8686
safe: bool = ...,
8787
convert_numeric: bool = ...,
8888
convert_non_numeric: Literal[False] = ...,
89+
convert_string: Literal[False] = ...,
8990
convert_to_nullable_dtype: Literal[False] = ...,
9091
dtype_if_all_nat: DtypeObj | None = ...,
9192
) -> npt.NDArray[np.object_ | np.number]: ...
@@ -97,6 +98,7 @@ def maybe_convert_objects(
9798
safe: bool = ...,
9899
convert_numeric: bool = ...,
99100
convert_non_numeric: bool = ...,
101+
convert_string: bool = ...,
100102
convert_to_nullable_dtype: Literal[True] = ...,
101103
dtype_if_all_nat: DtypeObj | None = ...,
102104
) -> ArrayLike: ...
@@ -108,6 +110,7 @@ def maybe_convert_objects(
108110
safe: bool = ...,
109111
convert_numeric: bool = ...,
110112
convert_non_numeric: bool = ...,
113+
convert_string: bool = ...,
111114
convert_to_nullable_dtype: bool = ...,
112115
dtype_if_all_nat: DtypeObj | None = ...,
113116
) -> ArrayLike: ...

pandas/_libs/lib.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2498,6 +2498,7 @@ def maybe_convert_objects(ndarray[object] objects,
24982498
bint convert_numeric=True, # NB: different default!
24992499
bint convert_to_nullable_dtype=False,
25002500
bint convert_non_numeric=False,
2501+
bint convert_string=True,
25012502
object dtype_if_all_nat=None) -> "ArrayLike":
25022503
"""
25032504
Type inference function-- convert object array to proper dtype
@@ -2747,7 +2748,11 @@ def maybe_convert_objects(ndarray[object] objects,
27472748
dtype = StringDtype()
27482749
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
27492750

2750-
elif using_string_dtype() and is_string_array(objects, skipna=True):
2751+
elif (
2752+
convert_string
2753+
and using_string_dtype()
2754+
and is_string_array(objects, skipna=True)
2755+
):
27512756
from pandas.core.arrays.string_ import StringDtype
27522757

27532758
dtype = StringDtype(na_value=np.nan)

pandas/core/arrays/arrow/array.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,11 @@ def _accumulate(
16331633
else:
16341634
data_to_accum = data_to_accum.cast(pa.int64())
16351635

1636-
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1636+
try:
1637+
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1638+
except pa.ArrowNotImplementedError as err:
1639+
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
1640+
raise TypeError(msg) from err
16371641

16381642
if convert_to_int:
16391643
result = result.cast(pa_dtype)

pandas/core/dtypes/cast.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@
8787

8888
if TYPE_CHECKING:
8989
from collections.abc import (
90+
Collection,
9091
Sequence,
91-
Sized,
9292
)
9393

9494
from pandas._typing import (
@@ -1586,7 +1586,7 @@ def _maybe_box_and_unbox_datetimelike(value: Scalar, dtype: DtypeObj):
15861586
return _maybe_unbox_datetimelike(value, dtype)
15871587

15881588

1589-
def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
1589+
def construct_1d_object_array_from_listlike(values: Collection) -> np.ndarray:
15901590
"""
15911591
Transform any list-like object in a 1-dimensional numpy array of object
15921592
dtype.
@@ -1604,10 +1604,11 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray:
16041604
-------
16051605
1-dimensional numpy array of dtype object
16061606
"""
1607-
# numpy will try to interpret nested lists as further dimensions, hence
1608-
# making a 1D array that contains list-likes is a bit tricky:
1607+
# numpy will try to interpret nested lists as further dimensions in np.array(),
1608+
# hence explicitly build the 1D object array by pre-allocating it and assigning element by element
16091609
result = np.empty(len(values), dtype="object")
1610-
result[:] = values
1610+
for i, obj in enumerate(values):
1611+
result[i] = obj
16111612
return result
16121613

16131614

pandas/core/indexes/base.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,8 @@ def _engine(
884884
# error: Item "ExtensionArray" of "Union[ExtensionArray,
885885
# ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr]
886886
target_values = self._data._ndarray # type: ignore[union-attr]
887+
elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
888+
return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr]
887889

888890
# error: Argument 1 to "ExtensionEngine" has incompatible type
889891
# "ndarray[Any, Any]"; expected "ExtensionArray"
@@ -6133,7 +6135,6 @@ def _should_fallback_to_positional(self) -> bool:
61336135
def get_indexer_non_unique(
61346136
self, target
61356137
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
6136-
target = ensure_index(target)
61376138
target = self._maybe_cast_listlike_indexer(target)
61386139

61396140
if not self._should_compare(target) and not self._should_partial_index(target):
@@ -6695,7 +6696,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
66956696
"""
66966697
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
66976698
"""
6698-
return ensure_index(target)
6699+
target_index = ensure_index(target)
6700+
if (
6701+
not hasattr(target, "dtype")
6702+
and self.dtype == object
6703+
and target_index.dtype == "string"
6704+
):
6705+
# If we started with a list-like, avoid inference to string dtype if self
6706+
# is object dtype (coercing to string dtype will alter the missing values)
6707+
target_index = Index(target, dtype=self.dtype)
6708+
return target_index
66996709

67006710
@final
67016711
def _validate_indexer(

pandas/core/interchange/from_dataframe.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import numpy as np
88

9+
from pandas._config import using_string_dtype
10+
911
from pandas.compat._optional import import_optional_dependency
1012
from pandas.errors import SettingWithCopyError
1113

@@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
124126
-------
125127
pd.DataFrame
126128
"""
127-
# We need a dict of columns here, with each column being a NumPy array (at
128-
# least for now, deal with non-NumPy dtypes later).
129129
columns: dict[str, Any] = {}
130130
buffers = [] # hold on to buffers, keeps memory alive
131131
for name in df.column_names():
@@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
324324
# Add to our list of strings
325325
str_list[i] = string
326326

327-
# Convert the string list to a NumPy array
328-
return np.asarray(str_list, dtype="object"), buffers
327+
if using_string_dtype():
328+
res = pd.Series(str_list, dtype="str")
329+
else:
330+
res = np.asarray(str_list, dtype="object") # type: ignore[assignment]
331+
332+
return res, buffers # type: ignore[return-value]
329333

330334

331335
def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:

0 commit comments

Comments
 (0)