
Commit 4cb581d

Merge branch 'main' into 60237
2 parents 5ae19ae + 6a7685f


53 files changed: +365 −411 lines (large commit; only a subset of files is shown below)

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion
@@ -84,7 +84,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
         -i "pandas.api.types.is_re_compilable PR07,SA01" \
-        -i "pandas.api.types.pandas_dtype PR07,RT03,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
@@ -118,7 +118,7 @@ Interval

 Indexing
 ^^^^^^^^
--
+- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
 -

 Missing

pandas/core/array_algos/replace.py

Lines changed: 2 additions & 0 deletions
@@ -151,4 +151,6 @@ def re_replacer(s):
     if mask is None:
         values[:] = f(values)
     else:
+        if values.ndim != mask.ndim:
+            mask = np.broadcast_to(mask, values.shape)
         values[mask] = f(values[mask])
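The new guard broadcasts a 1-D mask up to the shape of 2-D values before the masked assignment. A minimal sketch of the NumPy behavior this relies on, with made-up data:

import numpy as np

values = np.array([["foo", "bar"], ["baz", "qux"]], dtype=object)
mask = np.array([True, False])  # 1-D mask paired with 2-D values

# Without broadcasting, values[mask] would select whole rows.
# np.broadcast_to aligns the mask with the last axis and replicates it
# across rows, so boolean indexing selects individual elements.
mask = np.broadcast_to(mask, values.shape)
values[mask] = "replaced"
print(values)  # [['replaced' 'bar'], ['replaced' 'qux']]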

pandas/core/arrays/arrow/array.py

Lines changed: 5 additions & 1 deletion
@@ -1644,7 +1644,11 @@ def _accumulate(
             else:
                 data_to_accum = data_to_accum.cast(pa.int64())

-        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+        try:
+            result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+        except pa.ArrowNotImplementedError as err:
+            msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
+            raise TypeError(msg) from err

         if convert_to_int:
             result = result.cast(pa_dtype)
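With this change, an unsupported accumulation surfaces as a TypeError naming the pandas dtype instead of a raw pyarrow error. A rough illustration of the translation pattern at the pyarrow level, assuming your pyarrow build has no cumulative_sum kernel for strings:

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.chunked_array([["a", "b", "c"]])
try:
    pc.cumulative_sum(arr)  # no string kernel, so Arrow raises
except pa.ArrowNotImplementedError as err:
    # _accumulate now re-raises along these lines:
    raise TypeError("operation 'cumsum' not supported for dtype 'string'") from err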

pandas/core/dtypes/common.py

Lines changed: 7 additions & 1 deletion
@@ -1785,16 +1785,22 @@ def pandas_dtype(dtype) -> DtypeObj:

     Parameters
     ----------
-    dtype : object to be converted
+    dtype : object
+        The object to be converted into a dtype.

     Returns
     -------
     np.dtype or a pandas dtype
+        The converted dtype, which can be either a numpy dtype or a pandas dtype.

     Raises
     ------
     TypeError if not a dtype

+    See Also
+    --------
+    api.types.is_dtype : Return true if the condition is satisfied for the arr_or_dtype.
+
     Examples
     --------
     >>> pd.api.types.pandas_dtype(int)
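For context on the documented function: pandas_dtype accepts both NumPy and pandas dtype specifications. A quick sketch:

import numpy as np
import pandas as pd

print(pd.api.types.pandas_dtype(int))         # dtype('int64') on most platforms
print(pd.api.types.pandas_dtype(np.float32))  # dtype('float32')
print(pd.api.types.pandas_dtype("category"))  # CategoricalDtype(categories=None, ordered=False)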

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
@@ -6556,7 +6556,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
         """
         Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
         """
-        return ensure_index(target)
+        target_index = ensure_index(target)
+        if (
+            not hasattr(target, "dtype")
+            and self.dtype == object
+            and target_index.dtype == "string"
+        ):
+            # If we started with a list-like, avoid inference to string dtype if self
+            # is object dtype (coercing to string dtype will alter the missing values)
+            target_index = Index(target, dtype=self.dtype)
+        return target_index

     @final
     def _validate_indexer(
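A sketch of the behavior targeted by GH 55834, with hypothetical data: when infer_string is enabled, a plain list passed to get_indexer would previously be inferred as string dtype, turning None into NaN and breaking the lookup against an object-dtype index.

import pandas as pd

pd.set_option("future.infer_string", True)

idx = pd.Index(["a", "b", None], dtype=object)
# The list target is now re-wrapped as object dtype to match `idx`,
# so the missing value still matches positionally.
print(idx.get_indexer(["a", None]))  # expected: [0 2]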

pandas/core/interchange/from_dataframe.py

Lines changed: 8 additions & 4 deletions
@@ -9,6 +9,8 @@

 import numpy as np

+from pandas._config import using_string_dtype
+
 from pandas.compat._optional import import_optional_dependency

 import pandas as pd
@@ -147,8 +149,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
     -------
     pd.DataFrame
     """
-    # We need a dict of columns here, with each column being a NumPy array (at
-    # least for now, deal with non-NumPy dtypes later).
     columns: dict[str, Any] = {}
     buffers = []  # hold on to buffers, keeps memory alive
     for name in df.column_names():
@@ -347,8 +347,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
         # Add to our list of strings
         str_list[i] = string

-    # Convert the string list to a NumPy array
-    return np.asarray(str_list, dtype="object"), buffers
+    if using_string_dtype():
+        res = pd.Series(str_list, dtype="str")
+    else:
+        res = np.asarray(str_list, dtype="object")  # type: ignore[assignment]
+
+    return res, buffers  # type: ignore[return-value]


 def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:
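A rough round-trip through the interchange protocol showing where this matters: with the future string option enabled, string columns should now come back as the new string dtype rather than object-dtype NumPy arrays.

import pandas as pd

pd.set_option("future.infer_string", True)

df = pd.DataFrame({"col": ["a", "b", None]})
result = pd.api.interchange.from_dataframe(df.__dataframe__())
print(result["col"].dtype)  # expected: str dtype instead of object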

pandas/core/internals/blocks.py

Lines changed: 7 additions & 0 deletions
@@ -1688,6 +1688,13 @@ def where(self, other, cond) -> list[Block]:
                 if isinstance(self.dtype, (IntervalDtype, StringDtype)):
                     # TestSetitemFloatIntervalWithIntIntervalValues
                     blk = self.coerce_to_target_dtype(orig_other, raise_on_upcast=False)
+                    if (
+                        self.ndim == 2
+                        and isinstance(orig_cond, np.ndarray)
+                        and orig_cond.ndim == 1
+                        and not is_1d_only_ea_dtype(blk.dtype)
+                    ):
+                        orig_cond = orig_cond[:, None]
                     return blk.where(orig_other, orig_cond)

                 elif isinstance(self, NDArrayBackedExtensionBlock):
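The guard aligns a 1-D condition with a 2-D block by adding a trailing axis. A tiny NumPy sketch of the reshape:

import numpy as np

cond = np.array([True, False, True])
# cond[:, None] turns shape (3,) into (3, 1), so the condition
# broadcasts across every column of a 2-D block.
print(cond[:, None].shape)  # (3, 1)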

pandas/core/reshape/concat.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import numpy as np
1818

1919
from pandas._libs import lib
20+
from pandas.util._decorators import set_module
2021
from pandas.util._exceptions import find_stack_level
2122

2223
from pandas.core.dtypes.common import (
@@ -149,6 +150,7 @@ def concat(
149150
) -> DataFrame | Series: ...
150151

151152

153+
@set_module("pandas")
152154
def concat(
153155
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
154156
*,
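set_module rewrites the function's reported module so introspection shows the public import path. A quick check of the effect after this change:

import pandas as pd

# With the decorator applied, __module__ reports the public namespace
# rather than the internal defining module.
print(pd.concat.__module__)  # "pandas", not "pandas.core.reshape.concat"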

pandas/io/_util.py

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING
3+
from typing import (
4+
TYPE_CHECKING,
5+
Literal,
6+
)
47

58
import numpy as np
69

10+
from pandas._config import using_string_dtype
11+
12+
from pandas._libs import lib
713
from pandas.compat import pa_version_under18p0
814
from pandas.compat._optional import import_optional_dependency
915

@@ -12,6 +18,10 @@
1218
if TYPE_CHECKING:
1319
from collections.abc import Callable
1420

21+
import pyarrow
22+
23+
from pandas._typing import DtypeBackend
24+
1525

1626
def _arrow_dtype_mapping() -> dict:
1727
pa = import_optional_dependency("pyarrow")
@@ -33,7 +43,7 @@ def _arrow_dtype_mapping() -> dict:
3343
}
3444

3545

36-
def arrow_string_types_mapper() -> Callable:
46+
def _arrow_string_types_mapper() -> Callable:
3747
pa = import_optional_dependency("pyarrow")
3848

3949
mapping = {
@@ -44,3 +54,31 @@ def arrow_string_types_mapper() -> Callable:
4454
mapping[pa.string_view()] = pd.StringDtype(na_value=np.nan)
4555

4656
return mapping.get
57+
58+
59+
def arrow_table_to_pandas(
60+
table: pyarrow.Table,
61+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
62+
null_to_int64: bool = False,
63+
) -> pd.DataFrame:
64+
pa = import_optional_dependency("pyarrow")
65+
66+
types_mapper: type[pd.ArrowDtype] | None | Callable
67+
if dtype_backend == "numpy_nullable":
68+
mapping = _arrow_dtype_mapping()
69+
if null_to_int64:
70+
# Modify the default mapping to also map null to Int64
71+
# (to match other engines - only for CSV parser)
72+
mapping[pa.null()] = pd.Int64Dtype()
73+
types_mapper = mapping.get
74+
elif dtype_backend == "pyarrow":
75+
types_mapper = pd.ArrowDtype
76+
elif using_string_dtype():
77+
types_mapper = _arrow_string_types_mapper()
78+
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
79+
types_mapper = None
80+
else:
81+
raise NotImplementedError
82+
83+
df = table.to_pandas(types_mapper=types_mapper)
84+
return df
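The new helper centralizes the types_mapper selection that the IO readers previously duplicated. A usage sketch; this is an internal API, so the import path and signature may change:

import pyarrow as pa

from pandas.io._util import arrow_table_to_pandas

table = pa.table({"a": [1, None, 3]})

# Default backend: NumPy semantics, so the null becomes NaN in float64.
print(arrow_table_to_pandas(table)["a"].dtype)  # float64
# Nullable extension dtypes: the column comes back as Int64 with pd.NA.
print(arrow_table_to_pandas(table, dtype_backend="numpy_nullable")["a"].dtype)  # Int64
# Arrow-backed dtypes throughout.
print(arrow_table_to_pandas(table, dtype_backend="pyarrow")["a"].dtype)  # int64[pyarrow]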
