Skip to content

Commit f42fffd

Browse files
Merge remote-tracking branch 'upstream/main' into fix-interchange-strings
2 parents ad45ebc + 34c080c commit f42fffd

File tree

8 files changed

+62
-17
lines changed

8 files changed

+62
-17
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ Interval
118118

119119
Indexing
120120
^^^^^^^^
121-
-
121+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
122122
-
123123

124124
Missing

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6556,7 +6556,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
65566556
"""
65576557
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
65586558
"""
6559-
return ensure_index(target)
6559+
target_index = ensure_index(target)
6560+
if (
6561+
not hasattr(target, "dtype")
6562+
and self.dtype == object
6563+
and target_index.dtype == "string"
6564+
):
6565+
# If we started with a list-like, avoid inference to string dtype if self
6566+
# is object dtype (coercing to string dtype will alter the missing values)
6567+
target_index = Index(target, dtype=self.dtype)
6568+
return target_index
65606569

65616570
@final
65626571
def _validate_indexer(

pandas/core/reshape/concat.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import numpy as np
1818

1919
from pandas._libs import lib
20+
from pandas.util._decorators import set_module
2021
from pandas.util._exceptions import find_stack_level
2122

2223
from pandas.core.dtypes.common import (
@@ -149,6 +150,7 @@ def concat(
149150
) -> DataFrame | Series: ...
150151

151152

153+
@set_module("pandas")
152154
def concat(
153155
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
154156
*,

pandas/tests/api/test_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ def test_set_module():
417417
assert pd.Period.__module__ == "pandas"
418418
assert pd.Timestamp.__module__ == "pandas"
419419
assert pd.Timedelta.__module__ == "pandas"
420+
assert pd.concat.__module__ == "pandas"
420421
assert pd.isna.__module__ == "pandas"
421422
assert pd.notna.__module__ == "pandas"
422423
assert pd.merge.__module__ == "pandas"

pandas/tests/indexes/object/test_indexing.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values(
6262
expected = np.array([0, 1, -1], dtype=np.intp)
6363
tm.assert_numpy_array_equal(result, expected)
6464

65+
def test_get_indexer_infer_string_missing_values(self):
66+
# ensure the passed list is not cast to string but to object so that
67+
# the None value is matched in the index
68+
# https://github.com/pandas-dev/pandas/issues/55834
69+
idx = Index(["a", "b", None], dtype="object")
70+
result = idx.get_indexer([None, "x"])
71+
expected = np.array([2, -1], dtype=np.intp)
72+
tm.assert_numpy_array_equal(result, expected)
73+
6574

6675
class TestGetIndexerNonUnique:
6776
def test_get_indexer_non_unique_nas(self, nulls_fixture):

pandas/tests/io/test_fsspec.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from pandas._config import using_string_dtype
77

8+
from pandas.compat import HAS_PYARROW
9+
810
from pandas import (
911
DataFrame,
1012
date_range,
@@ -176,7 +178,9 @@ def test_excel_options(fsspectest):
176178
assert fsspectest.test[0] == "read"
177179

178180

179-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
181+
@pytest.mark.xfail(
182+
using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet"
183+
)
180184
def test_to_parquet_new_file(cleared_fs, df1):
181185
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
182186
pytest.importorskip("fastparquet")

pandas/tests/io/test_gcs.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat.pyarrow import pa_version_under17p0
1311

1412
from pandas import (
@@ -207,7 +205,6 @@ def test_to_csv_compression_encoding_gcs(
207205
tm.assert_frame_equal(df, read_df)
208206

209207

210-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
211208
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
212209
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
213210
pytest.importorskip("fastparquet")

pandas/tests/io/test_parquet.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,9 +1174,17 @@ def test_non_nanosecond_timestamps(self, temp_file):
11741174

11751175

11761176
class TestParquetFastParquet(Base):
1177-
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
1178-
def test_basic(self, fp, df_full):
1177+
def test_basic(self, fp, df_full, request):
11791178
pytz = pytest.importorskip("pytz")
1179+
import fastparquet
1180+
1181+
if Version(fastparquet.__version__) < Version("2024.11.0"):
1182+
request.applymarker(
1183+
pytest.mark.xfail(
1184+
reason=("datetime_with_nat gets incorrect values"),
1185+
)
1186+
)
1187+
11801188
tz = pytz.timezone("US/Eastern")
11811189
df = df_full
11821190

@@ -1213,11 +1221,17 @@ def test_duplicate_columns(self, fp):
12131221
msg = "Cannot create parquet dataset with duplicate column names"
12141222
self.check_error_on_write(df, fp, ValueError, msg)
12151223

1216-
@pytest.mark.xfail(
1217-
Version(np.__version__) >= Version("2.0.0"),
1218-
reason="fastparquet uses np.float_ in numpy2",
1219-
)
1220-
def test_bool_with_none(self, fp):
1224+
def test_bool_with_none(self, fp, request):
1225+
import fastparquet
1226+
1227+
if Version(fastparquet.__version__) < Version("2024.11.0") and Version(
1228+
np.__version__
1229+
) >= Version("2.0.0"):
1230+
request.applymarker(
1231+
pytest.mark.xfail(
1232+
reason=("fastparquet uses np.float_ in numpy2"),
1233+
)
1234+
)
12211235
df = pd.DataFrame({"a": [True, None, False]})
12221236
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
12231237
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
@@ -1331,10 +1345,19 @@ def test_empty_dataframe(self, fp):
13311345
expected = df.copy()
13321346
check_round_trip(df, fp, expected=expected)
13331347

1334-
@pytest.mark.xfail(
1335-
reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929"
1336-
)
1337-
def test_timezone_aware_index(self, fp, timezone_aware_date_list):
1348+
def test_timezone_aware_index(self, fp, timezone_aware_date_list, request):
1349+
import fastparquet
1350+
1351+
if Version(fastparquet.__version__) < Version("2024.11.0"):
1352+
request.applymarker(
1353+
pytest.mark.xfail(
1354+
reason=(
1355+
"fastparquet bug, see "
1356+
"https://github.com/dask/fastparquet/issues/929"
1357+
),
1358+
)
1359+
)
1360+
13381361
idx = 5 * [timezone_aware_date_list]
13391362

13401363
df = pd.DataFrame(index=idx, data={"index_as_col": idx})

0 commit comments

Comments (0)