Skip to content

Commit f42fffd

Browse files
Merge remote-tracking branch 'upstream/main' into fix-interchange-strings
2 parents ad45ebc + 34c080c commit f42fffd

File tree

8 files changed

+62
-17
lines changed

8 files changed

+62
-17
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ Interval
118118

119119
Indexing
120120
^^^^^^^^
121-
-
121+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
122122
-
123123

124124
Missing

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6556,7 +6556,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
65566556
"""
65576557
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
65586558
"""
6559-
return ensure_index(target)
6559+
target_index = ensure_index(target)
6560+
if (
6561+
not hasattr(target, "dtype")
6562+
and self.dtype == object
6563+
and target_index.dtype == "string"
6564+
):
6565+
# If we started with a list-like, avoid inference to string dtype if self
6566+
# is object dtype (coercing to string dtype will alter the missing values)
6567+
target_index = Index(target, dtype=self.dtype)
6568+
return target_index
65606569

65616570
@final
65626571
def _validate_indexer(

pandas/core/reshape/concat.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import numpy as np
1818

1919
from pandas._libs import lib
20+
from pandas.util._decorators import set_module
2021
from pandas.util._exceptions import find_stack_level
2122

2223
from pandas.core.dtypes.common import (
@@ -149,6 +150,7 @@ def concat(
149150
) -> DataFrame | Series: ...
150151

151152

153+
@set_module("pandas")
152154
def concat(
153155
objs: Iterable[Series | DataFrame] | Mapping[HashableT, Series | DataFrame],
154156
*,

pandas/tests/api/test_api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,7 @@ def test_set_module():
417417
assert pd.Period.__module__ == "pandas"
418418
assert pd.Timestamp.__module__ == "pandas"
419419
assert pd.Timedelta.__module__ == "pandas"
420+
assert pd.concat.__module__ == "pandas"
420421
assert pd.isna.__module__ == "pandas"
421422
assert pd.notna.__module__ == "pandas"
422423
assert pd.merge.__module__ == "pandas"

pandas/tests/indexes/object/test_indexing.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,15 @@ def test_get_indexer_with_NA_values(
6262
expected = np.array([0, 1, -1], dtype=np.intp)
6363
tm.assert_numpy_array_equal(result, expected)
6464

65+
def test_get_indexer_infer_string_missing_values(self):
66+
# ensure the passed list is not cast to string but to object so that
67+
# the None value is matched in the index
68+
# https://github.com/pandas-dev/pandas/issues/55834
69+
idx = Index(["a", "b", None], dtype="object")
70+
result = idx.get_indexer([None, "x"])
71+
expected = np.array([2, -1], dtype=np.intp)
72+
tm.assert_numpy_array_equal(result, expected)
73+
6574

6675
class TestGetIndexerNonUnique:
6776
def test_get_indexer_non_unique_nas(self, nulls_fixture):

pandas/tests/io/test_fsspec.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
from pandas._config import using_string_dtype
77

8+
from pandas.compat import HAS_PYARROW
9+
810
from pandas import (
911
DataFrame,
1012
date_range,
@@ -176,7 +178,9 @@ def test_excel_options(fsspectest):
176178
assert fsspectest.test[0] == "read"
177179

178180

179-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
181+
@pytest.mark.xfail(
182+
using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string) fastparquet"
183+
)
180184
def test_to_parquet_new_file(cleared_fs, df1):
181185
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
182186
pytest.importorskip("fastparquet")

pandas/tests/io/test_gcs.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import numpy as np
88
import pytest
99

10-
from pandas._config import using_string_dtype
11-
1210
from pandas.compat.pyarrow import pa_version_under17p0
1311

1412
from pandas import (
@@ -207,7 +205,6 @@ def test_to_csv_compression_encoding_gcs(
207205
tm.assert_frame_equal(df, read_df)
208206

209207

210-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) fastparquet")
211208
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
212209
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
213210
pytest.importorskip("fastparquet")

pandas/tests/io/test_parquet.py

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,9 +1174,17 @@ def test_non_nanosecond_timestamps(self, temp_file):
11741174

11751175

11761176
class TestParquetFastParquet(Base):
1177-
@pytest.mark.xfail(reason="datetime_with_nat gets incorrect values")
1178-
def test_basic(self, fp, df_full):
1177+
def test_basic(self, fp, df_full, request):
11791178
pytz = pytest.importorskip("pytz")
1179+
import fastparquet
1180+
1181+
if Version(fastparquet.__version__) < Version("2024.11.0"):
1182+
request.applymarker(
1183+
pytest.mark.xfail(
1184+
reason=("datetime_with_nat gets incorrect values"),
1185+
)
1186+
)
1187+
11801188
tz = pytz.timezone("US/Eastern")
11811189
df = df_full
11821190

@@ -1213,11 +1221,17 @@ def test_duplicate_columns(self, fp):
12131221
msg = "Cannot create parquet dataset with duplicate column names"
12141222
self.check_error_on_write(df, fp, ValueError, msg)
12151223

1216-
@pytest.mark.xfail(
1217-
Version(np.__version__) >= Version("2.0.0"),
1218-
reason="fastparquet uses np.float_ in numpy2",
1219-
)
1220-
def test_bool_with_none(self, fp):
1224+
def test_bool_with_none(self, fp, request):
1225+
import fastparquet
1226+
1227+
if Version(fastparquet.__version__) < Version("2024.11.0") and Version(
1228+
np.__version__
1229+
) >= Version("2.0.0"):
1230+
request.applymarker(
1231+
pytest.mark.xfail(
1232+
reason=("fastparquet uses np.float_ in numpy2"),
1233+
)
1234+
)
12211235
df = pd.DataFrame({"a": [True, None, False]})
12221236
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
12231237
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
@@ -1331,10 +1345,19 @@ def test_empty_dataframe(self, fp):
13311345
expected = df.copy()
13321346
check_round_trip(df, fp, expected=expected)
13331347

1334-
@pytest.mark.xfail(
1335-
reason="fastparquet bug, see https://github.com/dask/fastparquet/issues/929"
1336-
)
1337-
def test_timezone_aware_index(self, fp, timezone_aware_date_list):
1348+
def test_timezone_aware_index(self, fp, timezone_aware_date_list, request):
1349+
import fastparquet
1350+
1351+
if Version(fastparquet.__version__) < Version("2024.11.0"):
1352+
request.applymarker(
1353+
pytest.mark.xfail(
1354+
reason=(
1355+
"fastparquet bug, see "
1356+
"https://github.com/dask/fastparquet/issues/929"
1357+
),
1358+
)
1359+
)
1360+
13381361
idx = 5 * [timezone_aware_date_list]
13391362

13401363
df = pd.DataFrame(index=idx, data={"index_as_col": idx})

0 commit comments

Comments (0)