
Commit f2d140d

Merge remote-tracking branch 'upstream/main' into pyarrow-nameerror
2 parents: b17510c + 106f33c

File tree: 11 files changed (+138, -20 lines)

ci/code_checks.sh

Lines changed: 0 additions & 1 deletion

@@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
         -i "pandas.Timestamp.min PR02" \
         -i "pandas.Timestamp.resolution PR02" \
         -i "pandas.Timestamp.tzinfo GL08" \
-        -i "pandas.api.types.is_re_compilable PR07,SA01" \
         -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \
         -i "pandas.arrays.IntegerArray SA01" \
         -i "pandas.arrays.IntervalArray.length SA01" \

doc/source/conf.py

Lines changed: 5 additions & 1 deletion

@@ -242,7 +242,6 @@
     "external_links": [],
     "footer_start": ["pandas_footer", "sphinx-version"],
     "github_url": "https://github.com/pandas-dev/pandas",
-    "twitter_url": "https://twitter.com/pandas_dev",
     "analytics": {
         "plausible_analytics_domain": "pandas.pydata.org",
         "plausible_analytics_url": "https://views.scientific-python.org/js/script.js",
@@ -258,6 +257,11 @@
     # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be)
     "show_version_warning_banner": False,
     "icon_links": [
+        {
+            "name": "X",
+            "url": "https://x.com/pandas_dev",
+            "icon": "fa-brands fa-square-x-twitter",
+        },
         {
             "name": "Mastodon",
             "url": "https://fosstodon.org/@pandas_dev",

doc/source/user_guide/reshaping.rst

Lines changed: 1 addition & 1 deletion

@@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` argument.
 .. image:: ../_static/reshaping_melt.png
 
 The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt`
-are useful to massage a :class:`DataFrame` into a format where one or more columns
+are useful to reshape a :class:`DataFrame` into a format where one or more columns
 are *identifier variables*, while all other columns, considered *measured
 variables*, are "unpivoted" to the row axis, leaving just two non-identifier
 columns, "variable" and "value". The names of those columns can be customized

pandas/_libs/index.pyi

Lines changed: 3 additions & 0 deletions

@@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ...
 class MaskedUInt8Engine(MaskedIndexEngine): ...
 class MaskedBoolEngine(MaskedUInt8Engine): ...
 
+class StringObjectEngine(ObjectEngine):
+    def __init__(self, values: object, na_value) -> None: ...
+
 class BaseMultiIndexCodesEngine:
     levels: list[np.ndarray]
     offsets: np.ndarray  # np.ndarray[..., ndim=1]

pandas/_libs/index.pyx

Lines changed: 25 additions & 0 deletions

@@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine):
             raise KeyError(val)
         return str(val)
 
+
+cdef class StringObjectEngine(ObjectEngine):
+
+    cdef:
+        object na_value
+        bint uses_na
+
+    def __init__(self, ndarray values, na_value):
+        super().__init__(values)
+        self.na_value = na_value
+        self.uses_na = na_value is C_NA
+
+    cdef bint _checknull(self, object val):
+        if self.uses_na:
+            return val is C_NA
+        else:
+            return util.is_nan(val)
+
+    cdef _check_type(self, object val):
+        if isinstance(val, str):
+            return val
+        elif self._checknull(val):
+            return self.na_value
+        else:
+            raise KeyError(val)
 
 
 cdef class DatetimeEngine(Int64Engine):
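
Note (illustration only, not part of the commit): a hedged sketch of the index behaviour the new StringObjectEngine is meant to support, based on the tests added in pandas/tests/indexes/string/test_indexing.py below. It assumes the pd.NA-backed "string" dtype.

import pandas as pd

idx = pd.Index(["a", "b", pd.NA], dtype="string")
print(idx.get_loc("b"))    # 1
print(idx.get_loc(pd.NA))  # 2 -- pd.NA is this dtype's na_value sentinel
try:
    # other null-likes (e.g. float("nan")) are not treated as this dtype's
    # missing value and raise KeyError, per the new tests
    idx.get_loc(float("nan"))
except KeyError:
    print("KeyError")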

pandas/core/dtypes/inference.py

Lines changed: 5 additions & 0 deletions

@@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool:
     Parameters
     ----------
     obj : The object to check
+        The object to check if the object can be compiled into a regex pattern instance.
 
     Returns
     -------
     bool
         Whether `obj` can be compiled as a regex pattern.
 
+    See Also
+    --------
+    api.types.is_re : Check if the object is a regex pattern instance.
+
     Examples
     --------
     >>> from pandas.api.types import is_re_compilable
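
Note (illustration only, not part of the commit): a quick demonstration of the documented function together with api.types.is_re, which the new See Also entry cross-references.

import re
from pandas.api.types import is_re, is_re_compilable

print(is_re_compilable(".*"))     # True: the string compiles into a regex pattern
print(is_re_compilable(1))        # False: cannot be compiled
print(is_re(re.compile(".*")))    # True: already a compiled pattern
print(is_re(".*"))                # False: a plain string, not a compiled pattern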

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion

@@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any:
         return result
 
     @final
-    def squeeze(self, axis: Axis | None = None):
+    def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
         """
         Squeeze 1 dimensional axis objects into scalars.
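
Note (illustration only, not part of the commit): a short sketch of the three return types the new annotation covers.

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(type(df.squeeze()).__name__)         # DataFrame: neither axis has length 1
print(type(df[["a"]].squeeze()).__name__)  # Series: the single column is squeezed away
print(df.loc[[0], ["a"]].squeeze())        # 1 -- scalar, both axes have length 1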

pandas/core/indexes/base.py

Lines changed: 1 addition & 2 deletions

@@ -876,7 +876,7 @@ def _engine(
             # ndarray[Any, Any]]" has no attribute "_ndarray"  [union-attr]
             target_values = self._data._ndarray  # type: ignore[union-attr]
         elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype):
-            return libindex.StringEngine(target_values)
+            return libindex.StringObjectEngine(target_values, self.dtype.na_value)  # type: ignore[union-attr]
 
         # error: Argument 1 to "ExtensionEngine" has incompatible type
         # "ndarray[Any, Any]"; expected "ExtensionArray"
@@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool:
     def get_indexer_non_unique(
         self, target
     ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
-        target = ensure_index(target)
         target = self._maybe_cast_listlike_indexer(target)
 
         if not self._should_compare(target) and not self._should_partial_index(target):
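
Note (illustration only, not part of the commit): a hedged sketch of what routing string dtypes through StringObjectEngine means for get_indexer, again following the new tests and assuming the pd.NA-backed "string" dtype.

import pandas as pd

idx = pd.Index(["a", "b", pd.NA], dtype="string")
# the dtype's own na_value sentinel is matched; unknown labels get -1
print(idx.get_indexer(["a", pd.NA, "c"]))  # [ 0  2 -1]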

pandas/core/reshape/melt.py

Lines changed: 2 additions & 2 deletions

@@ -51,9 +51,9 @@ def melt(
     """
     Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.
 
-    This function is useful to massage a DataFrame into a format where one
+    This function is useful to reshape a DataFrame into a format where one
     or more columns are identifier variables (`id_vars`), while all other
-    columns, considered measured variables (`value_vars`), are "unpivoted" to
+    columns are considered measured variables (`value_vars`), and are "unpivoted" to
     the row axis, leaving just two non-identifier columns, 'variable' and
     'value'.
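
Note (illustration only, not part of the commit): the reworded docstring in action, with a minimal wide-to-long unpivot.

import pandas as pd

wide = pd.DataFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})
# "id" is the identifier variable (id_vars); "x" and "y" are the measured
# variables (value_vars) that get unpivoted to the row axis.
print(wide.melt(id_vars="id", value_vars=["x", "y"]))
#    id variable  value
# 0   1        x     10
# 1   2        x     20
# 2   1        y     30
# 3   2        y     40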

pandas/tests/indexes/string/test_indexing.py

Lines changed: 93 additions & 11 deletions

@@ -6,6 +6,51 @@
 import pandas._testing as tm
 
 
+def _isnan(val):
+    try:
+        return val is not pd.NA and np.isnan(val)
+    except TypeError:
+        return False
+
+
+class TestGetLoc:
+    def test_get_loc(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        assert index.get_loc("b") == 1
+
+    def test_get_loc_raises(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError, match="d"):
+            index.get_loc("d")
+
+    def test_get_loc_invalid_value(self, any_string_dtype):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError, match="1"):
+            index.get_loc(1)
+
+    def test_get_loc_non_unique(self, any_string_dtype):
+        index = Index(["a", "b", "a"], dtype=any_string_dtype)
+        result = index.get_loc("a")
+        expected = np.array([True, False, True])
+        tm.assert_numpy_array_equal(result, expected)
+
+    def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
+        index = Index(["a", "b", "c"], dtype=any_string_dtype)
+        with pytest.raises(KeyError):
+            index.get_loc(nulls_fixture)
+
+    def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
+        index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
+        if any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
+        ):
+            with pytest.raises(KeyError):
+                index.get_loc(nulls_fixture)
+        else:
+            assert index.get_loc(nulls_fixture) == 2
+
+
 class TestGetIndexer:
     @pytest.mark.parametrize(
         "method,expected",
@@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype):
                 ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2]
             )
 
+    @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
+    def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
+        # NaT and Decimal("NaN") from null_fixture are not supported for string dtype
+        index = Index(["a", "b", null], dtype=any_string_dtype)
+        result = index.get_indexer(["a", null, "c"])
+        if using_infer_string:
+            expected = np.array([0, 2, -1], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            expected = np.array([0, -1, -1], dtype=np.intp)
+        else:
+            expected = np.array([0, 2, -1], dtype=np.intp)
 
-class TestGetIndexerNonUnique:
-    @pytest.mark.xfail(reason="TODO(infer_string)", strict=False)
-    def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture):
-        index = Index(["a", "b", None], dtype=any_string_dtype)
-        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
+        tm.assert_numpy_array_equal(result, expected)
 
-        expected_indexer = np.array([2], dtype=np.intp)
-        expected_missing = np.array([], dtype=np.intp)
+
+class TestGetIndexerNonUnique:
+    @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA])
+    def test_get_indexer_non_unique_nas(
+        self, any_string_dtype, null, using_infer_string
+    ):
+        index = Index(["a", "b", null], dtype=any_string_dtype)
+        indexer, missing = index.get_indexer_non_unique(["a", null])
+
+        if using_infer_string:
+            expected_indexer = np.array([0, 2], dtype=np.intp)
+            expected_missing = np.array([], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            expected_indexer = np.array([0, -1], dtype=np.intp)
+            expected_missing = np.array([1], dtype=np.intp)
+        else:
+            expected_indexer = np.array([0, 2], dtype=np.intp)
+            expected_missing = np.array([], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected_indexer)
         tm.assert_numpy_array_equal(missing, expected_missing)
 
         # actually non-unique
-        index = Index(["a", None, "b", None], dtype=any_string_dtype)
-        indexer, missing = index.get_indexer_non_unique([nulls_fixture])
-
-        expected_indexer = np.array([1, 3], dtype=np.intp)
+        index = Index(["a", null, "b", null], dtype=any_string_dtype)
+        indexer, missing = index.get_indexer_non_unique(["a", null])
+
+        if using_infer_string:
+            expected_indexer = np.array([0, 1, 3], dtype=np.intp)
+        elif any_string_dtype == "string" and (
+            (any_string_dtype.na_value is pd.NA and null is not pd.NA)
+            or (_isnan(any_string_dtype.na_value) and not _isnan(null))
+        ):
+            pass
+        else:
+            expected_indexer = np.array([0, 1, 3], dtype=np.intp)
         tm.assert_numpy_array_equal(indexer, expected_indexer)
         tm.assert_numpy_array_equal(missing, expected_missing)
