Merge branch 'main' into bugfix-spss-kwargs

astronights · web-flow · commit ee08fd58a80d · 2024-01-29T16:00:19.000+08:00
diff --git a/doc/source/whatsnew/v2.2.1.rst b/doc/source/whatsnew/v2.2.1.rst
@@ -16,6 +16,7 @@ Fixed regressions
 - Fixed memory leak in :func:`read_csv` (:issue:`57039`)
 - Fixed performance regression in :meth:`Series.combine_first` (:issue:`55845`)
 - Fixed regression in :func:`merge_ordered` raising ``TypeError`` for ``fill_method="ffill"`` and ``how="left"`` (:issue:`57010`)
+- Fixed regression in :func:`wide_to_long` raising an ``AttributeError`` for string columns (:issue:`57066`)
 - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` ignoring the ``skipna`` argument (:issue:`57040`)
 - Fixed regression in :meth:`DataFrameGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmax` where values containing the minimum or maximum value for the dtype could produce incorrect results (:issue:`57040`)
 - Fixed regression in :meth:`Series.pct_change` raising a ``ValueError`` for an empty :class:`Series` (:issue:`57056`)
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -142,7 +142,7 @@ Timezones
 
 Numeric
 ^^^^^^^
--
+- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
 -
 
 Conversion
@@ -152,7 +152,7 @@ Conversion
 
 Strings
 ^^^^^^^
--
+- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
 -
 
 Interval
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -542,7 +542,7 @@ def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
     def value_counts(self, dropna: bool = True) -> Series:
         from pandas.core.algorithms import value_counts_internal as value_counts
 
-        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
+        result = value_counts(self._ndarray, sort=False, dropna=dropna).astype("Int64")
         result.index = result.index.astype(self.dtype)
         return result
 
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -951,6 +951,9 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str_t, *inputs, **kwargs):
         elif method == "reduce":
             result = lib.item_from_zerodim(result)
             return result
+        elif is_scalar(result):
+            # e.g. matmul
+            return result
 
         if result.dtype == np.float16:
             result = result.astype(np.float32)
diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py
@@ -458,8 +458,7 @@ def wide_to_long(
 
     def get_var_names(df, stub: str, sep: str, suffix: str):
         regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
-        pattern = re.compile(regex)
-        return df.columns[df.columns.str.match(pattern)]
+        return df.columns[df.columns.str.match(regex)]
 
     def melt_stub(df, stub: str, i, j, value_vars, sep: str):
         newdf = melt(
diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -1332,14 +1332,14 @@ def contains(
         return self._wrap_result(result, fill_value=na, returns_string=False)
 
     @forbid_nonstring_types(["bytes"])
-    def match(self, pat, case: bool = True, flags: int = 0, na=None):
+    def match(self, pat: str, case: bool = True, flags: int = 0, na=None):
         """
         Determine if each string starts with a match of a regular expression.
 
         Parameters
         ----------
         pat : str
-            Character sequence or regular expression.
+            Character sequence.
         case : bool, default True
             If True, case sensitive.
         flags : int, default 0 (no flags)
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -584,6 +584,19 @@ def test_value_counts_with_normalize(dtype):
     tm.assert_series_equal(result, expected)
 
 
+def test_value_counts_sort_false(dtype):
+    if getattr(dtype, "storage", "") == "pyarrow":
+        exp_dtype = "int64[pyarrow]"
+    elif getattr(dtype, "storage", "") == "pyarrow_numpy":
+        exp_dtype = "int64"
+    else:
+        exp_dtype = "Int64"
+    ser = pd.Series(["a", "b", "c", "b"], dtype=dtype)
+    result = ser.value_counts(sort=False)
+    expected = pd.Series([1, 2, 1], index=ser[:3], dtype=exp_dtype, name="count")
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "values, expected",
     [
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
@@ -1217,3 +1217,33 @@ def test_missing_stubname(self, dtype):
         new_level = expected.index.levels[0].astype(dtype)
         expected.index = expected.index.set_levels(new_level, level=0)
         tm.assert_frame_equal(result, expected)
+
+
+def test_wide_to_long_pyarrow_string_columns():
+    # GH 57066
+    pytest.importorskip("pyarrow")
+    df = DataFrame(
+        {
+            "ID": {0: 1},
+            "R_test1": {0: 1},
+            "R_test2": {0: 1},
+            "R_test3": {0: 2},
+            "D": {0: 1},
+        }
+    )
+    df.columns = df.columns.astype("string[pyarrow_numpy]")
+    result = wide_to_long(
+        df, stubnames="R", i="ID", j="UNPIVOTED", sep="_", suffix=".*"
+    )
+    expected = DataFrame(
+        [[1, 1], [1, 1], [1, 2]],
+        columns=Index(["D", "R"], dtype=object),
+        index=pd.MultiIndex.from_arrays(
+            [
+                [1, 1, 1],
+                Index(["test1", "test2", "test3"], dtype="string[pyarrow_numpy]"),
+            ],
+            names=["ID", "UNPIVOTED"],
+        ),
+    )
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_ufunc.py b/pandas/tests/series/test_ufunc.py
@@ -427,6 +427,13 @@ def test_np_matmul():
     tm.assert_frame_equal(expected, result)
 
 
+@pytest.mark.parametrize("box", [pd.Index, pd.Series])
+def test_np_matmul_1D(box):
+    result = np.matmul(box([1, 2]), box([2, 3]))
+    assert result == 8
+    assert isinstance(result, np.int64)
+
+
 def test_array_ufuncs_for_many_arguments():
     # GH39853
     def add3(x, y, z):

Original file line number	Diff line number	Diff line change
`@@ -142,7 +142,7 @@ Timezones`
`142`	`142`
`143`	`143`	`Numeric`
`144`	`144`	`^^^^^^^`
`145`		`--`
	`145`	+- Bug in ``np.matmul`` with :class:`Index` inputs raising a ``TypeError`` (:issue:`57079`)
`146`	`146`	`-`
`147`	`147`
`148`	`148`	`Conversion`
`@@ -152,7 +152,7 @@ Conversion`
`152`	`152`
`153`	`153`	`Strings`
`154`	`154`	`^^^^^^^`
`155`		`--`
	`155`	+- Bug in :meth:`Series.value_counts` would not respect ``sort=False`` for series having ``string`` dtype (:issue:`55224`)
`156`	`156`	`-`
`157`	`157`
`158`	`158`	`Interval`