Merge remote-tracking branch 'upstream/main' into rls-133

jorisvandenbossche · jorisvandenbossche · commit cb96c2e3de85 · 2025-09-29T20:48:06.000+02:00
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -162,7 +162,7 @@ jobs:
         run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV"
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v3.1.4
+        uses: pypa/cibuildwheel@v3.2.0
         with:
          package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }}
         env:
diff --git a/doc/source/whatsnew/v2.3.3.rst b/doc/source/whatsnew/v2.3.3.rst
@@ -51,6 +51,8 @@ Bug fixes
 - Fix bug in :meth:`~DataFrame.groupby` with ``sum()`` and unobserved categories resulting in ``0`` instead of the empty string ``""`` (:issue:`61909`)
 - Fix :meth:`Series.str.isdigit` to correctly recognize unicode superscript
   characters as digits for :class:`StringDtype` backed by PyArrow (:issue:`61466`)
+- Fix comparing a :class:`StringDtype` Series with mixed objects raising an error (:issue:`60228`)
+- Fix error being raised when using a numpy ufunc with a Python-backed string array (:issue:`40800`)
 
 Other changes
 ~~~~~~~~~~~~~
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -883,22 +883,27 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray:
         ltype = self._pa_array.type
 
         if isinstance(other, (ExtensionArray, np.ndarray, list)):
-            boxed = self._box_pa(other)
-            rtype = boxed.type
-            if (pa.types.is_timestamp(ltype) and pa.types.is_date(rtype)) or (
-                pa.types.is_timestamp(rtype) and pa.types.is_date(ltype)
-            ):
-                # GH#62157 match non-pyarrow behavior
-                result = ops.invalid_comparison(self, other, op)
-                result = pa.array(result, type=pa.bool_())
+            try:
+                boxed = self._box_pa(other)
+            except pa.lib.ArrowInvalid:
+                # e.g. GH#60228: [1, "b"] cannot be boxed, so operate pointwise
+                res_values = [op(x, y) for x, y in zip(self, other)]
+                result = pa.array(res_values, type=pa.bool_(), from_pandas=True)
             else:
-                try:
-                    result = pc_func(self._pa_array, boxed)
-                except pa.ArrowNotImplementedError:
-                    # TODO: could this be wrong if other is object dtype?
-                    #  in which case we need to operate pointwise?
+                rtype = boxed.type
+                if (pa.types.is_timestamp(ltype) and pa.types.is_date(rtype)) or (
+                    pa.types.is_timestamp(rtype) and pa.types.is_date(ltype)
+                ):
+                    # GH#62157 match non-pyarrow behavior
                     result = ops.invalid_comparison(self, other, op)
                     result = pa.array(result, type=pa.bool_())
+                else:
+                    try:
+                        result = pc_func(self._pa_array, boxed)
+                    except pa.ArrowNotImplementedError:
+                        result = ops.invalid_comparison(self, other, op)
+                        result = pa.array(result, type=pa.bool_())
+
         elif is_scalar(other):
             if (isinstance(other, datetime) and pa.types.is_date(ltype)) or (
                 type(other) is date and pa.types.is_timestamp(ltype)
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -30,8 +30,6 @@
 from pandas.compat.numpy import function as nv
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import (
-    Appender,
-    Substitution,
     cache_readonly,
 )
 from pandas.util._validators import (
@@ -1669,9 +1667,48 @@ def factorize(
         Categories (3, str): ['a', 'b', 'c']
         """
 
-    @Substitution(klass="ExtensionArray")
-    @Appender(_extension_array_shared_docs["repeat"])
     def repeat(self, repeats: int | Sequence[int], axis: AxisInt | None = None) -> Self:
+        """
+        Repeat elements of an ExtensionArray.
+
+        Returns a new ExtensionArray where each element of the current ExtensionArray
+        is repeated consecutively a given number of times.
+
+        Parameters
+        ----------
+        repeats : int or array of ints
+            The number of repetitions for each element. This should be a
+            non-negative integer. Repeating 0 times will return an empty
+            ExtensionArray.
+        axis : None
+            Must be ``None``. Has no effect but is accepted for compatibility
+            with numpy.
+
+        Returns
+        -------
+        ExtensionArray
+            Newly created ExtensionArray with repeated elements.
+
+        See Also
+        --------
+        Series.repeat : Equivalent function for Series.
+        Index.repeat : Equivalent function for Index.
+        numpy.repeat : Similar method for :class:`numpy.ndarray`.
+        ExtensionArray.take : Take arbitrary positions.
+
+        Examples
+        --------
+        >>> cat = pd.Categorical(["a", "b", "c"])
+        >>> cat
+        ['a', 'b', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+        >>> cat.repeat(2)
+        ['a', 'a', 'b', 'b', 'c', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+        >>> cat.repeat([1, 2, 3])
+        ['a', 'b', 'b', 'c', 'c', 'c']
+        Categories (3, str): ['a', 'b', 'c']
+        """
         nv.validate_repeat((), {"axis": axis})
         ind = np.arange(len(self)).repeat(repeats)
         return self.take(ind)
diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py
@@ -5,6 +5,7 @@
     Any,
     Literal,
     Self,
+    cast,
 )
 
 import numpy as np
@@ -48,6 +49,7 @@
     )
 
     from pandas import Index
+    from pandas.arrays import StringArray
 
 
 class NumpyExtensionArray(
@@ -234,6 +236,16 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
             # e.g. test_np_max_nested_tuples
             return result
         else:
+            if self.dtype.type is str:  # type: ignore[comparison-overlap]
+                # StringDtype
+                self = cast("StringArray", self)
+                try:
+                    # specify dtype to preserve storage/na_value
+                    return type(self)(result, dtype=self.dtype)
+                except ValueError:
+                    # if validation of the result fails (e.g. non-strings)
+                    # -> fall back to returning the raw numpy array
+                    return result
             # one return value; re-box array-like results
             return type(self)(result)
 
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
@@ -962,7 +962,10 @@ def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs):
         if args and isinstance(data, ABCSeries):
             positional_args = str(args)[1:-1]
             keyword_args = ", ".join(
-                [f"{name}={value!r}" for (name, _), value in zip(arg_def, args)]
+                [
+                    f"{name}={value!r}"
+                    for (name, _), value in zip(arg_def, args, strict=False)
+                ]
             )
             msg = (
                 "`Series.plot()` should not be called with positional "
@@ -973,7 +976,9 @@ def _get_call_args(backend_name: str, data: Series | DataFrame, args, kwargs):
             )
             raise TypeError(msg)
 
-        pos_args = {name: value for (name, _), value in zip(arg_def, args)}
+        pos_args = {
+            name: value for (name, _), value in zip(arg_def, args, strict=False)
+        }
         if backend_name == "pandas.plotting._matplotlib":
             kwargs = dict(arg_def, **pos_args, **kwargs)
         else:
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
@@ -321,9 +321,9 @@ def _grouped_plot_by_column(
 
     ax_values = []
 
-    for ax, col in zip(flatten_axes(axes), columns):
+    for ax, col in zip(flatten_axes(axes), columns, strict=False):
         gp_col = grouped[col]
-        keys, values = zip(*gp_col)
+        keys, values = zip(*gp_col, strict=True)
         re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs)
         ax.set_title(col)
         ax_values.append(re_plotf)
@@ -380,7 +380,7 @@ def _get_colors():
                 # taken from the colors dict parameter
                 # "boxes" value placed in position 0, "whiskers" in 1, etc.
                 valid_keys = ["boxes", "whiskers", "medians", "caps"]
-                key_to_index = dict(zip(valid_keys, range(4)))
+                key_to_index = dict(zip(valid_keys, range(4), strict=True))
                 for key, value in colors.items():
                     if key in valid_keys:
                         result[key_to_index[key]] = value
@@ -530,7 +530,7 @@ def boxplot_frame_groupby(
             layout=layout,
         )
         data = {}
-        for (key, group), ax in zip(grouped, flatten_axes(axes)):
+        for (key, group), ax in zip(grouped, flatten_axes(axes), strict=False):
             d = group.boxplot(
                 ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds
             )
@@ -539,7 +539,7 @@ def boxplot_frame_groupby(
         ret = pd.Series(data)
         maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
     else:
-        keys, frames = zip(*grouped)
+        keys, frames = zip(*grouped, strict=True)
         df = pd.concat(frames, keys=keys, axis=1)
 
         # GH 16748, DataFrameGroupby fails when subplots=False and `column` argument
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
@@ -817,7 +817,7 @@ def _adorn_subplots(self, fig: Figure) -> None:
                             f"number of columns = {self.nseries}"
                         )
 
-                    for ax, title in zip(self.axes, self.title):
+                    for ax, title in zip(self.axes, self.title, strict=False):
                         ax.set_title(title)
                 else:
                     fig.suptitle(self.title)
@@ -1216,7 +1216,7 @@ def _get_errorbars(
     ) -> dict[str, Any]:
         errors = {}
 
-        for kw, flag in zip(["xerr", "yerr"], [xerr, yerr]):
+        for kw, flag in zip(["xerr", "yerr"], [xerr, yerr], strict=True):
             if flag:
                 err = self.errors[kw]
                 # user provided label-matched dataframe of errors
@@ -1457,7 +1457,7 @@ def _get_color_mapping(self, c_values: Series) -> dict[str, np.ndarray]:
         cmap = mpl.colormaps.get_cmap(self.colormap)
         colors = cmap(np.linspace(0, 1, n_colors))  # RGB tuples
 
-        return dict(zip(unique, colors))
+        return dict(zip(unique, colors, strict=True))
 
     def _get_norm_and_cmap(self, c_values, color_by_categorical: bool):
         c = self.c
@@ -2178,7 +2178,10 @@ def blank_labeler(label, value):
             # Blank out labels for values of 0 so they don't overlap
             # with nonzero wedges
             if labels is not None:
-                blabels = [blank_labeler(left, value) for left, value in zip(labels, y)]
+                blabels = [
+                    blank_labeler(left, value)
+                    for left, value in zip(labels, y, strict=True)
+                ]
             else:
                 blabels = None
             results = ax.pie(y, labels=blabels, **kwds)
@@ -2197,7 +2200,7 @@ def blank_labeler(label, value):
 
             # leglabels is used for legend labels
             leglabels = labels if labels is not None else idx
-            for _patch, _leglabel in zip(patches, leglabels):
+            for _patch, _leglabel in zip(patches, leglabels, strict=True):
                 self._append_legend_handles_labels(_patch, _leglabel)
 
     def _post_plot_logic(self, ax: Axes, data) -> None:
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
@@ -323,7 +323,7 @@ def _grouped_plot(
         naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout
     )
 
-    for ax, (key, group) in zip(flatten_axes(axes), grouped):
+    for ax, (key, group) in zip(flatten_axes(axes), grouped, strict=False):
         if numeric_only and isinstance(group, ABCDataFrame):
             group = group._get_numeric_data()
         plotf(group, ax, **kwargs)
@@ -557,7 +557,7 @@ def hist_frame(
     )
     can_set_label = "label" not in kwds
 
-    for ax, col in zip(flatten_axes(axes), data.columns):
+    for ax, col in zip(flatten_axes(axes), data.columns, strict=False):
         if legend and can_set_label:
             kwds["label"] = col
         ax.hist(data[col].dropna().values, bins=bins, **kwds)
diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py
@@ -191,7 +191,7 @@ def normalize(series):
 
     ax.add_patch(mpl.patches.Circle((0.0, 0.0), radius=1.0, facecolor="none"))
 
-    for xy, name in zip(s, df.columns):
+    for xy, name in zip(s, df.columns, strict=True):
         ax.add_patch(mpl.patches.Circle(xy, radius=0.025, facecolor="gray"))
 
         if xy[0] < 0.0 and xy[1] < 0.0:
@@ -266,7 +266,7 @@ def f(t):
     color_values = get_standard_colors(
         num_colors=len(classes), colormap=colormap, color_type="random", color=color
     )
-    colors = dict(zip(classes, color_values))
+    colors = dict(zip(classes, color_values, strict=False))
     if ax is None:
         ax = plt.gca()
         ax.set_xlim(-np.pi, np.pi)
@@ -399,7 +399,7 @@ def parallel_coordinates(
     if sort_labels:
         classes = sorted(classes)
         color_values = sorted(color_values)
-    colors = dict(zip(classes, color_values))
+    colors = dict(zip(classes, color_values, strict=True))
 
     for i in range(n):
         y = df.iloc[i].values
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -840,3 +840,30 @@ def test_string_array_view_type_error():
     arr = pd.array(["a", "b", "c"], dtype="string")
     with pytest.raises(TypeError, match="Cannot change data-type for string array."):
         arr.view("i8")
+
+
+@pytest.mark.parametrize("box", [pd.Series, pd.array])
+def test_numpy_array_ufunc(dtype, box):
+    arr = box(["a", "bb", "ccc"], dtype=dtype)
+
+    # custom ufunc that works with string (object) input -> returning numeric
+    str_len_ufunc = np.frompyfunc(lambda x: len(x), 1, 1)
+    result = str_len_ufunc(arr)
+    expected_cls = pd.Series if box is pd.Series else np.array
+    # TODO: should we infer int64 dtype here?
+    expected = expected_cls([1, 2, 3], dtype=object)
+    tm.assert_equal(result, expected)
+
+    # custom ufunc returning strings
+    str_multiply_ufunc = np.frompyfunc(lambda x: x * 2, 1, 1)
+    result = str_multiply_ufunc(arr)
+    expected = box(["aa", "bbbb", "cccccc"], dtype=dtype)
+    if dtype.storage == "pyarrow":
+        # TODO ArrowStringArray should also preserve the class / dtype
+        if box is pd.array:
+            expected = np.array(["aa", "bbbb", "cccccc"], dtype=object)
+        else:
+            # not specifying the dtype because the exact dtype is not yet preserved
+            expected = pd.Series(["aa", "bbbb", "cccccc"])
+
+    tm.assert_equal(result, expected)
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -288,3 +288,19 @@ def test_searchsorted_with_na_raises(data_for_sorting, as_series):
     )
     with pytest.raises(ValueError, match=msg):
         arr.searchsorted(b)
+
+
+def test_mixed_object_comparison(dtype):
+    # GH#60228
+    ser = pd.Series(["a", "b"], dtype=dtype)
+
+    mixed = pd.Series([1, "b"], dtype=object)
+
+    result = ser == mixed
+    expected = pd.Series([False, True], dtype=bool)
+    if dtype.storage == "python" and dtype.na_value is pd.NA:
+        expected = expected.astype("boolean")
+    elif dtype.storage == "pyarrow" and dtype.na_value is pd.NA:
+        expected = expected.astype("bool[pyarrow]")
+
+    tm.assert_series_equal(result, expected)
diff --git a/pyproject.toml b/pyproject.toml
@@ -494,11 +494,6 @@ exclude = [
 "pandas/io/sql.py" = ["B905"]
 "pandas/io/stata.py" = ["B905"]
 "pandas/io/xml.py" = ["B905"]
-"pandas/plotting/_core.py" = ["B905"]
-"pandas/plotting/_matplotlib/boxplot.py" = ["B905"]
-"pandas/plotting/_matplotlib/core.py" = ["B905"]
-"pandas/plotting/_matplotlib/hist.py" = ["B905"]
-"pandas/plotting/_matplotlib/misc.py" = ["B905"]
 "pandas/_testing/asserters.py" = ["B905"]
 "pandas/_testing/_warnings.py" = ["B905"]
 "pandas/tests/apply/test_series_apply.py" = ["B905"]