Merge remote-tracking branch 'upstream/main' into string-dtype-alias

jorisvandenbossche · jorisvandenbossche · commit 53c8f75d5898 · 2024-08-08T10:28:27.000+02:00
diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst
@@ -74,7 +74,6 @@ If you are new to Git, you can reference some of these resources for learning Gi
 to the :ref:`contributor community <community>` for help if needed:
 
 * `Git documentation <https://git-scm.com/doc>`_.
-* `Numpy's Git resources <https://numpy.org/doc/stable/dev/gitwash/git_resources.html>`_ tutorial.
 
 Also, the project follows a forking workflow further described on this page whereby
 contributors fork the repository, make changes and then create a pull request.
diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -1176,7 +1176,10 @@ def assert_frame_equal(
         Specify how to compare internal data. If False, compare by columns.
         If True, compare by blocks.
     check_exact : bool, default False
-        Whether to compare number exactly.
+        Whether to compare number exactly. If False, the comparison uses the
+        relative tolerance (``rtol``) and absolute tolerance (``atol``)
+        parameters to determine if two values are considered close,
+        according to the formula: ``|a - b| <= (atol + rtol * |b|)``.
 
         .. versionchanged:: 2.2.0
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -331,6 +331,8 @@ class BaseStringArray(ExtensionArray):
     Mixin class for StringArray, ArrowStringArray.
     """
 
+    dtype: StringDtype
+
     @doc(ExtensionArray.tolist)
     def tolist(self) -> list:
         if self.ndim > 1:
@@ -344,6 +346,37 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
             raise ValueError
         return cls._from_sequence(scalars, dtype=dtype)
 
+    def _str_map_str_or_object(
+        self,
+        dtype,
+        na_value,
+        arr: np.ndarray,
+        f,
+        mask: npt.NDArray[np.bool_],
+        convert: bool,
+    ):
+        # _str_map helper for case where dtype is either string dtype or object
+        if is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
+            )
+            if self.dtype.storage == "pyarrow":
+                import pyarrow as pa
+
+                result = pa.array(
+                    result, mask=mask, type=pa.large_string(), from_pandas=True
+                )
+            # error: Too many arguments for "BaseStringArray"
+            return type(self)(result)  # type: ignore[call-arg]
+
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            #    or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+            return lib.map_infer_mask(arr, f, mask.view("uint8"))
+
 
 # error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
 # incompatible with definition in base class "ExtensionArray"
@@ -709,9 +742,53 @@ def _cmp_method(self, other, op):
     # base class "NumpyExtensionArray" defined the type as "float")
     _str_na_value = libmissing.NA  # type: ignore[assignment]
 
+    def _str_map_nan_semantics(
+        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+    ):
+        if dtype is None:
+            dtype = self.dtype
+        if na_value is None:
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+        convert = convert and not np.all(mask)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            na_value_is_na = isna(na_value)
+            if na_value_is_na:
+                if is_integer_dtype(dtype):
+                    na_value = 0
+                else:
+                    na_value = True
+
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                dtype=np.dtype(cast(type, dtype)),
+            )
+            if na_value_is_na and mask.any():
+                if is_integer_dtype(dtype):
+                    result = result.astype("float64")
+                else:
+                    result = result.astype("object")
+                result[mask] = np.nan
+            return result
+
+        else:
+            return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
+
     def _str_map(
         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
     ):
+        if self.dtype.na_value is np.nan:
+            return self._str_map_nan_semantics(
+                f, na_value=na_value, dtype=dtype, convert=convert
+            )
+
         from pandas.arrays import BooleanArray
 
         if dtype is None:
@@ -751,18 +828,8 @@ def _str_map(
 
             return constructor(result, mask)
 
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # i.e. StringDtype
-            result = lib.map_infer_mask(
-                arr, f, mask.view("uint8"), convert=False, na_value=na_value
-            )
-            return StringArray(result)
         else:
-            # This is when the result type is object. We reach this when
-            # -> We know the result type is truly object (e.g. .encode returns bytes
-            #    or .findall returns a list).
-            # -> We don't know the result type. E.g. `.get` can return anything.
-            return lib.map_infer_mask(arr, f, mask.view("uint8"))
+            return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
 
 
 class StringArrayNumpySemantics(StringArray):
@@ -829,52 +896,3 @@ def value_counts(self, dropna: bool = True) -> Series:
     # ------------------------------------------------------------------------
     # String methods interface
     _str_na_value = np.nan
-
-    def _str_map(
-        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
-    ):
-        if dtype is None:
-            dtype = self.dtype
-        if na_value is None:
-            na_value = self.dtype.na_value
-
-        mask = isna(self)
-        arr = np.asarray(self)
-        convert = convert and not np.all(mask)
-
-        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
-            na_value_is_na = isna(na_value)
-            if na_value_is_na:
-                if is_integer_dtype(dtype):
-                    na_value = 0
-                else:
-                    na_value = True
-
-            result = lib.map_infer_mask(
-                arr,
-                f,
-                mask.view("uint8"),
-                convert=False,
-                na_value=na_value,
-                dtype=np.dtype(cast(type, dtype)),
-            )
-            if na_value_is_na and mask.any():
-                if is_integer_dtype(dtype):
-                    result = result.astype("float64")
-                else:
-                    result = result.astype("object")
-                result[mask] = np.nan
-            return result
-
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # i.e. StringDtype
-            result = lib.map_infer_mask(
-                arr, f, mask.view("uint8"), convert=False, na_value=na_value
-            )
-            return type(self)(result)
-        else:
-            # This is when the result type is object. We reach this when
-            # -> We know the result type is truly object (e.g. .encode returns bytes
-            #    or .findall returns a list).
-            # -> We don't know the result type. E.g. `.get` can return anything.
-            return lib.map_infer_mask(arr, f, mask.view("uint8"))
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -25,9 +25,7 @@
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_integer_dtype,
-    is_object_dtype,
     is_scalar,
-    is_string_dtype,
     pandas_dtype,
 )
 from pandas.core.dtypes.missing import isna
@@ -281,9 +279,53 @@ def astype(self, dtype, copy: bool = True):
     # base class "ObjectStringArrayMixin" defined the type as "float")
     _str_na_value = libmissing.NA  # type: ignore[assignment]
 
+    def _str_map_nan_semantics(
+        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+    ):
+        if dtype is None:
+            dtype = self.dtype
+        if na_value is None:
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            if is_integer_dtype(dtype):
+                na_value = np.nan
+            else:
+                na_value = False
+
+            dtype = np.dtype(cast(type, dtype))
+            if mask.any():
+                # numpy int/bool dtypes cannot hold NaNs so we must convert to
+                # float64 for int (to match maybe_convert_objects) or
+                # object for bool (again to match maybe_convert_objects)
+                if is_integer_dtype(dtype):
+                    dtype = np.dtype("float64")
+                else:
+                    dtype = np.dtype(object)
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                dtype=dtype,
+            )
+            return result
+
+        else:
+            return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
+
     def _str_map(
         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
     ):
+        if self.dtype.na_value is np.nan:
+            return self._str_map_nan_semantics(
+                f, na_value=na_value, dtype=dtype, convert=convert
+            )
+
         # TODO: de-duplicate with StringArray method. This method is moreless copy and
         # paste.
 
@@ -327,21 +369,8 @@ def _str_map(
 
             return constructor(result, mask)
 
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # i.e. StringDtype
-            result = lib.map_infer_mask(
-                arr, f, mask.view("uint8"), convert=False, na_value=na_value
-            )
-            result = pa.array(
-                result, mask=mask, type=pa.large_string(), from_pandas=True
-            )
-            return type(self)(result)
         else:
-            # This is when the result type is object. We reach this when
-            # -> We know the result type is truly object (e.g. .encode returns bytes
-            #    or .findall returns a list).
-            # -> We don't know the result type. E.g. `.get` can return anything.
-            return lib.map_infer_mask(arr, f, mask.view("uint8"))
+            return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
 
     def _str_contains(
         self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -614,58 +643,6 @@ def __getattribute__(self, item):
             return partial(getattr(ArrowStringArrayMixin, item), self)
         return super().__getattribute__(item)
 
-    def _str_map(
-        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
-    ):
-        if dtype is None:
-            dtype = self.dtype
-        if na_value is None:
-            na_value = self.dtype.na_value
-
-        mask = isna(self)
-        arr = np.asarray(self)
-
-        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
-            if is_integer_dtype(dtype):
-                na_value = np.nan
-            else:
-                na_value = False
-
-            dtype = np.dtype(cast(type, dtype))
-            if mask.any():
-                # numpy int/bool dtypes cannot hold NaNs so we must convert to
-                # float64 for int (to match maybe_convert_objects) or
-                # object for bool (again to match maybe_convert_objects)
-                if is_integer_dtype(dtype):
-                    dtype = np.dtype("float64")
-                else:
-                    dtype = np.dtype(object)
-            result = lib.map_infer_mask(
-                arr,
-                f,
-                mask.view("uint8"),
-                convert=False,
-                na_value=na_value,
-                dtype=dtype,
-            )
-            return result
-
-        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
-            # i.e. StringDtype
-            result = lib.map_infer_mask(
-                arr, f, mask.view("uint8"), convert=False, na_value=na_value
-            )
-            result = pa.array(
-                result, mask=mask, type=pa.large_string(), from_pandas=True
-            )
-            return type(self)(result)
-        else:
-            # This is when the result type is object. We reach this when
-            # -> We know the result type is truly object (e.g. .encode returns bytes
-            #    or .findall returns a list).
-            # -> We don't know the result type. E.g. `.get` can return anything.
-            return lib.map_infer_mask(arr, f, mask.view("uint8"))
-
     def _convert_int_dtype(self, result):
         if isinstance(result, pa.Array):
             result = result.to_numpy(zero_copy_only=False)
diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py
@@ -467,6 +467,10 @@ def __mul__(self, other) -> Self:
         if is_scalar(other):
             # numpy will accept float and int, raise TypeError for others
             result = self._ndarray * other
+            if result.dtype.kind != "m":
+                # numpy >= 2.1 may not raise a TypeError
+                # and seems to dispatch to others.__rmul__?
+                raise TypeError(f"Cannot multiply with {type(other).__name__}")
             freq = None
             if self.freq is not None and not isna(other):
                 freq = self.freq * other
@@ -494,6 +498,10 @@ def __mul__(self, other) -> Self:
 
         # numpy will accept float or int dtype, raise TypeError for others
         result = self._ndarray * other
+        if result.dtype.kind != "m":
+            # numpy >= 2.1 may not raise a TypeError
+            # and seems to dispatch to others.__rmul__?
+            raise TypeError(f"Cannot multiply with {type(other).__name__}")
         return type(self)._simple_new(result, dtype=result.dtype)
 
     __rmul__ = __mul__
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
@@ -1460,7 +1460,13 @@ def test_td64arr_mul_int(self, box_with_array):
     def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array):
         rng = timedelta_range("1 days", "10 days", name="foo")
         rng = tm.box_expected(rng, box_with_array)
-        msg = "argument must be an integer|cannot use operands with types dtype"
+        msg = "|".join(
+            [
+                "argument must be an integer",
+                "cannot use operands with types dtype",
+                "Cannot multiply with",
+            ]
+        )
         with pytest.raises(TypeError, match=msg):
             rng * two_hours