Skip to content

Commit 53c8f75

Browse files
Merge remote-tracking branch 'upstream/main' into string-dtype-alias
2 parents 39662d2 + 7147203 commit 53c8f75

File tree

6 files changed

+142
-131
lines changed

6 files changed

+142
-131
lines changed

doc/source/development/contributing.rst

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@ If you are new to Git, you can reference some of these resources for learning Gi
7474
to the :ref:`contributor community <community>` for help if needed:
7575

7676
* `Git documentation <https://git-scm.com/doc>`_.
77-
* `Numpy's Git resources <https://numpy.org/doc/stable/dev/gitwash/git_resources.html>`_ tutorial.
7877

7978
Also, the project follows a forking workflow further described on this page whereby
8079
contributors fork the repository, make changes and then create a pull request.

pandas/_testing/asserters.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1176,7 +1176,10 @@ def assert_frame_equal(
11761176
Specify how to compare internal data. If False, compare by columns.
11771177
If True, compare by blocks.
11781178
check_exact : bool, default False
1179-
Whether to compare number exactly.
1179+
Whether to compare numbers exactly. If False, the comparison uses the
1180+
relative tolerance (``rtol``) and absolute tolerance (``atol``)
1181+
parameters to determine if two values are considered close,
1182+
according to the formula: ``|a - b| <= (atol + rtol * |b|)``.
11801183
11811184
.. versionchanged:: 2.2.0
11821185

pandas/core/arrays/string_.py

Lines changed: 78 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,8 @@ class BaseStringArray(ExtensionArray):
331331
Mixin class for StringArray, ArrowStringArray.
332332
"""
333333

334+
dtype: StringDtype
335+
334336
@doc(ExtensionArray.tolist)
335337
def tolist(self) -> list:
336338
if self.ndim > 1:
@@ -344,6 +346,37 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
344346
raise ValueError
345347
return cls._from_sequence(scalars, dtype=dtype)
346348

349+
def _str_map_str_or_object(
350+
self,
351+
dtype,
352+
na_value,
353+
arr: np.ndarray,
354+
f,
355+
mask: npt.NDArray[np.bool_],
356+
convert: bool,
357+
):
358+
# _str_map helper for case where dtype is either string dtype or object
359+
if is_string_dtype(dtype) and not is_object_dtype(dtype):
360+
# i.e. StringDtype
361+
result = lib.map_infer_mask(
362+
arr, f, mask.view("uint8"), convert=False, na_value=na_value
363+
)
364+
if self.dtype.storage == "pyarrow":
365+
import pyarrow as pa
366+
367+
result = pa.array(
368+
result, mask=mask, type=pa.large_string(), from_pandas=True
369+
)
370+
# error: Too many arguments for "BaseStringArray"
371+
return type(self)(result) # type: ignore[call-arg]
372+
373+
else:
374+
# This is when the result type is object. We reach this when
375+
# -> We know the result type is truly object (e.g. .encode returns bytes
376+
# or .findall returns a list).
377+
# -> We don't know the result type. E.g. `.get` can return anything.
378+
return lib.map_infer_mask(arr, f, mask.view("uint8"))
379+
347380

348381
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
349382
# incompatible with definition in base class "ExtensionArray"
@@ -709,9 +742,53 @@ def _cmp_method(self, other, op):
709742
# base class "NumpyExtensionArray" defined the type as "float")
710743
_str_na_value = libmissing.NA # type: ignore[assignment]
711744

745+
def _str_map_nan_semantics(
746+
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
747+
):
748+
if dtype is None:
749+
dtype = self.dtype
750+
if na_value is None:
751+
na_value = self.dtype.na_value
752+
753+
mask = isna(self)
754+
arr = np.asarray(self)
755+
convert = convert and not np.all(mask)
756+
757+
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
758+
na_value_is_na = isna(na_value)
759+
if na_value_is_na:
760+
if is_integer_dtype(dtype):
761+
na_value = 0
762+
else:
763+
na_value = True
764+
765+
result = lib.map_infer_mask(
766+
arr,
767+
f,
768+
mask.view("uint8"),
769+
convert=False,
770+
na_value=na_value,
771+
dtype=np.dtype(cast(type, dtype)),
772+
)
773+
if na_value_is_na and mask.any():
774+
if is_integer_dtype(dtype):
775+
result = result.astype("float64")
776+
else:
777+
result = result.astype("object")
778+
result[mask] = np.nan
779+
return result
780+
781+
else:
782+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
783+
712784
def _str_map(
713785
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
714786
):
787+
if self.dtype.na_value is np.nan:
788+
return self._str_map_nan_semantics(
789+
f, na_value=na_value, dtype=dtype, convert=convert
790+
)
791+
715792
from pandas.arrays import BooleanArray
716793

717794
if dtype is None:
@@ -751,18 +828,8 @@ def _str_map(
751828

752829
return constructor(result, mask)
753830

754-
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
755-
# i.e. StringDtype
756-
result = lib.map_infer_mask(
757-
arr, f, mask.view("uint8"), convert=False, na_value=na_value
758-
)
759-
return StringArray(result)
760831
else:
761-
# This is when the result type is object. We reach this when
762-
# -> We know the result type is truly object (e.g. .encode returns bytes
763-
# or .findall returns a list).
764-
# -> We don't know the result type. E.g. `.get` can return anything.
765-
return lib.map_infer_mask(arr, f, mask.view("uint8"))
832+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
766833

767834

768835
class StringArrayNumpySemantics(StringArray):
@@ -829,52 +896,3 @@ def value_counts(self, dropna: bool = True) -> Series:
829896
# ------------------------------------------------------------------------
830897
# String methods interface
831898
_str_na_value = np.nan
832-
833-
def _str_map(
834-
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
835-
):
836-
if dtype is None:
837-
dtype = self.dtype
838-
if na_value is None:
839-
na_value = self.dtype.na_value
840-
841-
mask = isna(self)
842-
arr = np.asarray(self)
843-
convert = convert and not np.all(mask)
844-
845-
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
846-
na_value_is_na = isna(na_value)
847-
if na_value_is_na:
848-
if is_integer_dtype(dtype):
849-
na_value = 0
850-
else:
851-
na_value = True
852-
853-
result = lib.map_infer_mask(
854-
arr,
855-
f,
856-
mask.view("uint8"),
857-
convert=False,
858-
na_value=na_value,
859-
dtype=np.dtype(cast(type, dtype)),
860-
)
861-
if na_value_is_na and mask.any():
862-
if is_integer_dtype(dtype):
863-
result = result.astype("float64")
864-
else:
865-
result = result.astype("object")
866-
result[mask] = np.nan
867-
return result
868-
869-
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
870-
# i.e. StringDtype
871-
result = lib.map_infer_mask(
872-
arr, f, mask.view("uint8"), convert=False, na_value=na_value
873-
)
874-
return type(self)(result)
875-
else:
876-
# This is when the result type is object. We reach this when
877-
# -> We know the result type is truly object (e.g. .encode returns bytes
878-
# or .findall returns a list).
879-
# -> We don't know the result type. E.g. `.get` can return anything.
880-
return lib.map_infer_mask(arr, f, mask.view("uint8"))

pandas/core/arrays/string_arrow.py

Lines changed: 45 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,7 @@
2525
from pandas.core.dtypes.common import (
2626
is_bool_dtype,
2727
is_integer_dtype,
28-
is_object_dtype,
2928
is_scalar,
30-
is_string_dtype,
3129
pandas_dtype,
3230
)
3331
from pandas.core.dtypes.missing import isna
@@ -281,9 +279,53 @@ def astype(self, dtype, copy: bool = True):
281279
# base class "ObjectStringArrayMixin" defined the type as "float")
282280
_str_na_value = libmissing.NA # type: ignore[assignment]
283281

282+
def _str_map_nan_semantics(
283+
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
284+
):
285+
if dtype is None:
286+
dtype = self.dtype
287+
if na_value is None:
288+
na_value = self.dtype.na_value
289+
290+
mask = isna(self)
291+
arr = np.asarray(self)
292+
293+
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
294+
if is_integer_dtype(dtype):
295+
na_value = np.nan
296+
else:
297+
na_value = False
298+
299+
dtype = np.dtype(cast(type, dtype))
300+
if mask.any():
301+
# numpy int/bool dtypes cannot hold NaNs so we must convert to
302+
# float64 for int (to match maybe_convert_objects) or
303+
# object for bool (again to match maybe_convert_objects)
304+
if is_integer_dtype(dtype):
305+
dtype = np.dtype("float64")
306+
else:
307+
dtype = np.dtype(object)
308+
result = lib.map_infer_mask(
309+
arr,
310+
f,
311+
mask.view("uint8"),
312+
convert=False,
313+
na_value=na_value,
314+
dtype=dtype,
315+
)
316+
return result
317+
318+
else:
319+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
320+
284321
def _str_map(
285322
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
286323
):
324+
if self.dtype.na_value is np.nan:
325+
return self._str_map_nan_semantics(
326+
f, na_value=na_value, dtype=dtype, convert=convert
327+
)
328+
287329
# TODO: de-duplicate with StringArray method. This method is more or less copy and
288330
# paste.
289331

@@ -327,21 +369,8 @@ def _str_map(
327369

328370
return constructor(result, mask)
329371

330-
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
331-
# i.e. StringDtype
332-
result = lib.map_infer_mask(
333-
arr, f, mask.view("uint8"), convert=False, na_value=na_value
334-
)
335-
result = pa.array(
336-
result, mask=mask, type=pa.large_string(), from_pandas=True
337-
)
338-
return type(self)(result)
339372
else:
340-
# This is when the result type is object. We reach this when
341-
# -> We know the result type is truly object (e.g. .encode returns bytes
342-
# or .findall returns a list).
343-
# -> We don't know the result type. E.g. `.get` can return anything.
344-
return lib.map_infer_mask(arr, f, mask.view("uint8"))
373+
return self._str_map_str_or_object(dtype, na_value, arr, f, mask, convert)
345374

346375
def _str_contains(
347376
self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
@@ -614,58 +643,6 @@ def __getattribute__(self, item):
614643
return partial(getattr(ArrowStringArrayMixin, item), self)
615644
return super().__getattribute__(item)
616645

617-
def _str_map(
618-
self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
619-
):
620-
if dtype is None:
621-
dtype = self.dtype
622-
if na_value is None:
623-
na_value = self.dtype.na_value
624-
625-
mask = isna(self)
626-
arr = np.asarray(self)
627-
628-
if is_integer_dtype(dtype) or is_bool_dtype(dtype):
629-
if is_integer_dtype(dtype):
630-
na_value = np.nan
631-
else:
632-
na_value = False
633-
634-
dtype = np.dtype(cast(type, dtype))
635-
if mask.any():
636-
# numpy int/bool dtypes cannot hold NaNs so we must convert to
637-
# float64 for int (to match maybe_convert_objects) or
638-
# object for bool (again to match maybe_convert_objects)
639-
if is_integer_dtype(dtype):
640-
dtype = np.dtype("float64")
641-
else:
642-
dtype = np.dtype(object)
643-
result = lib.map_infer_mask(
644-
arr,
645-
f,
646-
mask.view("uint8"),
647-
convert=False,
648-
na_value=na_value,
649-
dtype=dtype,
650-
)
651-
return result
652-
653-
elif is_string_dtype(dtype) and not is_object_dtype(dtype):
654-
# i.e. StringDtype
655-
result = lib.map_infer_mask(
656-
arr, f, mask.view("uint8"), convert=False, na_value=na_value
657-
)
658-
result = pa.array(
659-
result, mask=mask, type=pa.large_string(), from_pandas=True
660-
)
661-
return type(self)(result)
662-
else:
663-
# This is when the result type is object. We reach this when
664-
# -> We know the result type is truly object (e.g. .encode returns bytes
665-
# or .findall returns a list).
666-
# -> We don't know the result type. E.g. `.get` can return anything.
667-
return lib.map_infer_mask(arr, f, mask.view("uint8"))
668-
669646
def _convert_int_dtype(self, result):
670647
if isinstance(result, pa.Array):
671648
result = result.to_numpy(zero_copy_only=False)

pandas/core/arrays/timedeltas.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,10 @@ def __mul__(self, other) -> Self:
467467
if is_scalar(other):
468468
# numpy will accept float and int, raise TypeError for others
469469
result = self._ndarray * other
470+
if result.dtype.kind != "m":
471+
# numpy >= 2.1 may not raise a TypeError
472+
# and seems to dispatch to other.__rmul__?
473+
raise TypeError(f"Cannot multiply with {type(other).__name__}")
470474
freq = None
471475
if self.freq is not None and not isna(other):
472476
freq = self.freq * other
@@ -494,6 +498,10 @@ def __mul__(self, other) -> Self:
494498

495499
# numpy will accept float or int dtype, raise TypeError for others
496500
result = self._ndarray * other
501+
if result.dtype.kind != "m":
502+
# numpy >= 2.1 may not raise a TypeError
503+
# and seems to dispatch to others.__rmul__?
504+
raise TypeError(f"Cannot multiply with {type(other).__name__}")
497505
return type(self)._simple_new(result, dtype=result.dtype)
498506

499507
__rmul__ = __mul__

pandas/tests/arithmetic/test_timedelta64.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1460,7 +1460,13 @@ def test_td64arr_mul_int(self, box_with_array):
14601460
def test_td64arr_mul_tdlike_scalar_raises(self, two_hours, box_with_array):
14611461
rng = timedelta_range("1 days", "10 days", name="foo")
14621462
rng = tm.box_expected(rng, box_with_array)
1463-
msg = "argument must be an integer|cannot use operands with types dtype"
1463+
msg = "|".join(
1464+
[
1465+
"argument must be an integer",
1466+
"cannot use operands with types dtype",
1467+
"Cannot multiply with",
1468+
]
1469+
)
14641470
with pytest.raises(TypeError, match=msg):
14651471
rng * two_hours
14661472

0 commit comments

Comments
 (0)