@@ -332,6 +332,36 @@ def _from_scalars(cls, scalars, dtype: DtypeObj) -> Self:
332
332
raise ValueError
333
333
return cls ._from_sequence (scalars , dtype = dtype )
334
334
335
+ def _str_map_str_or_object (
336
+ self ,
337
+ dtype ,
338
+ na_value ,
339
+ arr : np .ndarray ,
340
+ f ,
341
+ mask : npt .NDArray [np .bool_ ],
342
+ convert : bool ,
343
+ ):
344
+ # _str_map helper for case where dtype is either string dtype or object
345
+ if is_string_dtype (dtype ) and not is_object_dtype (dtype ):
346
+ # i.e. StringDtype
347
+ result = lib .map_infer_mask (
348
+ arr , f , mask .view ("uint8" ), convert = False , na_value = na_value
349
+ )
350
+ if self .dtype .storage == "pyarrow" :
351
+ import pyarrow as pa
352
+
353
+ result = pa .array (
354
+ result , mask = mask , type = pa .large_string (), from_pandas = True
355
+ )
356
+ return type (self )(result )
357
+
358
+ else :
359
+ # This is when the result type is object. We reach this when
360
+ # -> We know the result type is truly object (e.g. .encode returns bytes
361
+ # or .findall returns a list).
362
+ # -> We don't know the result type. E.g. `.get` can return anything.
363
+ return lib .map_infer_mask (arr , f , mask .view ("uint8" ))
364
+
335
365
336
366
# error: Definition of "_concat_same_type" in base class "NDArrayBacked" is
337
367
# incompatible with definition in base class "ExtensionArray"
@@ -697,9 +727,53 @@ def _cmp_method(self, other, op):
697
727
# base class "NumpyExtensionArray" defined the type as "float")
698
728
_str_na_value = libmissing .NA # type: ignore[assignment]
699
729
730
+ def _str_map_nan_semantics (
731
+ self , f , na_value = None , dtype : Dtype | None = None , convert : bool = True
732
+ ):
733
+ if dtype is None :
734
+ dtype = self .dtype
735
+ if na_value is None :
736
+ na_value = self .dtype .na_value
737
+
738
+ mask = isna (self )
739
+ arr = np .asarray (self )
740
+ convert = convert and not np .all (mask )
741
+
742
+ if is_integer_dtype (dtype ) or is_bool_dtype (dtype ):
743
+ na_value_is_na = isna (na_value )
744
+ if na_value_is_na :
745
+ if is_integer_dtype (dtype ):
746
+ na_value = 0
747
+ else :
748
+ na_value = True
749
+
750
+ result = lib .map_infer_mask (
751
+ arr ,
752
+ f ,
753
+ mask .view ("uint8" ),
754
+ convert = False ,
755
+ na_value = na_value ,
756
+ dtype = np .dtype (cast (type , dtype )),
757
+ )
758
+ if na_value_is_na and mask .any ():
759
+ if is_integer_dtype (dtype ):
760
+ result = result .astype ("float64" )
761
+ else :
762
+ result = result .astype ("object" )
763
+ result [mask ] = np .nan
764
+ return result
765
+
766
+ else :
767
+ return self ._str_map_str_or_object (dtype , na_value , arr , f , mask , convert )
768
+
700
769
def _str_map (
701
770
self , f , na_value = None , dtype : Dtype | None = None , convert : bool = True
702
771
):
772
+ if self .dtype .na_value is np .nan :
773
+ return self ._str_map_nan_semantics (
774
+ f , na_value = na_value , dtype = dtype , convert = convert
775
+ )
776
+
703
777
from pandas .arrays import BooleanArray
704
778
705
779
if dtype is None :
@@ -739,18 +813,8 @@ def _str_map(
739
813
740
814
return constructor (result , mask )
741
815
742
- elif is_string_dtype (dtype ) and not is_object_dtype (dtype ):
743
- # i.e. StringDtype
744
- result = lib .map_infer_mask (
745
- arr , f , mask .view ("uint8" ), convert = False , na_value = na_value
746
- )
747
- return StringArray (result )
748
816
else :
749
- # This is when the result type is object. We reach this when
750
- # -> We know the result type is truly object (e.g. .encode returns bytes
751
- # or .findall returns a list).
752
- # -> We don't know the result type. E.g. `.get` can return anything.
753
- return lib .map_infer_mask (arr , f , mask .view ("uint8" ))
817
+ return self ._str_map_str_or_object (dtype , na_value , arr , f , mask , convert )
754
818
755
819
756
820
class StringArrayNumpySemantics (StringArray ):
@@ -817,52 +881,3 @@ def value_counts(self, dropna: bool = True) -> Series:
817
881
# ------------------------------------------------------------------------
818
882
# String methods interface
819
883
_str_na_value = np .nan
820
-
821
- def _str_map (
822
- self , f , na_value = None , dtype : Dtype | None = None , convert : bool = True
823
- ):
824
- if dtype is None :
825
- dtype = self .dtype
826
- if na_value is None :
827
- na_value = self .dtype .na_value
828
-
829
- mask = isna (self )
830
- arr = np .asarray (self )
831
- convert = convert and not np .all (mask )
832
-
833
- if is_integer_dtype (dtype ) or is_bool_dtype (dtype ):
834
- na_value_is_na = isna (na_value )
835
- if na_value_is_na :
836
- if is_integer_dtype (dtype ):
837
- na_value = 0
838
- else :
839
- na_value = True
840
-
841
- result = lib .map_infer_mask (
842
- arr ,
843
- f ,
844
- mask .view ("uint8" ),
845
- convert = False ,
846
- na_value = na_value ,
847
- dtype = np .dtype (cast (type , dtype )),
848
- )
849
- if na_value_is_na and mask .any ():
850
- if is_integer_dtype (dtype ):
851
- result = result .astype ("float64" )
852
- else :
853
- result = result .astype ("object" )
854
- result [mask ] = np .nan
855
- return result
856
-
857
- elif is_string_dtype (dtype ) and not is_object_dtype (dtype ):
858
- # i.e. StringDtype
859
- result = lib .map_infer_mask (
860
- arr , f , mask .view ("uint8" ), convert = False , na_value = na_value
861
- )
862
- return type (self )(result )
863
- else :
864
- # This is when the result type is object. We reach this when
865
- # -> We know the result type is truly object (e.g. .encode returns bytes
866
- # or .findall returns a list).
867
- # -> We don't know the result type. E.g. `.get` can return anything.
868
- return lib .map_infer_mask (arr , f , mask .view ("uint8" ))
0 commit comments