from __future__ import annotations

+ import operator
from typing import (
    TYPE_CHECKING,
+     Any,
    ClassVar,
    Literal,
    cast,
)

import numpy as np

- from pandas._config import get_option
+ from pandas._config import (
+     get_option,
+     using_pyarrow_string_dtype,
+ )

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._libs.lib import ensure_string_array
- from pandas.compat import pa_version_under10p1
+ from pandas.compat import (
+     HAS_PYARROW,
+     pa_version_under10p1,
+ )
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

@@ -81,7 +89,7 @@ class StringDtype(StorageExtensionDtype):

    Parameters
    ----------
-     storage : {"python", "pyarrow", "pyarrow_numpy"}, optional
+     storage : {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
@@ -113,7 +121,7 @@ class StringDtype(StorageExtensionDtype):
    # follows NumPy semantics, which uses nan.
    @property
    def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]
-         if self.storage == "pyarrow_numpy":
+         if self.storage in ("pyarrow_numpy", "python_numpy"):
            return np.nan
        else:
            return libmissing.NA
@@ -122,15 +130,17 @@ def na_value(self) -> libmissing.NAType | float:  # type: ignore[override]

    def __init__(self, storage=None) -> None:
        if storage is None:
-             infer_string = get_option("future.infer_string")
-             if infer_string:
-                 storage = "pyarrow_numpy"
+             if using_pyarrow_string_dtype():
+                 if HAS_PYARROW:
+                     storage = "pyarrow_numpy"
+                 else:
+                     storage = "python_numpy"
            else:
                storage = get_option("mode.string_storage")
-         if storage not in {"python", "pyarrow", "pyarrow_numpy"}:
+         if storage not in {"python", "pyarrow", "python_numpy", "pyarrow_numpy"}:
            raise ValueError(
-                 f"Storage must be 'python', 'pyarrow' or 'pyarrow_numpy'. "
-                 f"Got {storage} instead."
+                 "Storage must be 'python', 'pyarrow', 'python_numpy' or 'pyarrow_numpy'"
+                 f". Got {storage} instead."
            )
        if storage in ("pyarrow", "pyarrow_numpy") and pa_version_under10p1:
            raise ImportError(
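# Illustrative usage sketch (outside the patch): with the resolution logic in
# the hunk above, the default storage under the future string option is
# expected to fall back to "python_numpy" when pyarrow is not installed,
# instead of hitting the pyarrow version check below.
import pandas as pd

pd.set_option("future.infer_string", True)
dtype = pd.StringDtype()  # storage=None -> resolved from the options above
print(dtype.storage)   # "pyarrow_numpy" with pyarrow installed, else "python_numpy"
print(dtype.na_value)  # nan in both cases (NumPy missing-value semantics)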
@@ -178,6 +188,8 @@ def construct_from_string(cls, string) -> Self:
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
+         elif string == "string[python_numpy]":
+             return cls(storage="python_numpy")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        elif string == "string[pyarrow_numpy]":
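# Illustrative usage sketch (outside the patch): the new "string[python_numpy]"
# alias is expected to round-trip through the usual dtype machinery on this branch.
import pandas as pd

ser = pd.Series(["a", "b", None], dtype="string[python_numpy]")
print(ser.dtype.storage)   # "python_numpy"
print(ser.dtype.na_value)  # nan rather than pd.NA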
@@ -207,6 +219,8 @@ def construct_array_type(  # type: ignore[override]
            return StringArray
        elif self.storage == "pyarrow":
            return ArrowStringArray
+         elif self.storage == "python_numpy":
+             return StringArrayNumpySemantics
        else:
            return ArrowStringArrayNumpySemantics
@@ -238,7 +252,7 @@ def __from_arrow__(
            # convert chunk by chunk to numpy and concatenate then, to avoid
            # overflow for large string data when concatenating the pyarrow arrays
            arr = arr.to_numpy(zero_copy_only=False)
-             arr = ensure_string_array(arr, na_value=libmissing.NA)
+             arr = ensure_string_array(arr, na_value=self.na_value)
            results.append(arr)

        if len(chunks) == 0:
@@ -248,11 +262,7 @@ def __from_arrow__(

        # Bypass validation inside StringArray constructor, see GH#47781
        new_string_array = StringArray.__new__(StringArray)
-         NDArrayBacked.__init__(
-             new_string_array,
-             arr,
-             StringDtype(storage="python"),
-         )
+         NDArrayBacked.__init__(new_string_array, arr, self)
        return new_string_array

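# Illustrative usage sketch (outside the patch): passing ``self`` (and
# ``self.na_value``) instead of a hard-coded StringDtype(storage="python")
# should let a round-trip through pyarrow keep the NumPy-semantics dtype.
# Assumes pyarrow is installed.
import pyarrow as pa
import pandas as pd

dtype = pd.StringDtype(storage="python_numpy")
arr = dtype.__from_arrow__(pa.chunked_array([["a", "b", None]]))
print(arr.dtype.storage)  # "python_numpy"; the missing entry is stored as nan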
@@ -360,14 +370,15 @@ class StringArray(BaseStringArray, NumpyExtensionArray):  # type: ignore[misc]

    # undo the NumpyExtensionArray hack
    _typ = "extension"
+     _storage = "python"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
-         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))
+         NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage=self._storage))

    def _validate(self) -> None:
        """Validate that we only store NA or strings."""
@@ -385,22 +396,41 @@ def _validate(self) -> None:
        else:
            lib.convert_nans_to_NA(self._ndarray)

+     def _validate_scalar(self, value):
+         # used by NDArrayBackedExtensionIndex.insert
+         if isna(value):
+             return self.dtype.na_value
+         elif not isinstance(value, str):
+             raise TypeError(
+                 f"Cannot set non-string value '{value}' into a string array."
+             )
+         return value
+
    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> Self:
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
-             assert isinstance(dtype, StringDtype) and dtype.storage == "python"
+             assert isinstance(dtype, StringDtype) and dtype.storage in (
+                 "python",
+                 "python_numpy",
+             )
+         else:
+             if get_option("future.infer_string"):
+                 dtype = StringDtype(storage="python_numpy")
+             else:
+                 dtype = StringDtype(storage="python")

        from pandas.core.arrays.masked import BaseMaskedArray

+         na_value = dtype.na_value
        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
-             result[na_values] = libmissing.NA
+             result[na_values] = na_value

        else:
            if lib.is_pyarrow_array(scalars):
@@ -409,12 +439,12 @@ def _from_sequence(
                # zero_copy_only to True which caused problems see GH#52076
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
-             result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
+             result = lib.ensure_string_array(scalars, na_value=na_value, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
-         NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))
+         NDArrayBacked.__init__(new_string_array, result, dtype)

        return new_string_array
@@ -464,7 +494,7 @@ def __setitem__(self, key, value) -> None:
        # validate new items
        if scalar_value:
            if isna(value):
-                 value = libmissing.NA
+                 value = self.dtype.na_value
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
478
508
mask = isna (value )
479
509
if mask .any ():
480
510
value = value .copy ()
481
- value [isna (value )] = libmissing . NA
511
+ value [isna (value )] = self . dtype . na_value
482
512
483
513
super ().__setitem__ (key , value )
484
514
@@ -591,9 +621,9 @@ def _cmp_method(self, other, op):

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
-             result[mask] = libmissing.NA
+             result[mask] = self.dtype.na_value
            result[valid] = op(self._ndarray[valid], other)
-             return StringArray(result)
+             return self._from_backing_data(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
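# Illustrative usage sketch (outside the patch): returning
# self._from_backing_data(result) instead of StringArray(result) should let
# string arithmetic (e.g. concatenation) keep the NumPy-semantics subclass and
# its nan missing value rather than switching back to pd.NA semantics.
import pandas as pd

ser = pd.Series(["a", None], dtype="string[python_numpy]")
out = ser + "_x"
print(out.dtype.storage)  # still "python_numpy"
print(out[1])             # nan, not <NA>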
@@ -662,3 +692,97 @@ def _str_map(
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
+
+
+ class StringArrayNumpySemantics(StringArray):
+     _storage = "python_numpy"
+
+     @classmethod
+     def _from_sequence(
+         cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
+     ) -> Self:
+         if dtype is None:
+             dtype = StringDtype(storage="python_numpy")
+         return super()._from_sequence(scalars, dtype=dtype, copy=copy)
+
+     def _from_backing_data(self, arr: np.ndarray) -> NumpyExtensionArray:
+         # need to override NumpyExtensionArray._from_backing_data to ensure
+         # we always preserve the dtype
+         return NDArrayBacked._from_backing_data(self, arr)
+
+     def _wrap_reduction_result(self, axis: AxisInt | None, result) -> Any:
+         # the masked_reductions use pd.NA
+         if result is libmissing.NA:
+             return np.nan
+         return super()._wrap_reduction_result(axis, result)
+
+     def _cmp_method(self, other, op):
+         result = super()._cmp_method(other, op)
+         if op == operator.ne:
+             return result.to_numpy(np.bool_, na_value=True)
+         else:
+             return result.to_numpy(np.bool_, na_value=False)
+
+     def value_counts(self, dropna: bool = True) -> Series:
+         from pandas.core.algorithms import value_counts_internal as value_counts
+
+         result = value_counts(self._ndarray, sort=False, dropna=dropna)
+         result.index = result.index.astype(self.dtype)
+         return result
+
+     # ------------------------------------------------------------------------
+     # String methods interface
+     _str_na_value = np.nan
+
+     def _str_map(
+         self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
+     ):
+         if dtype is None:
+             dtype = self.dtype
+         if na_value is None:
+             na_value = self.dtype.na_value
+
+         mask = isna(self)
+         arr = np.asarray(self)
+         convert = convert and not np.all(mask)
+
+         if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+             # if is_integer_dtype(dtype):
+             #     na_value = np.nan
+             # else:
+             #     na_value = False
+             try:
+                 result = lib.map_infer_mask(
+                     arr,
+                     f,
+                     mask.view("uint8"),
+                     convert=False,
+                     na_value=na_value,
+                     dtype=np.dtype(cast(type, dtype)),
+                 )
+                 return result
+
+             except ValueError:
+                 result = lib.map_infer_mask(
+                     arr,
+                     f,
+                     mask.view("uint8"),
+                     convert=False,
+                     na_value=na_value,
+                 )
+                 if convert and result.dtype == object:
+                     result = lib.maybe_convert_objects(result)
+                 return result
+
+         elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+             # i.e. StringDtype
+             result = lib.map_infer_mask(
+                 arr, f, mask.view("uint8"), convert=False, na_value=na_value
+             )
+             return type(self)(result)
+         else:
+             # This is when the result type is object. We reach this when
+             # -> We know the result type is truly object (e.g. .encode returns bytes
+             #    or .findall returns a list).
+             # -> We don't know the result type. E.g. `.get` can return anything.
+             return lib.map_infer_mask(arr, f, mask.view("uint8"))
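# Illustrative usage sketch (outside the patch): the overrides above are meant
# to give the python-backed array the same NumPy-style semantics as the
# pyarrow_numpy variant -- comparisons densify to plain bool arrays and
# value_counts keeps the string dtype on the index.
import pandas as pd

ser = pd.Series(["a", None, "b"], dtype="string[python_numpy]")
print((ser == "a").to_numpy())  # [ True False False] -- missing compares as False
print((ser != "a").to_numpy())  # [False  True  True] -- and as True for !=
print(ser.value_counts().index.dtype.storage)  # "python_numpy"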