12
12
overload ,
13
13
)
14
14
import unicodedata
15
- import warnings
16
15
17
16
import numpy as np
18
17
23
22
timezones ,
24
23
)
25
24
from pandas .compat import (
26
- HAS_PYARROW ,
27
- pa_version_under12p1 ,
25
+ pa_version_under10p1 ,
26
+ pa_version_under11p0 ,
28
27
pa_version_under13p0 ,
29
28
)
30
29
from pandas .util ._decorators import doc
31
- from pandas .util ._exceptions import find_stack_level
32
30
33
31
from pandas .core .dtypes .cast import (
34
32
can_hold_element ,
65
63
from pandas .core .arrays .masked import BaseMaskedArray
66
64
from pandas .core .arrays .string_ import StringDtype
67
65
import pandas .core .common as com
68
- from pandas .core .construction import extract_array
69
66
from pandas .core .indexers import (
70
67
check_array_indexer ,
71
68
unpack_tuple_and_ellipses ,
77
74
from pandas .io ._util import _arrow_dtype_mapping
78
75
from pandas .tseries .frequencies import to_offset
79
76
80
- if HAS_PYARROW :
77
+ if not pa_version_under10p1 :
81
78
import pyarrow as pa
82
79
import pyarrow .compute as pc
83
80
@@ -211,6 +208,16 @@ def floordiv_compat(
211
208
from pandas .core .arrays .timedeltas import TimedeltaArray
212
209
213
210
211
def get_unit_from_pa_dtype(pa_dtype) -> str:
    """
    Return the resolution unit of a unit-carrying pyarrow data type.

    Parameters
    ----------
    pa_dtype : pyarrow data type
        Type whose unit is wanted. Within this module it is called with a
        pyarrow time type.

    Returns
    -------
    str
        ``pa_dtype.unit`` when ``pa_version_under11p0`` is false; otherwise
        the unit parsed from ``str(pa_dtype)``, one of ``"s"``, ``"ms"``,
        ``"us"`` or ``"ns"``.

    Raises
    ------
    ValueError
        On the ``pa_version_under11p0`` path, when the text parsed from
        ``str(pa_dtype)`` is not one of the four units above.
    """
    # https://github.com/pandas-dev/pandas/pull/50998#discussion_r1100344804
    if not pa_version_under11p0:
        return pa_dtype.unit

    # Older pyarrow: recover the unit from the type's string form instead of
    # the ``.unit`` attribute (presumably unavailable there for some types;
    # see the discussion linked above).
    head, bracket, tail = str(pa_dtype).partition("[")
    # Text after the first "[" (or the whole string when there is none),
    # minus its final character.
    candidate = (tail if bracket else head)[:-1]
    if candidate in ("s", "ms", "us", "ns"):
        return candidate
    raise ValueError(pa_dtype)
219
+
220
+
214
221
def to_pyarrow_type (
215
222
dtype : ArrowDtype | pa .DataType | Dtype | None ,
216
223
) -> pa .DataType | None :
@@ -293,7 +300,7 @@ class ArrowExtensionArray(
293
300
_dtype : ArrowDtype
294
301
295
302
def __init__ (self , values : pa .Array | pa .ChunkedArray ) -> None :
296
- if pa_version_under12p1 :
303
+ if pa_version_under10p1 :
297
304
msg = "pyarrow>=10.0.1 is required for PyArrow backed ArrowExtensionArray."
298
305
raise ImportError (msg )
299
306
if isinstance (values , pa .Array ):
@@ -503,33 +510,6 @@ def _box_pa_array(
503
510
value = to_timedelta (value , unit = pa_type .unit ).as_unit (pa_type .unit )
504
511
value = value .to_numpy ()
505
512
506
- if pa_type is not None and pa .types .is_timestamp (pa_type ):
507
- # Use DatetimeArray to exclude Decimal(NaN) (GH#61774) and
508
- # ensure constructor treats tznaive the same as non-pyarrow
509
- # dtypes (GH#61775)
510
- from pandas .core .arrays .datetimes import (
511
- DatetimeArray ,
512
- tz_to_dtype ,
513
- )
514
-
515
- pass_dtype = tz_to_dtype (tz = pa_type .tz , unit = pa_type .unit )
516
- value = extract_array (value , extract_numpy = True )
517
- if isinstance (value , DatetimeArray ):
518
- dta = value
519
- else :
520
- dta = DatetimeArray ._from_sequence (
521
- value , copy = copy , dtype = pass_dtype
522
- )
523
- dta_mask = dta .isna ()
524
- value_i8 = cast ("npt.NDArray" , dta .view ("i8" ))
525
- if not value_i8 .flags ["WRITEABLE" ]:
526
- # e.g. test_setitem_frame_2d_values
527
- value_i8 = value_i8 .copy ()
528
- dta = DatetimeArray ._from_sequence (value_i8 , dtype = dta .dtype )
529
- value_i8 [dta_mask ] = 0 # GH#61776 avoid __sub__ overflow
530
- pa_array = pa .array (dta ._ndarray , type = pa_type , mask = dta_mask )
531
- return pa_array
532
-
533
513
try :
534
514
pa_array = pa .array (value , type = pa_type , from_pandas = True )
535
515
except (pa .ArrowInvalid , pa .ArrowTypeError ):
@@ -854,25 +834,6 @@ def _logical_method(self, other, op) -> Self:
854
834
# integer types. Otherwise these are boolean ops.
855
835
if pa .types .is_integer (self ._pa_array .type ):
856
836
return self ._evaluate_op_method (other , op , ARROW_BIT_WISE_FUNCS )
857
- elif (
858
- (
859
- pa .types .is_string (self ._pa_array .type )
860
- or pa .types .is_large_string (self ._pa_array .type )
861
- )
862
- and op in (roperator .ror_ , roperator .rand_ , roperator .rxor )
863
- and isinstance (other , np .ndarray )
864
- and other .dtype == bool
865
- ):
866
- # GH#60234 backward compatibility for the move to StringDtype in 3.0
867
- op_name = op .__name__ [1 :].strip ("_" )
868
- warnings .warn (
869
- f"'{ op_name } ' operations between boolean dtype and { self .dtype } are "
870
- "deprecated and will raise in a future version. Explicitly "
871
- "cast the strings to a boolean dtype before operating instead." ,
872
- FutureWarning ,
873
- stacklevel = find_stack_level (),
874
- )
875
- return op (other , self .astype (bool ))
876
837
else :
877
838
return self ._evaluate_op_method (other , op , ARROW_LOGICAL_FUNCS )
878
839
@@ -1238,6 +1199,10 @@ def factorize(
1238
1199
null_encoding = "mask" if use_na_sentinel else "encode"
1239
1200
1240
1201
data = self ._pa_array
1202
+ pa_type = data .type
1203
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1204
+ # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1205
+ data = data .cast (pa .int64 ())
1241
1206
1242
1207
if pa .types .is_dictionary (data .type ):
1243
1208
if null_encoding == "encode" :
@@ -1262,6 +1227,8 @@ def factorize(
1262
1227
)
1263
1228
uniques = type (self )(combined .dictionary )
1264
1229
1230
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1231
+ uniques = cast (ArrowExtensionArray , uniques .astype (self .dtype ))
1265
1232
return indices , uniques
1266
1233
1267
1234
def reshape (self , * args , ** kwargs ):
@@ -1548,7 +1515,19 @@ def unique(self) -> Self:
1548
1515
-------
1549
1516
ArrowExtensionArray
1550
1517
"""
1551
- pa_result = pc .unique (self ._pa_array )
1518
+ pa_type = self ._pa_array .type
1519
+
1520
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1521
+ # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1522
+ data = self ._pa_array .cast (pa .int64 ())
1523
+ else :
1524
+ data = self ._pa_array
1525
+
1526
+ pa_result = pc .unique (data )
1527
+
1528
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1529
+ pa_result = pa_result .cast (pa_type )
1530
+
1552
1531
return type (self )(pa_result )
1553
1532
1554
1533
def value_counts (self , dropna : bool = True ) -> Series :
@@ -1568,12 +1547,18 @@ def value_counts(self, dropna: bool = True) -> Series:
1568
1547
--------
1569
1548
Series.value_counts
1570
1549
"""
1550
+ pa_type = self ._pa_array .type
1551
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1552
+ # https://github.com/apache/arrow/issues/15226#issuecomment-1376578323
1553
+ data = self ._pa_array .cast (pa .int64 ())
1554
+ else :
1555
+ data = self ._pa_array
1556
+
1571
1557
from pandas import (
1572
1558
Index ,
1573
1559
Series ,
1574
1560
)
1575
1561
1576
- data = self ._pa_array
1577
1562
vc = data .value_counts ()
1578
1563
1579
1564
values = vc .field (0 )
@@ -1583,6 +1568,9 @@ def value_counts(self, dropna: bool = True) -> Series:
1583
1568
values = values .filter (mask )
1584
1569
counts = counts .filter (mask )
1585
1570
1571
+ if pa_version_under11p0 and pa .types .is_duration (pa_type ):
1572
+ values = values .cast (pa_type )
1573
+
1586
1574
counts = ArrowExtensionArray (counts )
1587
1575
1588
1576
index = Index (type (self )(values ))
@@ -1876,7 +1864,8 @@ def pyarrow_meth(data, skip_nulls, min_count=0): # type: ignore[misc]
1876
1864
if pa .types .is_duration (pa_type ):
1877
1865
result = result .cast (pa_type )
1878
1866
elif pa .types .is_time (pa_type ):
1879
- result = result .cast (pa .duration (pa_type .unit ))
1867
+ unit = get_unit_from_pa_dtype (pa_type )
1868
+ result = result .cast (pa .duration (unit ))
1880
1869
elif pa .types .is_date (pa_type ):
1881
1870
# go with closest available unit, i.e. "s"
1882
1871
result = result .cast (pa .duration ("s" ))
@@ -1957,10 +1946,8 @@ def _explode(self):
1957
1946
fill_value = pa .scalar ([None ], type = self ._pa_array .type )
1958
1947
mask = counts == 0
1959
1948
if mask .any ():
1960
- # pc.if_else here is similar to `values[mask] = fill_value`
1961
- # but this avoids an object-dtype round-trip.
1962
- pa_values = pc .if_else (~ mask , values ._pa_array , fill_value )
1963
- values = type (self )(pa_values )
1949
+ values = values .copy ()
1950
+ values [mask ] = fill_value
1964
1951
counts = counts .copy ()
1965
1952
counts [mask ] = 1
1966
1953
values = values .fillna (fill_value )
@@ -2969,14 +2956,6 @@ def _dt_tz_convert(self, tz) -> Self:
2969
2956
result = self ._pa_array .cast (pa .timestamp (current_unit , tz ))
2970
2957
return type (self )(result )
2971
2958
2972
- def max (self , * , skipna : bool = True , axis : int | None = 0 , ** kwargs ):
2973
- """Return the maximum value of the array."""
2974
- return self ._reduce ("max" , skipna = skipna , ** kwargs )
2975
-
2976
- def min (self , * , skipna : bool = True , axis : int | None = 0 , ** kwargs ):
2977
- """Return the minimum value of the array."""
2978
- return self ._reduce ("min" , skipna = skipna , ** kwargs )
2979
-
2980
2959
2981
2960
def transpose_homogeneous_pyarrow (
2982
2961
arrays : Sequence [ArrowExtensionArray ],
0 commit comments