@@ -17,6 +17,7 @@
import numpy as np

from pandas._libs import lib
+from pandas._libs.missing import NA
from pandas._libs.tslibs import (
    Timedelta,
    Timestamp,
@@ -353,7 +354,7 @@ def _from_sequence_of_strings(
                # duration to string casting behavior
                mask = isna(scalars)
                if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
-                    strings = pa.array(strings, type=pa.string(), from_pandas=True)
+                    strings = pa.array(strings, type=pa.string())
                strings = pc.if_else(mask, None, strings)
                try:
                    scalars = strings.cast(pa.int64())
@@ -374,7 +375,7 @@ def _from_sequence_of_strings(
            if isinstance(strings, (pa.Array, pa.ChunkedArray)):
                scalars = strings
            else:
-                scalars = pa.array(strings, type=pa.string(), from_pandas=True)
+                scalars = pa.array(strings, type=pa.string())
            scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
            scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
            scalars = scalars.cast(pa.bool_())
@@ -386,6 +387,13 @@ def _from_sequence_of_strings(
            from pandas.core.tools.numeric import to_numeric

            scalars = to_numeric(strings, errors="raise")
+            if not pa.types.is_decimal(pa_type):
+                # TODO: figure out why doing this cast breaks with decimal dtype
+                # in test_from_sequence_of_strings_pa_array
+                mask = strings.is_null()
+                scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
+                # TODO: could we just do strings.cast(pa_type)?
+
        else:
            raise NotImplementedError(
                f"Converting strings to {pa_type} is not implemented."
            )
@@ -428,7 +436,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
        """
        if isinstance(value, pa.Scalar):
            pa_scalar = value
-        elif isna(value):
+        elif isna(value) and not lib.is_float(value):
            pa_scalar = pa.scalar(None, type=pa_type)
        else:
            # Workaround https://github.com/apache/arrow/issues/37291
@@ -445,7 +453,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
                    value = value.as_unit(pa_type.unit)
                value = value._value

-            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+            pa_scalar = pa.scalar(value, type=pa_type)

        if pa_type is not None and pa_scalar.type != pa_type:
            pa_scalar = pa_scalar.cast(pa_type)
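Together, these two `_box_pa_scalar` hunks change how a float NaN is boxed: `isna` is true for NaN, but the new guard keeps NaN on the value path, and without `from_pandas=True` pyarrow stores it as a real NaN rather than a null. A minimal sketch of the pyarrow behaviour involved (illustrative only):

```python
import numpy as np
import pyarrow as pa

# NaN boxed as a value: a valid float64 scalar holding NaN
print(pa.scalar(np.nan, type=pa.float64()).is_valid)   # True

# Genuinely missing input (None) boxed as a null scalar
print(pa.scalar(None, type=pa.float64()).is_valid)     # False
```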
@@ -477,6 +485,13 @@ def _box_pa_array(
            if copy:
                value = value.copy()
            pa_array = value.__arrow_array__()
+
+        elif hasattr(value, "__arrow_array__"):
+            # e.g. StringArray
+            if copy:
+                value = value.copy()
+            pa_array = value.__arrow_array__()
+
        else:
            if (
                isinstance(value, np.ndarray)
@@ -530,19 +545,32 @@ def _box_pa_array(
                pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask)
                return pa_array

+            mask = None
+            if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
+                # similar to isna(value) but exclude NaN
+                # TODO: cythonize!
+                mask = np.array([x is NA or x is None for x in value], dtype=bool)
+
+            from_pandas = False
+            if pa.types.is_integer(pa_type):
+                # If user specifically asks to cast a numpy float array with NaNs
+                # to pyarrow integer, we'll treat those NaNs as NA
+                from_pandas = True
            try:
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(
+                    value, type=pa_type, mask=mask, from_pandas=from_pandas
+                )
            except (pa.ArrowInvalid, pa.ArrowTypeError):
                # GH50430: let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
+                pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)

            if pa_type is None and pa.types.is_duration(pa_array.type):
                # Workaround https://github.com/apache/arrow/issues/37291
                from pandas.core.tools.timedeltas import to_timedelta

                value = to_timedelta(value)
                value = value.to_numpy()
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(value, type=pa_type)

            if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                # GH52843: upstream bug for duration types when originally
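The explicit mask plus the conditional `from_pandas` keep the old NaN-as-missing handling only where it is still wanted. A minimal sketch of the pyarrow semantics being relied on, with illustrative values that are not part of the diff:

```python
import numpy as np
import pyarrow as pa

floats = np.array([1.0, np.nan, 3.0])

# An explicit mask marks slots as null; an unmasked NaN stays a NaN value.
pa.array(floats, type=pa.float64(), mask=np.array([False, False, True]))
# -> [1, nan, null]

# For an integer target NaN has no representation, so from_pandas=True lets
# pyarrow treat the NaN as missing instead of failing the conversion.
pa.array(floats, type=pa.int64(), from_pandas=True)
# -> [1, null, 3]
```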
@@ -1208,7 +1236,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
        if not len(values):
            return np.zeros(len(self), dtype=bool)

-        result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
+        result = pc.is_in(self._pa_array, value_set=pa.array(values))
        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
        # to False
        return np.array(result, dtype=np.bool_)
@@ -2015,7 +2043,7 @@ def __setitem__(self, key, value) -> None:
                raise ValueError("Length of indexer and values mismatch")
            chunks = [
                *self._pa_array[:key].chunks,
-                pa.array([value], type=self._pa_array.type, from_pandas=True),
+                pa.array([value], type=self._pa_array.type),
                *self._pa_array[key + 1 :].chunks,
            ]
            data = pa.chunked_array(chunks).combine_chunks()
@@ -2069,7 +2097,7 @@ def _rank_calc(
                pa_type = pa.float64()
            else:
                pa_type = pa.uint64()
-            result = pa.array(ranked, type=pa_type, from_pandas=True)
+            result = pa.array(ranked, type=pa_type)
            return result

        data = self._pa_array.combine_chunks()
@@ -2321,7 +2349,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
        right, right_type = _to_numpy_and_type(right)
        pa_type = left_type or right_type
        result = np.where(cond, left, right)
-        return pa.array(result, type=pa_type, from_pandas=True)
+        return pa.array(result, type=pa_type)

    @classmethod
    def _replace_with_mask(
@@ -2364,7 +2392,7 @@ def _replace_with_mask(
            replacements = replacements.as_py()
        result = np.array(values, dtype=object)
        result[mask] = replacements
-        return pa.array(result, type=values.type, from_pandas=True)
+        return pa.array(result, type=values.type)

    # ------------------------------------------------------------------
    # GroupBy Methods
@@ -2443,7 +2471,7 @@ def _groupby_op(
            return type(self)(pa_result)
        else:
            # DatetimeArray, TimedeltaArray
-            pa_result = pa.array(result, from_pandas=True)
+            pa_result = pa.array(result)
            return type(self)(pa_result)

    def _apply_elementwise(self, func: Callable) -> list[list[Any]]: