1616import numpy as np
1717
1818from pandas ._libs import lib
19+ from pandas ._libs .missing import NA
1920from pandas ._libs .tslibs import (
2021 Timedelta ,
2122 Timestamp ,
@@ -351,7 +352,7 @@ def _from_sequence_of_strings(
351352 # duration to string casting behavior
352353 mask = isna (scalars )
353354 if not isinstance (strings , (pa .Array , pa .ChunkedArray )):
354- strings = pa .array (strings , type = pa .string (), from_pandas = True )
355+ strings = pa .array (strings , type = pa .string ())
355356 strings = pc .if_else (mask , None , strings )
356357 try :
357358 scalars = strings .cast (pa .int64 ())
@@ -372,7 +373,7 @@ def _from_sequence_of_strings(
372373 if isinstance (strings , (pa .Array , pa .ChunkedArray )):
373374 scalars = strings
374375 else :
375- scalars = pa .array (strings , type = pa .string (), from_pandas = True )
376+ scalars = pa .array (strings , type = pa .string ())
376377 scalars = pc .if_else (pc .equal (scalars , "1.0" ), "1" , scalars )
377378 scalars = pc .if_else (pc .equal (scalars , "0.0" ), "0" , scalars )
378379 scalars = scalars .cast (pa .bool_ ())
@@ -384,6 +385,13 @@ def _from_sequence_of_strings(
384385 from pandas .core .tools .numeric import to_numeric
385386
386387 scalars = to_numeric (strings , errors = "raise" )
388+ if not pa .types .is_decimal (pa_type ):
389+ # TODO: figure out why doing this cast breaks with decimal dtype
390+ # in test_from_sequence_of_strings_pa_array
391+ mask = strings .is_null ()
392+ scalars = pa .array (scalars , mask = np .array (mask ), type = pa_type )
393+ # TODO: could we just do strings.cast(pa_type)?
394+
387395 else :
388396 raise NotImplementedError (
389397 f"Converting strings to { pa_type } is not implemented."
@@ -426,7 +434,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
426434 """
427435 if isinstance (value , pa .Scalar ):
428436 pa_scalar = value
429- elif isna (value ):
437+ elif isna (value ) and not lib . is_float ( value ) :
430438 pa_scalar = pa .scalar (None , type = pa_type )
431439 else :
432440 # Workaround https://github.com/apache/arrow/issues/37291
@@ -443,7 +451,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
443451 value = value .as_unit (pa_type .unit )
444452 value = value ._value
445453
446- pa_scalar = pa .scalar (value , type = pa_type , from_pandas = True )
454+ pa_scalar = pa .scalar (value , type = pa_type )
447455
448456 if pa_type is not None and pa_scalar .type != pa_type :
449457 pa_scalar = pa_scalar .cast (pa_type )
@@ -475,6 +483,13 @@ def _box_pa_array(
475483 if copy :
476484 value = value .copy ()
477485 pa_array = value .__arrow_array__ ()
486+
487+ elif hasattr (value , "__arrow_array__" ):
488+ # e.g. StringArray
489+ if copy :
490+ value = value .copy ()
491+ pa_array = value .__arrow_array__ ()
492+
478493 else :
479494 if (
480495 isinstance (value , np .ndarray )
@@ -528,19 +543,32 @@ def _box_pa_array(
528543 pa_array = pa .array (dta ._ndarray , type = pa_type , mask = mask )
529544 return pa_array
530545
546+ mask = None
547+ if getattr (value , "dtype" , None ) is None or value .dtype .kind not in "mfM" :
548+ # similar to isna(value) but exclude NaN
549+ # TODO: cythonize!
550+ mask = np .array ([x is NA or x is None for x in value ], dtype = bool )
551+
552+ from_pandas = False
553+ if pa .types .is_integer (pa_type ):
554+ # If user specifically asks to cast a numpy float array with NaNs
555+ # to pyarrow integer, we'll treat those NaNs as NA
556+ from_pandas = True
531557 try :
532- pa_array = pa .array (value , type = pa_type , from_pandas = True )
558+ pa_array = pa .array (
559+ value , type = pa_type , mask = mask , from_pandas = from_pandas
560+ )
533561 except (pa .ArrowInvalid , pa .ArrowTypeError ):
534562 # GH50430: let pyarrow infer type, then cast
535- pa_array = pa .array (value , from_pandas = True )
563+ pa_array = pa .array (value , mask = mask , from_pandas = from_pandas )
536564
537565 if pa_type is None and pa .types .is_duration (pa_array .type ):
538566 # Workaround https://github.com/apache/arrow/issues/37291
539567 from pandas .core .tools .timedeltas import to_timedelta
540568
541569 value = to_timedelta (value )
542570 value = value .to_numpy ()
543- pa_array = pa .array (value , type = pa_type , from_pandas = True )
571+ pa_array = pa .array (value , type = pa_type )
544572
545573 if pa .types .is_duration (pa_array .type ) and pa_array .null_count > 0 :
546574 # GH52843: upstream bug for duration types when originally
@@ -1187,7 +1215,7 @@ def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
11871215 if not len (values ):
11881216 return np .zeros (len (self ), dtype = bool )
11891217
1190- result = pc .is_in (self ._pa_array , value_set = pa .array (values , from_pandas = True ))
1218+ result = pc .is_in (self ._pa_array , value_set = pa .array (values ))
11911219 # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
11921220 # to False
11931221 return np .array (result , dtype = np .bool_ )
@@ -1992,7 +2020,7 @@ def __setitem__(self, key, value) -> None:
19922020 raise ValueError ("Length of indexer and values mismatch" )
19932021 chunks = [
19942022 * self ._pa_array [:key ].chunks ,
1995- pa .array ([value ], type = self ._pa_array .type , from_pandas = True ),
2023+ pa .array ([value ], type = self ._pa_array .type ),
19962024 * self ._pa_array [key + 1 :].chunks ,
19972025 ]
19982026 data = pa .chunked_array (chunks ).combine_chunks ()
@@ -2046,7 +2074,7 @@ def _rank_calc(
20462074 pa_type = pa .float64 ()
20472075 else :
20482076 pa_type = pa .uint64 ()
2049- result = pa .array (ranked , type = pa_type , from_pandas = True )
2077+ result = pa .array (ranked , type = pa_type )
20502078 return result
20512079
20522080 data = self ._pa_array .combine_chunks ()
@@ -2298,7 +2326,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
22982326 right , right_type = _to_numpy_and_type (right )
22992327 pa_type = left_type or right_type
23002328 result = np .where (cond , left , right )
2301- return pa .array (result , type = pa_type , from_pandas = True )
2329+ return pa .array (result , type = pa_type )
23022330
23032331 @classmethod
23042332 def _replace_with_mask (
@@ -2341,7 +2369,7 @@ def _replace_with_mask(
23412369 replacements = replacements .as_py ()
23422370 result = np .array (values , dtype = object )
23432371 result [mask ] = replacements
2344- return pa .array (result , type = values .type , from_pandas = True )
2372+ return pa .array (result , type = values .type )
23452373
23462374 # ------------------------------------------------------------------
23472375 # GroupBy Methods
@@ -2420,7 +2448,7 @@ def _groupby_op(
24202448 return type (self )(pa_result )
24212449 else :
24222450 # DatetimeArray, TimedeltaArray
2423- pa_result = pa .array (result , from_pandas = True )
2451+ pa_result = pa .array (result )
24242452 return type (self )(pa_result )
24252453
24262454 def _apply_elementwise (self , func : Callable ) -> list [list [Any ]]:
0 commit comments