Skip to content

Commit 6a23f05

Browse files
address comment
1 parent bd00fc5 commit 6a23f05

File tree

1 file changed

+21
-8
lines changed

1 file changed

+21
-8
lines changed

pandas/core/internals/construction.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from pandas._config import using_string_dtype
1818

1919
from pandas._libs import lib
20+
from pandas.compat._optional import import_optional_dependency
2021

2122
from pandas.core.dtypes.astype import astype_is_view
2223
from pandas.core.dtypes.cast import (
@@ -34,7 +35,10 @@
3435
is_object_dtype,
3536
is_scalar,
3637
)
37-
from pandas.core.dtypes.dtypes import ExtensionDtype
38+
from pandas.core.dtypes.dtypes import (
39+
ArrowDtype,
40+
ExtensionDtype,
41+
)
3842
from pandas.core.dtypes.generic import (
3943
ABCDataFrame,
4044
ABCSeries,
@@ -968,18 +972,27 @@ def convert(arr):
968972
# i.e. maybe_convert_objects didn't convert
969973
arr = maybe_infer_to_datetimelike(arr)
970974
if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
971-
new_dtype = StringDtype()
972-
arr_cls = new_dtype.construct_array_type()
973-
try:
975+
if dtype_backend == "pyarrow":
976+
pa = import_optional_dependency("pyarrow")
974977
# Addressing (#59242)
975978
# Byte data that could not be decoded into
976979
# a string would throw a UnicodeDecodeError exception
977980

978-
# Try and greedily convert to string
979-
# Will fail if the object is bytes
981+
# Try and greedily convert to pyarrow string
982+
# Will fail if the object is bytes:
983+
# in this case convert to pyarrow binary
984+
try:
985+
str_dtype = ArrowDtype(pa.string())
986+
str_cls = str_dtype.construct_array_type()
987+
arr = str_cls._from_sequence(arr, dtype=str_dtype)
988+
except pa.lib.ArrowInvalid:
989+
bin_dtype = ArrowDtype(pa.binary())
990+
bin_cls = bin_dtype.construct_array_type()
991+
arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
992+
else:
993+
new_dtype = StringDtype()
994+
arr_cls = new_dtype.construct_array_type()
980995
arr = arr_cls._from_sequence(arr, dtype=new_dtype)
981-
except UnicodeDecodeError:
982-
pass
983996

984997
elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
985998
if arr.dtype.kind in "iufb":

0 commit comments

Comments
 (0)