|
17 | 17 | from pandas._config import using_string_dtype |
18 | 18 |
|
19 | 19 | from pandas._libs import lib |
| 20 | +from pandas.compat._optional import import_optional_dependency |
20 | 21 |
|
21 | 22 | from pandas.core.dtypes.astype import astype_is_view |
22 | 23 | from pandas.core.dtypes.cast import ( |
|
34 | 35 | is_object_dtype, |
35 | 36 | is_scalar, |
36 | 37 | ) |
37 | | -from pandas.core.dtypes.dtypes import ExtensionDtype |
| 38 | +from pandas.core.dtypes.dtypes import ( |
| 39 | + ArrowDtype, |
| 40 | + ExtensionDtype, |
| 41 | +) |
38 | 42 | from pandas.core.dtypes.generic import ( |
39 | 43 | ABCDataFrame, |
40 | 44 | ABCSeries, |
@@ -968,18 +972,27 @@ def convert(arr): |
968 | 972 | # i.e. maybe_convert_objects didn't convert |
969 | 973 | arr = maybe_infer_to_datetimelike(arr) |
970 | 974 | if dtype_backend != "numpy" and arr.dtype == np.dtype("O"): |
971 | | - new_dtype = StringDtype() |
972 | | - arr_cls = new_dtype.construct_array_type() |
973 | | - try: |
| 975 | + if dtype_backend == "pyarrow": |
| 976 | + pa = import_optional_dependency("pyarrow") |
974 | 977 | # Addressing (#59242) |
975 | 978 | # Byte data that could not be decoded into |
976 | 979 | # a string would throw a UnicodeDecodeError exception |
977 | 980 |
|
978 | | - # Try and greedily convert to string |
979 | | - # Will fail if the object is bytes |
| 981 | + # Try and greedily convert to pyarrow string |
| 982 | + # Will fail if the object is bytes: |
| 983 | + # in this case convert to pyarrow binary |
| 984 | + try: |
| 985 | + str_dtype = ArrowDtype(pa.string()) |
| 986 | + str_cls = str_dtype.construct_array_type() |
| 987 | + arr = str_cls._from_sequence(arr, dtype=str_dtype) |
| 988 | + except pa.lib.ArrowInvalid: |
| 989 | + bin_dtype = ArrowDtype(pa.binary()) |
| 990 | + bin_cls = bin_dtype.construct_array_type() |
| 991 | + arr = bin_cls._from_sequence(arr, dtype=bin_dtype) |
| 992 | + else: |
| 993 | + new_dtype = StringDtype() |
| 994 | + arr_cls = new_dtype.construct_array_type() |
980 | 995 | arr = arr_cls._from_sequence(arr, dtype=new_dtype) |
981 | | - except UnicodeDecodeError: |
982 | | - pass |
983 | 996 |
|
984 | 997 | elif dtype_backend != "numpy" and isinstance(arr, np.ndarray): |
985 | 998 | if arr.dtype.kind in "iufb": |
|
0 commit comments