Skip to content

Commit 8f900b8

Browse files
also fix for dtype_backend=numpy_nullable
1 parent 2f261c8 commit 8f900b8

File tree

2 files changed

+17
-13
lines changed

pandas/core/internals/construction.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -972,27 +972,30 @@ def convert(arr):
972972
# i.e. maybe_convert_objects didn't convert
973973
arr = maybe_infer_to_datetimelike(arr)
974974
if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
975+
# Addressing (#59242)
976+
# Byte data that could not be decoded into
977+
# a string would raise a UnicodeDecodeError
978+
979+
# Try to greedily convert to a string dtype
975980
if dtype_backend == "pyarrow":
976981
pa = import_optional_dependency("pyarrow")
977-
# Addressing (#59242)
978-
# Byte data that could not be decoded into
979-
# a string would throw a UnicodeDecodeError exception
980-
981-
# Try and greedily convert to pyarrow string
982-
# Will fail if the object is bytes:
983-
# in this case convert to pyarrow binary
984982
try:
985983
str_dtype = ArrowDtype(pa.string())
986984
str_cls = str_dtype.construct_array_type()
987985
arr = str_cls._from_sequence(arr, dtype=str_dtype)
988986
except pa.lib.ArrowInvalid:
987+
# ArrowInvalid: arr is not valid string data (e.g. bytes), so convert to pyarrow binary
989988
bin_dtype = ArrowDtype(pa.binary())
990989
bin_cls = bin_dtype.construct_array_type()
991990
arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
992991
else:
993-
new_dtype = StringDtype()
994-
arr_cls = new_dtype.construct_array_type()
995-
arr = arr_cls._from_sequence(arr, dtype=new_dtype)
992+
try:
993+
new_dtype = StringDtype()
994+
arr_cls = new_dtype.construct_array_type()
995+
arr = arr_cls._from_sequence(arr, dtype=new_dtype)
996+
except UnicodeDecodeError:
997+
# bytes that cannot be decoded as a string: leave arr unchanged (object dtype)
998+
pass
996999

9971000
elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
9981001
if arr.dtype.kind in "iufb":

pandas/tests/io/test_sql.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4357,16 +4357,17 @@ def test_xsqlite_if_exists(sqlite_buildin):
43574357
drop_table(table_name, sqlite_buildin)
43584358

43594359

4360-
def test_bytes_column(sqlite_buildin):
4360+
@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default])
4361+
def test_bytes_column(sqlite_buildin, dtype_backend):
43614362
pytest.importorskip("pyarrow")
43624363
"""
43634364
Regression test for (#59242)
43644365
Bytes being returned in a column that could not be converted
43654366
to a string would raise a UnicodeDecodeError
4366-
when using dtype_backend='pyarrow'
4367+
when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable'
43674368
"""
43684369
query = """
43694370
select cast(x'0123456789abcdef0123456789abcdef' as blob) a
43704371
"""
4371-
df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow")
4372+
df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend)
43724373
assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"

0 commit comments

Comments
 (0)