also fix for dtype_backend=numpy_nullable

kastkeepitjumpinlikekangaroos · kastkeepitjumpinlikekangaroos · commit 8f900b833cd3 · 2024-11-13T17:32:10.000-05:00
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -972,27 +972,30 @@ def convert(arr):
                     # i.e. maybe_convert_objects didn't convert
                     arr = maybe_infer_to_datetimelike(arr)
                     if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
+                        # Addressing (#59242)
+                        # Byte data that could not be decoded into
+                        # a string would throw a UnicodeDecodeError exception
+
+                        # Try to greedily convert to string
                         if dtype_backend == "pyarrow":
                             pa = import_optional_dependency("pyarrow")
-                            # Addressing (#59242)
-                            # Byte data that could not be decoded into
-                            # a string would throw a UnicodeDecodeError exception
-
-                            # Try and greedily convert to pyarrow string
-                            # Will fail if the object is bytes:
-                            # in this case convert to pyarrow binary
                             try:
                                 str_dtype = ArrowDtype(pa.string())
                                 str_cls = str_dtype.construct_array_type()
                                 arr = str_cls._from_sequence(arr, dtype=str_dtype)
                             except pa.lib.ArrowInvalid:
+                                # string conversion failed: fall back to pyarrow binary
                                 bin_dtype = ArrowDtype(pa.binary())
                                 bin_cls = bin_dtype.construct_array_type()
                                 arr = bin_cls._from_sequence(arr, dtype=bin_dtype)
                         else:
-                            new_dtype = StringDtype()
-                            arr_cls = new_dtype.construct_array_type()
-                            arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                            try:
+                                new_dtype = StringDtype()
+                                arr_cls = new_dtype.construct_array_type()
+                                arr = arr_cls._from_sequence(arr, dtype=new_dtype)
+                            except UnicodeDecodeError:
+                                # bytes are not decodable: keep arr as object dtype
+                                pass
 
                 elif dtype_backend != "numpy" and isinstance(arr, np.ndarray):
                     if arr.dtype.kind in "iufb":
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
@@ -4357,16 +4357,17 @@ def test_xsqlite_if_exists(sqlite_buildin):
     drop_table(table_name, sqlite_buildin)
 
 
-def test_bytes_column(sqlite_buildin):
+@pytest.mark.parametrize("dtype_backend", ["pyarrow", "numpy_nullable", lib.no_default])
+def test_bytes_column(sqlite_buildin, dtype_backend):
     pytest.importorskip("pyarrow")
     """
     Regression test for (#59242)
     Bytes being returned in a column that could not be converted
     to a string would raise a UnicodeDecodeError
-    when using dtype_backend='pyarrow'
+    when using dtype_backend='pyarrow' or dtype_backend='numpy_nullable'
     """
     query = """
     select cast(x'0123456789abcdef0123456789abcdef' as blob) a
     """
-    df = pd.read_sql(query, sqlite_buildin, dtype_backend="pyarrow")
+    df = pd.read_sql(query, sqlite_buildin, dtype_backend=dtype_backend)
     assert df.a.values[0] == b"\x01#Eg\x89\xab\xcd\xef\x01#Eg\x89\xab\xcd\xef"