Skip to content

Commit 221328d

Browse files
committed
Use Matt's idea
1 parent 03c6d00 commit 221328d

File tree

3 files changed

+96
-67
lines changed

3 files changed

+96
-67
lines changed

pandas/io/_util.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,23 @@
1616
)
1717
from pandas.compat._optional import import_optional_dependency
1818

19+
from pandas.core.dtypes.common import pandas_dtype
20+
1921
import pandas as pd
2022

2123
if TYPE_CHECKING:
22-
from collections.abc import Callable
24+
from collections.abc import (
25+
Callable,
26+
Hashable,
27+
Sequence,
28+
)
2329

2430
import pyarrow
2531

26-
from pandas._typing import DtypeBackend
32+
from pandas._typing import (
33+
DtypeArg,
34+
DtypeBackend,
35+
)
2736

2837

2938
def _arrow_dtype_mapping() -> dict:
@@ -64,6 +73,8 @@ def arrow_table_to_pandas(
6473
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
6574
null_to_int64: bool = False,
6675
to_pandas_kwargs: dict | None = None,
76+
dtype: DtypeArg | None = None,
77+
names: Sequence[Hashable] | None = None,
6778
) -> pd.DataFrame:
6879
pa = import_optional_dependency("pyarrow")
6980

@@ -82,12 +93,77 @@ def arrow_table_to_pandas(
8293
elif using_string_dtype():
8394
if pa_version_under19p0:
8495
types_mapper = _arrow_string_types_mapper()
96+
elif dtype is not None:
97+
# GH#56136 Avoid lossy conversion to float64
98+
# We'll convert to numpy below if
99+
types_mapper = {
100+
pa.int8(): pd.Int8Dtype(),
101+
pa.int16(): pd.Int16Dtype(),
102+
pa.int32(): pd.Int32Dtype(),
103+
pa.int64(): pd.Int64Dtype(),
104+
}.get
85105
else:
86106
types_mapper = None
87107
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
88-
types_mapper = None
108+
if dtype is not None:
109+
# GH#56136 Avoid lossy conversion to float64
110+
# We'll convert to numpy below if
111+
types_mapper = {
112+
pa.int8(): pd.Int8Dtype(),
113+
pa.int16(): pd.Int16Dtype(),
114+
pa.int32(): pd.Int32Dtype(),
115+
pa.int64(): pd.Int64Dtype(),
116+
}.get
117+
else:
118+
types_mapper = None
89119
else:
90120
raise NotImplementedError
91121

92122
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
123+
return _post_convert_dtypes(df, dtype_backend, dtype, names)
124+
125+
126+
def _post_convert_dtypes(
127+
df: pd.DataFrame,
128+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
129+
dtype: DtypeArg | None,
130+
names: Sequence[Hashable] | None,
131+
) -> pd.DataFrame:
132+
if dtype is not None and (
133+
dtype_backend is lib.no_default or dtype_backend == "numpy"
134+
):
135+
# GH#56136 apply any user-provided dtype, and convert any IntegerDtype
136+
# columns the user didn't explicitly ask for.
137+
if isinstance(dtype, dict):
138+
if names is not None:
139+
df.columns = names
140+
141+
cmp_dtypes = {
142+
pd.Int8Dtype(),
143+
pd.Int16Dtype(),
144+
pd.Int32Dtype(),
145+
pd.Int64Dtype(),
146+
}
147+
for col in df.columns:
148+
if col not in dtype and df[col].dtype in cmp_dtypes:
149+
# Any key that the user didn't explicitly specify
150+
# that got converted to IntegerDtype now gets converted
151+
# to numpy dtype.
152+
dtype[col] = df[col].dtype.numpy_dtype
153+
154+
# Ignore non-existent columns from dtype mapping
155+
# like other parsers do
156+
dtype = {
157+
key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns
158+
}
159+
160+
else:
161+
dtype = pandas_dtype(dtype)
162+
163+
try:
164+
df = df.astype(dtype)
165+
except TypeError as err:
166+
# GH#44901 reraise to keep api consistent
167+
raise ValueError(str(err)) from err
168+
93169
return df

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 13 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,6 @@
33
from typing import TYPE_CHECKING
44
import warnings
55

6-
import numpy as np
7-
8-
from pandas._config import using_string_dtype
9-
106
from pandas._libs import lib
117
from pandas.compat._optional import import_optional_dependency
128
from pandas.errors import (
@@ -16,20 +12,16 @@
1612
from pandas.util._exceptions import find_stack_level
1713

1814
from pandas.core.dtypes.common import (
19-
is_string_dtype,
2015
pandas_dtype,
2116
)
22-
from pandas.core.dtypes.dtypes import (
23-
BaseMaskedDtype,
24-
)
2517
from pandas.core.dtypes.inference import is_integer
2618

27-
from pandas.core.arrays.string_ import StringDtype
28-
2919
from pandas.io._util import arrow_table_to_pandas
3020
from pandas.io.parsers.base_parser import ParserBase
3121

3222
if TYPE_CHECKING:
23+
import pyarrow as pa
24+
3325
from pandas._typing import ReadBuffer
3426

3527
from pandas import DataFrame
@@ -174,8 +166,8 @@ def _get_convert_options(self):
174166

175167
return convert_options
176168

177-
def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
178-
num_cols = len(frame.columns)
169+
def _adjust_column_names(self, table: pa.Table) -> bool:
170+
num_cols = len(table.columns)
179171
multi_index_named = True
180172
if self.header is None:
181173
if self.names is None:
@@ -188,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
188180
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
189181
self.names = columns_prefix + self.names
190182
multi_index_named = False
191-
frame.columns = self.names
192-
return frame, multi_index_named
183+
return multi_index_named
193184

194185
def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
195186
if self.index_col is not None:
@@ -312,13 +303,7 @@ def read(self) -> DataFrame:
312303

313304
table = table.cast(new_schema)
314305

315-
workaround = False
316-
pass_backend = dtype_backend
317-
if self.dtype is not None and dtype_backend != "pyarrow":
318-
# We pass dtype_backend="pyarrow" and subsequently cast
319-
# to avoid lossy conversion e.g. GH#56136
320-
workaround = True
321-
pass_backend = "numpy_nullable"
306+
multi_index_named = self._adjust_column_names(table)
322307

323308
with warnings.catch_warnings():
324309
warnings.filterwarnings(
@@ -327,49 +312,14 @@ def read(self) -> DataFrame:
327312
DeprecationWarning,
328313
)
329314
frame = arrow_table_to_pandas(
330-
table, dtype_backend=pass_backend, null_to_int64=True
315+
table,
316+
dtype_backend=dtype_backend,
317+
null_to_int64=True,
318+
dtype=self.dtype,
319+
names=self.names,
331320
)
332321

333-
frame, multi_index_named = self._adjust_column_names(frame)
334-
335-
if workaround and dtype_backend != "numpy_nullable":
336-
old_dtype = self.dtype
337-
if not isinstance(old_dtype, dict):
338-
# e.g. test_categorical_dtype_utf16
339-
old_dtype = dict.fromkeys(frame.columns, old_dtype)
340-
341-
# _finalize_pandas_output will call astype, but we need to make
342-
# sure all keys are populated appropriately.
343-
new_dtype = {}
344-
for key in frame.columns:
345-
ser = frame[key]
346-
if isinstance(ser.dtype, BaseMaskedDtype):
347-
new_dtype[key] = ser.dtype.numpy_dtype
348-
if (
349-
key in old_dtype
350-
and not using_string_dtype()
351-
and is_string_dtype(old_dtype[key])
352-
and not isinstance(old_dtype[key], StringDtype)
353-
and ser.array._hasna
354-
):
355-
# Cast to make sure we get "NaN" string instead of "NA"
356-
frame[key] = ser.astype(old_dtype[key])
357-
frame.loc[ser.isna(), key] = np.nan
358-
old_dtype[key] = object # Avoid re-casting
359-
elif isinstance(ser.dtype, StringDtype):
360-
# We cast here in case the user passed "category" in
361-
# order to get the correct dtype.categories.dtype
362-
# e.g. test_categorical_dtype_utf16
363-
if not using_string_dtype():
364-
sdt = np.dtype(object)
365-
frame[key] = ser.astype(sdt)
366-
frame.loc[ser.isna(), key] = np.nan
367-
else:
368-
sdt = StringDtype(na_value=np.nan) # type: ignore[assignment]
369-
frame[key] = frame[key].astype(sdt)
370-
new_dtype[key] = sdt
371-
372-
new_dtype.update(old_dtype)
373-
self.dtype = new_dtype
322+
if self.header is None:
323+
frame.columns = self.names
374324

375325
return self._finalize_pandas_output(frame, multi_index_named)

pandas/tests/io/parser/test_na_values.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -808,7 +808,10 @@ def test_bool_and_nan_to_int(all_parsers):
808808
if parser.engine == "python":
809809
msg = "Unable to convert column 0 to type int(64|32)"
810810
elif parser.engine == "pyarrow":
811-
msg = r"cannot convert NA to integer"
811+
msg = (
812+
r"int\(\) argument must be a string, a bytes-like object or a "
813+
"real number, not 'NoneType"
814+
)
812815
with pytest.raises(ValueError, match=msg):
813816
parser.read_csv(StringIO(data), dtype="int")
814817

0 commit comments

Comments (0)