Skip to content

Commit d42575f

Browse files
authored
BUG: read_csv with engine=pyarrow and numpy-nullable dtype (#62053)
1 parent bb10b27 commit d42575f

File tree

5 files changed

+124
-22
lines changed

5 files changed

+124
-22
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,7 @@ I/O
857857
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
858858
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
859859
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
860+
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
860861
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
861862
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
862863
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

pandas/io/_util.py

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,14 +16,23 @@
1616
)
1717
from pandas.compat._optional import import_optional_dependency
1818

19+
from pandas.core.dtypes.common import pandas_dtype
20+
1921
import pandas as pd
2022

2123
if TYPE_CHECKING:
22-
from collections.abc import Callable
24+
from collections.abc import (
25+
Callable,
26+
Hashable,
27+
Sequence,
28+
)
2329

2430
import pyarrow
2531

26-
from pandas._typing import DtypeBackend
32+
from pandas._typing import (
33+
DtypeArg,
34+
DtypeBackend,
35+
)
2736

2837

2938
def _arrow_dtype_mapping() -> dict:
@@ -64,6 +73,8 @@ def arrow_table_to_pandas(
6473
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
6574
null_to_int64: bool = False,
6675
to_pandas_kwargs: dict | None = None,
76+
dtype: DtypeArg | None = None,
77+
names: Sequence[Hashable] | None = None,
6778
) -> pd.DataFrame:
6879
pa = import_optional_dependency("pyarrow")
6980

@@ -82,12 +93,77 @@ def arrow_table_to_pandas(
8293
elif using_string_dtype():
8394
if pa_version_under19p0:
8495
types_mapper = _arrow_string_types_mapper()
96+
elif dtype is not None:
97+
# GH#56136 Avoid lossy conversion to float64
98+
# We'll convert to numpy below if needed
99+
types_mapper = {
100+
pa.int8(): pd.Int8Dtype(),
101+
pa.int16(): pd.Int16Dtype(),
102+
pa.int32(): pd.Int32Dtype(),
103+
pa.int64(): pd.Int64Dtype(),
104+
}.get
85105
else:
86106
types_mapper = None
87107
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
88-
types_mapper = None
108+
if dtype is not None:
109+
# GH#56136 Avoid lossy conversion to float64
110+
# We'll convert to numpy below if needed
111+
types_mapper = {
112+
pa.int8(): pd.Int8Dtype(),
113+
pa.int16(): pd.Int16Dtype(),
114+
pa.int32(): pd.Int32Dtype(),
115+
pa.int64(): pd.Int64Dtype(),
116+
}.get
117+
else:
118+
types_mapper = None
89119
else:
90120
raise NotImplementedError
91121

92122
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
123+
return _post_convert_dtypes(df, dtype_backend, dtype, names)
124+
125+
126+
def _post_convert_dtypes(
127+
df: pd.DataFrame,
128+
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
129+
dtype: DtypeArg | None,
130+
names: Sequence[Hashable] | None,
131+
) -> pd.DataFrame:
132+
if dtype is not None and (
133+
dtype_backend is lib.no_default or dtype_backend == "numpy"
134+
):
135+
# GH#56136 apply any user-provided dtype, and convert any IntegerDtype
136+
# columns the user didn't explicitly ask for.
137+
if isinstance(dtype, dict):
138+
if names is not None:
139+
df.columns = names
140+
141+
cmp_dtypes = {
142+
pd.Int8Dtype(),
143+
pd.Int16Dtype(),
144+
pd.Int32Dtype(),
145+
pd.Int64Dtype(),
146+
}
147+
for col in df.columns:
148+
if col not in dtype and df[col].dtype in cmp_dtypes:
149+
# Any key that the user didn't explicitly specify
150+
# that got converted to IntegerDtype now gets converted
151+
# to numpy dtype.
152+
dtype[col] = df[col].dtype.numpy_dtype
153+
154+
# Ignore non-existent columns from dtype mapping
155+
# like other parsers do
156+
dtype = {
157+
key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns
158+
}
159+
160+
else:
161+
dtype = pandas_dtype(dtype)
162+
163+
try:
164+
df = df.astype(dtype)
165+
except TypeError as err:
166+
# GH#44901 reraise to keep api consistent
167+
raise ValueError(str(err)) from err
168+
93169
return df

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,17 @@
1111
)
1212
from pandas.util._exceptions import find_stack_level
1313

14-
from pandas.core.dtypes.common import pandas_dtype
14+
from pandas.core.dtypes.common import (
15+
pandas_dtype,
16+
)
1517
from pandas.core.dtypes.inference import is_integer
1618

1719
from pandas.io._util import arrow_table_to_pandas
1820
from pandas.io.parsers.base_parser import ParserBase
1921

2022
if TYPE_CHECKING:
23+
import pyarrow as pa
24+
2125
from pandas._typing import ReadBuffer
2226

2327
from pandas import DataFrame
@@ -162,13 +166,12 @@ def _get_convert_options(self):
162166

163167
return convert_options
164168

165-
def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
166-
num_cols = len(frame.columns)
169+
def _adjust_column_names(self, table: pa.Table) -> bool:
170+
num_cols = len(table.columns)
167171
multi_index_named = True
168172
if self.header is None:
169173
if self.names is None:
170-
if self.header is None:
171-
self.names = range(num_cols)
174+
self.names = range(num_cols)
172175
if len(self.names) != num_cols:
173176
# usecols is passed through to pyarrow, we only handle index col here
174177
# The only way self.names is not the same length as number of cols is
@@ -177,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
177180
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
178181
self.names = columns_prefix + self.names
179182
multi_index_named = False
180-
frame.columns = self.names
181-
return frame, multi_index_named
183+
return multi_index_named
182184

183185
def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
184186
if self.index_col is not None:
@@ -227,21 +229,23 @@ def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
227229
raise ValueError(str(err)) from err
228230
return frame
229231

230-
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
232+
def _finalize_pandas_output(
233+
self, frame: DataFrame, multi_index_named: bool
234+
) -> DataFrame:
231235
"""
232236
Processes data read in based on kwargs.
233237
234238
Parameters
235239
----------
236-
frame: DataFrame
240+
frame : DataFrame
237241
The DataFrame to process.
242+
multi_index_named : bool
238243
239244
Returns
240245
-------
241246
DataFrame
242247
The processed DataFrame.
243248
"""
244-
frame, multi_index_named = self._adjust_column_names(frame)
245249
frame = self._do_date_conversions(frame.columns, frame)
246250
frame = self._finalize_index(frame, multi_index_named)
247251
frame = self._finalize_dtype(frame)
@@ -299,14 +303,23 @@ def read(self) -> DataFrame:
299303

300304
table = table.cast(new_schema)
301305

306+
multi_index_named = self._adjust_column_names(table)
307+
302308
with warnings.catch_warnings():
303309
warnings.filterwarnings(
304310
"ignore",
305311
"make_block is deprecated",
306312
DeprecationWarning,
307313
)
308314
frame = arrow_table_to_pandas(
309-
table, dtype_backend=dtype_backend, null_to_int64=True
315+
table,
316+
dtype_backend=dtype_backend,
317+
null_to_int64=True,
318+
dtype=self.dtype,
319+
names=self.names,
310320
)
311321

312-
return self._finalize_pandas_output(frame)
322+
if self.header is None:
323+
frame.columns = self.names
324+
325+
return self._finalize_pandas_output(frame, multi_index_named)

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
518518
tm.assert_frame_equal(result, expected)
519519

520520

521-
# pyarrow engine failing:
522-
# https://github.com/pandas-dev/pandas/issues/56136
523-
@pytest.mark.usefixtures("pyarrow_xfail")
524521
def test_ea_int_avoid_overflow(all_parsers):
525522
# GH#32134
526523
parser = all_parsers
@@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
594591
tm.assert_frame_equal(result, expected)
595592

596593

597-
@xfail_pyarrow
598594
def test_accurate_parsing_of_large_integers(all_parsers):
599595
# GH#52505
600596
data = """SYMBOL,MOMENT,ID,ID_DEAL

pandas/tests/io/parser/test_na_values.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -670,11 +670,16 @@ def test_inf_na_values_with_int_index(all_parsers):
670670
tm.assert_frame_equal(out, expected)
671671

672672

673-
@xfail_pyarrow # mismatched shape
674673
@pytest.mark.parametrize("na_filter", [True, False])
675-
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
674+
def test_na_values_with_dtype_str_and_na_filter(
675+
all_parsers, na_filter, using_infer_string, request
676+
):
676677
# see gh-20377
677678
parser = all_parsers
679+
if parser.engine == "pyarrow" and (na_filter is False or not using_infer_string):
680+
mark = pytest.mark.xfail(reason="mismatched shape")
681+
request.applymarker(mark)
682+
678683
data = "a,b,c\n1,,3\n4,5,6"
679684

680685
# na_filter=True --> missing value becomes NaN.
@@ -798,7 +803,18 @@ def test_bool_and_nan_to_int(all_parsers):
798803
True
799804
False
800805
"""
801-
with pytest.raises(ValueError, match="convert|NoneType"):
806+
msg = (
807+
"cannot safely convert passed user dtype of int(64|32) for "
808+
"<class 'numpy.bool_?'> dtyped data in column 0 due to NA values"
809+
)
810+
if parser.engine == "python":
811+
msg = "Unable to convert column 0 to type int(64|32)"
812+
elif parser.engine == "pyarrow":
813+
msg = (
814+
r"int\(\) argument must be a string, a bytes-like object or a "
815+
"real number, not 'NoneType"
816+
)
817+
with pytest.raises(ValueError, match=msg):
802818
parser.read_csv(StringIO(data), dtype="int")
803819

804820

0 commit comments

Comments
 (0)