Skip to content

Commit 98bedc4

Browse files
committed
BUG: read_csv with engine=pyarrow and numpy-nullable dtype
1 parent eb489f2 commit 98bedc4

File tree

4 files changed

+74
-22
lines changed

4 files changed

+74
-22
lines changed

doc/source/whatsnew/v3.0.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,7 @@ I/O
814814
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
815815
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
816816
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
817+
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
817818
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
818819
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
819820
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)

pandas/io/parsers/arrow_parser_wrapper.py

Lines changed: 59 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from typing import TYPE_CHECKING
44
import warnings
55

6+
import numpy as np
7+
68
from pandas._libs import lib
79
from pandas.compat._optional import import_optional_dependency
810
from pandas.errors import (
@@ -12,8 +14,13 @@
1214
from pandas.util._exceptions import find_stack_level
1315

1416
from pandas.core.dtypes.common import pandas_dtype
17+
from pandas.core.dtypes.dtypes import (
18+
BaseMaskedDtype,
19+
)
1520
from pandas.core.dtypes.inference import is_integer
1621

22+
from pandas.core.arrays.string_ import StringDtype
23+
1724
from pandas.io._util import arrow_table_to_pandas
1825
from pandas.io.parsers.base_parser import ParserBase
1926

@@ -140,20 +147,7 @@ def handle_warning(invalid_row) -> str:
140147
"encoding": self.encoding,
141148
}
142149

143-
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
144-
"""
145-
Processes data read in based on kwargs.
146-
147-
Parameters
148-
----------
149-
frame: DataFrame
150-
The DataFrame to process.
151-
152-
Returns
153-
-------
154-
DataFrame
155-
The processed DataFrame.
156-
"""
150+
def _finalize_column_names(self, frame: DataFrame) -> DataFrame:
157151
num_cols = len(frame.columns)
158152
multi_index_named = True
159153
if self.header is None:
@@ -196,6 +190,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
196190
if self.header is None and not multi_index_named:
197191
frame.index.names = [None] * len(frame.index.names)
198192

193+
return frame
194+
195+
def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
196+
"""
197+
Processes data read in based on kwargs.
198+
199+
Parameters
200+
----------
201+
frame: DataFrame
202+
The DataFrame to process.
203+
204+
Returns
205+
-------
206+
DataFrame
207+
The processed DataFrame.
208+
"""
209+
199210
if self.dtype is not None:
200211
# Ignore non-existent columns from dtype mapping
201212
# like other parsers do
@@ -282,14 +293,47 @@ def read(self) -> DataFrame:
282293

283294
table = table.cast(new_schema)
284295

296+
workaround = False
297+
pass_backend = dtype_backend
298+
if self.dtype is not None and dtype_backend != "pyarrow":
299+
# We pass dtype_backend="pyarrow" and subsequently cast
300+
# to avoid lossy conversion e.g. GH#56136
301+
workaround = True
302+
pass_backend = "numpy_nullable"
303+
285304
with warnings.catch_warnings():
286305
warnings.filterwarnings(
287306
"ignore",
288307
"make_block is deprecated",
289308
DeprecationWarning,
290309
)
291310
frame = arrow_table_to_pandas(
292-
table, dtype_backend=dtype_backend, null_to_int64=True
311+
table, dtype_backend=pass_backend, null_to_int64=True
293312
)
294313

314+
frame = self._finalize_column_names(frame)
315+
316+
if workaround and dtype_backend != "numpy_nullable":
317+
old_dtype = self.dtype
318+
if not isinstance(old_dtype, dict):
319+
# e.g. test_categorical_dtype_utf16
320+
old_dtype = dict.fromkeys(frame.columns, old_dtype)
321+
322+
# _finalize_pandas_output will call astype, but we need to make
323+
# sure all keys are populated appropriately.
324+
new_dtype = {}
325+
for key in frame.columns:
326+
ser = frame[key]
327+
if isinstance(ser.dtype, BaseMaskedDtype):
328+
new_dtype[key] = ser.dtype.numpy_dtype
329+
elif isinstance(ser.dtype, StringDtype):
330+
# We cast here in case the user passed "category" in
331+
# order to get the correct dtype.categories.dtype
332+
# e.g. test_categorical_dtype_utf16
333+
new_dtype[key] = StringDtype(na_value=np.nan)
334+
frame[key] = frame[key].astype(new_dtype[key])
335+
336+
new_dtype.update(old_dtype)
337+
self.dtype = new_dtype
338+
295339
return self._finalize_pandas_output(frame)

pandas/tests/io/parser/dtypes/test_dtypes_basic.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
518518
tm.assert_frame_equal(result, expected)
519519

520520

521-
# pyarrow engine failing:
522-
# https://github.com/pandas-dev/pandas/issues/56136
523-
@pytest.mark.usefixtures("pyarrow_xfail")
524521
def test_ea_int_avoid_overflow(all_parsers):
525522
# GH#32134
526523
parser = all_parsers
@@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
594591
tm.assert_frame_equal(result, expected)
595592

596593

597-
@xfail_pyarrow
598594
def test_accurate_parsing_of_large_integers(all_parsers):
599595
# GH#52505
600596
data = """SYMBOL,MOMENT,ID,ID_DEAL

pandas/tests/io/parser/test_na_values.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers):
670670
tm.assert_frame_equal(out, expected)
671671

672672

673-
@xfail_pyarrow # mismatched shape
674673
@pytest.mark.parametrize("na_filter", [True, False])
675-
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
674+
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request):
676675
# see gh-20377
677676
parser = all_parsers
677+
if parser.engine == "pyarrow" and na_filter is False:
678+
mark = pytest.mark.xfail(reason="mismatched shape")
679+
request.applymarker(mark)
680+
678681
data = "a,b,c\n1,,3\n4,5,6"
679682

680683
# na_filter=True --> missing value becomes NaN.
@@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers):
798801
True
799802
False
800803
"""
801-
with pytest.raises(ValueError, match="convert|NoneType"):
804+
msg = (
805+
"cannot safely convert passed user dtype of int64 for "
806+
"<class 'numpy.bool'> dtyped data in column 0 due to NA values"
807+
)
808+
if parser.engine == "python":
809+
msg = "Unable to convert column 0 to type int64"
810+
elif parser.engine == "pyarrow":
811+
msg = r"cannot convert NA to integer"
812+
with pytest.raises(ValueError, match=msg):
802813
parser.read_csv(StringIO(data), dtype="int")
803814

804815

0 commit comments

Comments (0)