diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 1c176f7343b2d..a49ad86ab464d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -852,6 +852,7 @@ I/O - Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`) - Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`) - Bug in :meth:`read_csv` where the order of the ``na_values`` makes an inconsistency when ``na_values`` is a list non-string values. (:issue:`59303`) +- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`) - Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`) - Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. 
(:issue:`60210`) - Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`) diff --git a/pandas/io/_util.py b/pandas/io/_util.py index 6827fbe9c998e..7f21b45265da6 100644 --- a/pandas/io/_util.py +++ b/pandas/io/_util.py @@ -16,14 +16,23 @@ ) from pandas.compat._optional import import_optional_dependency +from pandas.core.dtypes.common import pandas_dtype + import pandas as pd if TYPE_CHECKING: - from collections.abc import Callable + from collections.abc import ( + Callable, + Hashable, + Sequence, + ) import pyarrow - from pandas._typing import DtypeBackend + from pandas._typing import ( + DtypeArg, + DtypeBackend, + ) def _arrow_dtype_mapping() -> dict: @@ -64,6 +73,8 @@ def arrow_table_to_pandas( dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default, null_to_int64: bool = False, to_pandas_kwargs: dict | None = None, + dtype: DtypeArg | None = None, + names: Sequence[Hashable] | None = None, ) -> pd.DataFrame: pa = import_optional_dependency("pyarrow") @@ -82,12 +93,77 @@ def arrow_table_to_pandas( elif using_string_dtype(): if pa_version_under19p0: types_mapper = _arrow_string_types_mapper() + elif dtype is not None: + # GH#56136 Avoid lossy conversion to float64 + # We'll convert to numpy below if needed in _post_convert_dtypes + types_mapper = { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + }.get else: types_mapper = None elif dtype_backend is lib.no_default or dtype_backend == "numpy": - types_mapper = None + if dtype is not None: + # GH#56136 Avoid lossy conversion to float64 + # We'll convert to numpy below if needed in _post_convert_dtypes + types_mapper = { + pa.int8(): pd.Int8Dtype(), + pa.int16(): pd.Int16Dtype(), + pa.int32(): pd.Int32Dtype(), + pa.int64(): pd.Int64Dtype(), + }.get + else: + types_mapper = None else: raise NotImplementedError df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs) + return _post_convert_dtypes(df, dtype_backend, dtype, names)
+ + + def _post_convert_dtypes( + df: pd.DataFrame, + dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault, + dtype: DtypeArg | None, + names: Sequence[Hashable] | None, + ) -> pd.DataFrame: + if dtype is not None and ( + dtype_backend is lib.no_default or dtype_backend == "numpy" + ): + # GH#56136 apply any user-provided dtype, and convert any IntegerDtype + # columns the user didn't explicitly ask for. + if isinstance(dtype, dict): + if names is not None: + df.columns = names + + cmp_dtypes = { + pd.Int8Dtype(), + pd.Int16Dtype(), + pd.Int32Dtype(), + pd.Int64Dtype(), + } + for col in df.columns: + if col not in dtype and df[col].dtype in cmp_dtypes: + # Any key that the user didn't explicitly specify + # that got converted to IntegerDtype now gets converted + # to numpy dtype. Rebind rather than mutate so the + # caller's dtype dict is not polluted with extra keys. + dtype = {**dtype, col: df[col].dtype.numpy_dtype} + + # Ignore non-existent columns from dtype mapping + # like other parsers do + dtype = { + key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns + } + + else: + dtype = pandas_dtype(dtype) + + try: + df = df.astype(dtype) + except TypeError as err: + # GH#44901 reraise to keep api consistent + raise ValueError(str(err)) from err + return df diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index 2f7b001456654..e61f9bcec0d62 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -11,13 +11,17 @@ ) from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import pandas_dtype +from pandas.core.dtypes.common import ( + pandas_dtype, +) from pandas.core.dtypes.inference import is_integer from pandas.io._util import arrow_table_to_pandas from pandas.io.parsers.base_parser import ParserBase if TYPE_CHECKING: + import pyarrow as pa + from pandas._typing import ReadBuffer from pandas import DataFrame @@ -162,13 +166,12 @@ def _get_convert_options(self): return convert_options - def _adjust_column_names(self,
frame: DataFrame) -> tuple[DataFrame, bool]: - num_cols = len(frame.columns) + def _adjust_column_names(self, table: pa.Table) -> bool: + num_cols = len(table.columns) multi_index_named = True if self.header is None: if self.names is None: - if self.header is None: - self.names = range(num_cols) + self.names = range(num_cols) if len(self.names) != num_cols: # usecols is passed through to pyarrow, we only handle index col here # The only way self.names is not the same length as number of cols is @@ -177,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]: columns_prefix = [str(x) for x in range(num_cols - len(self.names))] self.names = columns_prefix + self.names multi_index_named = False - frame.columns = self.names - return frame, multi_index_named + return multi_index_named def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame: if self.index_col is not None: @@ -227,21 +229,23 @@ def _finalize_dtype(self, frame: DataFrame) -> DataFrame: raise ValueError(str(err)) from err return frame - def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: + def _finalize_pandas_output( + self, frame: DataFrame, multi_index_named: bool + ) -> DataFrame: """ Processes data read in based on kwargs. Parameters ---------- - frame: DataFrame + frame : DataFrame The DataFrame to process. + multi_index_named : bool Returns ------- DataFrame The processed DataFrame. 
""" - frame, multi_index_named = self._adjust_column_names(frame) frame = self._do_date_conversions(frame.columns, frame) frame = self._finalize_index(frame, multi_index_named) frame = self._finalize_dtype(frame) @@ -299,6 +303,8 @@ def read(self) -> DataFrame: table = table.cast(new_schema) + multi_index_named = self._adjust_column_names(table) + with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -306,7 +312,14 @@ def read(self) -> DataFrame: DeprecationWarning, ) frame = arrow_table_to_pandas( - table, dtype_backend=dtype_backend, null_to_int64=True + table, + dtype_backend=dtype_backend, + null_to_int64=True, + dtype=self.dtype, + names=self.names, ) - return self._finalize_pandas_output(frame) + if self.header is None: + frame.columns = self.names + + return self._finalize_pandas_output(frame, multi_index_named) diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 75b7cf0d42cb8..e4563afc631c5 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request): tm.assert_frame_equal(result, expected) -# pyarrow engine failing: -# https://github.com/pandas-dev/pandas/issues/56136 -@pytest.mark.usefixtures("pyarrow_xfail") def test_ea_int_avoid_overflow(all_parsers): # GH#32134 parser = all_parsers @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index 213fa2c01cef4..11b54692fe2e1 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -670,11 +670,16 @@ def 
test_inf_na_values_with_int_index(all_parsers): tm.assert_frame_equal(out, expected) -@xfail_pyarrow # mismatched shape @pytest.mark.parametrize("na_filter", [True, False]) -def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter): +def test_na_values_with_dtype_str_and_na_filter( + all_parsers, na_filter, using_infer_string, request +): # see gh-20377 parser = all_parsers + if parser.engine == "pyarrow" and (na_filter is False or not using_infer_string): + mark = pytest.mark.xfail(reason="mismatched shape") + request.applymarker(mark) + data = "a,b,c\n1,,3\n4,5,6" # na_filter=True --> missing value becomes NaN. @@ -798,7 +803,18 @@ def test_bool_and_nan_to_int(all_parsers): True False """ - with pytest.raises(ValueError, match="convert|NoneType"): + msg = ( + "cannot safely convert passed user dtype of int(64|32) for " + ".* dtyped data in column 0 due to NA values" + ) + if parser.engine == "python": + msg = "Unable to convert column 0 to type int(64|32)" + elif parser.engine == "pyarrow": + msg = ( + r"int\(\) argument must be a string, a bytes-like object or a " + "real number, not 'NoneType'" + ) + with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), dtype="int")