Skip to content
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` causes an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision. (:issue:`56136`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
98 changes: 82 additions & 16 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
from typing import TYPE_CHECKING
import warnings

import numpy as np

from pandas._config import using_string_dtype

from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
Expand All @@ -11,9 +15,17 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.common import (
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
BaseMaskedDtype,
)
from pandas.core.dtypes.inference import is_integer

from pandas.core.arrays.string_ import StringDtype

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

Expand Down Expand Up @@ -140,20 +152,7 @@ def handle_warning(invalid_row) -> str:
"encoding": self.encoding,
}

def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
"""
Processes data read in based on kwargs.

Parameters
----------
frame: DataFrame
The DataFrame to process.

Returns
-------
DataFrame
The processed DataFrame.
"""
def _finalize_column_names(self, frame: DataFrame) -> DataFrame:
num_cols = len(frame.columns)
multi_index_named = True
if self.header is None:
Expand Down Expand Up @@ -196,6 +195,23 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
if self.header is None and not multi_index_named:
frame.index.names = [None] * len(frame.index.names)

return frame

def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
"""
Processes data read in based on kwargs.

Parameters
----------
frame: DataFrame
The DataFrame to process.

Returns
-------
DataFrame
The processed DataFrame.
"""

if self.dtype is not None:
# Ignore non-existent columns from dtype mapping
# like other parsers do
Expand Down Expand Up @@ -282,14 +298,64 @@ def read(self) -> DataFrame:

table = table.cast(new_schema)

workaround = False
pass_backend = dtype_backend
if self.dtype is not None and dtype_backend != "pyarrow":
# We pass dtype_backend="pyarrow" and subsequently cast
# to avoid lossy conversion e.g. GH#56136
workaround = True
pass_backend = "numpy_nullable"

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"make_block is deprecated",
DeprecationWarning,
)
frame = arrow_table_to_pandas(
table, dtype_backend=dtype_backend, null_to_int64=True
table, dtype_backend=pass_backend, null_to_int64=True
)

frame = self._finalize_column_names(frame)

if workaround and dtype_backend != "numpy_nullable":
old_dtype = self.dtype
if not isinstance(old_dtype, dict):
# e.g. test_categorical_dtype_utf16
old_dtype = dict.fromkeys(frame.columns, old_dtype)

# _finalize_pandas_output will call astype, but we need to make
# sure all keys are populated appropriately.
new_dtype = {}
for key in frame.columns:
ser = frame[key]
if isinstance(ser.dtype, BaseMaskedDtype):
new_dtype[key] = ser.dtype.numpy_dtype
if (
key in old_dtype
and not using_string_dtype()
and is_string_dtype(old_dtype[key])
and not isinstance(old_dtype[key], StringDtype)
and ser.array._hasna
):
# Cast to make sure we get "NaN" string instead of "NA"
frame[key] = ser.astype(old_dtype[key])
frame.loc[ser.isna(), key] = np.nan
old_dtype[key] = object # Avoid re-casting
elif isinstance(ser.dtype, StringDtype):
# We cast here in case the user passed "category" in
# order to get the correct dtype.categories.dtype
# e.g. test_categorical_dtype_utf16
if not using_string_dtype():
sdt = np.dtype(object)
frame[key] = ser.astype(sdt)
frame.loc[ser.isna(), key] = np.nan
else:
sdt = StringDtype(na_value=np.nan) # type: ignore[assignment]
frame[key] = frame[key].astype(sdt)
new_dtype[key] = sdt

new_dtype.update(old_dtype)
self.dtype = new_dtype

return self._finalize_pandas_output(frame)
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
tm.assert_frame_equal(result, expected)


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
Expand Down Expand Up @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand Down
17 changes: 14 additions & 3 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers):
tm.assert_frame_equal(out, expected)


@xfail_pyarrow # mismatched shape
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request):
# see gh-20377
parser = all_parsers
if parser.engine == "pyarrow" and na_filter is False:
mark = pytest.mark.xfail(reason="mismatched shape")
request.applymarker(mark)

data = "a,b,c\n1,,3\n4,5,6"

# na_filter=True --> missing value becomes NaN.
Expand Down Expand Up @@ -798,7 +801,15 @@ def test_bool_and_nan_to_int(all_parsers):
True
False
"""
with pytest.raises(ValueError, match="convert|NoneType"):
msg = (
"cannot safely convert passed user dtype of int(64|32) for "
"<class 'numpy.bool_?'> dtyped data in column 0 due to NA values"
)
if parser.engine == "python":
msg = "Unable to convert column 0 to type int(64|32)"
elif parser.engine == "pyarrow":
msg = r"cannot convert NA to integer"
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype="int")


Expand Down
Loading