Skip to content

BUG: read_csv with engine=pyarrow and numpy-nullable dtype #62053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,7 @@ I/O
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``nrows`` and ``iterator`` are specified without specifying a ``chunksize``. (:issue:`59079`)
- Bug in :meth:`read_csv` where the order of the ``na_values`` caused an inconsistency when ``na_values`` is a list of non-string values. (:issue:`59303`)
- Bug in :meth:`read_csv` with ``engine="pyarrow"`` and ``dtype="Int64"`` losing precision (:issue:`56136`)
- Bug in :meth:`read_excel` raising ``ValueError`` when passing array of boolean values when ``dtype="boolean"``. (:issue:`58159`)
- Bug in :meth:`read_html` where ``rowspan`` in header row causes incorrect conversion to ``DataFrame``. (:issue:`60210`)
- Bug in :meth:`read_json` ignoring the given ``dtype`` when ``engine="pyarrow"`` (:issue:`59516`)
Expand Down
82 changes: 79 additions & 3 deletions pandas/io/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,23 @@
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import pandas_dtype

import pandas as pd

if TYPE_CHECKING:
from collections.abc import Callable
from collections.abc import (
Callable,
Hashable,
Sequence,
)

import pyarrow

from pandas._typing import DtypeBackend
from pandas._typing import (
DtypeArg,
DtypeBackend,
)


def _arrow_dtype_mapping() -> dict:
Expand Down Expand Up @@ -64,6 +73,8 @@ def arrow_table_to_pandas(
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
null_to_int64: bool = False,
to_pandas_kwargs: dict | None = None,
dtype: DtypeArg | None = None,
names: Sequence[Hashable] | None = None,
) -> pd.DataFrame:
pa = import_optional_dependency("pyarrow")

Expand All @@ -82,12 +93,77 @@ def arrow_table_to_pandas(
elif using_string_dtype():
if pa_version_under19p0:
types_mapper = _arrow_string_types_mapper()
elif dtype is not None:
# GH#56136 Avoid lossy conversion to float64
# We'll convert back to numpy dtypes below where the user did not ask for them
types_mapper = {
pa.int8(): pd.Int8Dtype(),
pa.int16(): pd.Int16Dtype(),
pa.int32(): pd.Int32Dtype(),
pa.int64(): pd.Int64Dtype(),
}.get
else:
types_mapper = None
elif dtype_backend is lib.no_default or dtype_backend == "numpy":
types_mapper = None
if dtype is not None:
# GH#56136 Avoid lossy conversion to float64
# We'll convert back to numpy dtypes below where the user did not ask for them
types_mapper = {
pa.int8(): pd.Int8Dtype(),
pa.int16(): pd.Int16Dtype(),
pa.int32(): pd.Int32Dtype(),
pa.int64(): pd.Int64Dtype(),
}.get
else:
types_mapper = None
else:
raise NotImplementedError

df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
return _post_convert_dtypes(df, dtype_backend, dtype, names)


def _post_convert_dtypes(
df: pd.DataFrame,
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault,
dtype: DtypeArg | None,
names: Sequence[Hashable] | None,
) -> pd.DataFrame:
if dtype is not None and (
dtype_backend is lib.no_default or dtype_backend == "numpy"
):
# GH#56136 apply any user-provided dtype, and convert any IntegerDtype
# columns the user didn't explicitly ask for.
if isinstance(dtype, dict):
if names is not None:
df.columns = names

cmp_dtypes = {
pd.Int8Dtype(),
pd.Int16Dtype(),
pd.Int32Dtype(),
pd.Int64Dtype(),
}
for col in df.columns:
if col not in dtype and df[col].dtype in cmp_dtypes:
# Any key that the user didn't explicitly specify
# that got converted to IntegerDtype now gets converted
# to numpy dtype.
dtype[col] = df[col].dtype.numpy_dtype

# Ignore non-existent columns from dtype mapping
# like other parsers do
dtype = {
key: pandas_dtype(dtype[key]) for key in dtype if key in df.columns
}

else:
dtype = pandas_dtype(dtype)

try:
df = df.astype(dtype)
except TypeError as err:
# GH#44901 reraise to keep api consistent
raise ValueError(str(err)) from err

return df
37 changes: 25 additions & 12 deletions pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,17 @@
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import pandas_dtype
from pandas.core.dtypes.common import (
pandas_dtype,
)
from pandas.core.dtypes.inference import is_integer

from pandas.io._util import arrow_table_to_pandas
from pandas.io.parsers.base_parser import ParserBase

if TYPE_CHECKING:
import pyarrow as pa

from pandas._typing import ReadBuffer

from pandas import DataFrame
Expand Down Expand Up @@ -162,13 +166,12 @@ def _get_convert_options(self):

return convert_options

def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
num_cols = len(frame.columns)
def _adjust_column_names(self, table: pa.Table) -> bool:
num_cols = len(table.columns)
multi_index_named = True
if self.header is None:
if self.names is None:
if self.header is None:
self.names = range(num_cols)
self.names = range(num_cols)
if len(self.names) != num_cols:
# usecols is passed through to pyarrow, we only handle index col here
# The only way self.names is not the same length as number of cols is
Expand All @@ -177,8 +180,7 @@ def _adjust_column_names(self, frame: DataFrame) -> tuple[DataFrame, bool]:
columns_prefix = [str(x) for x in range(num_cols - len(self.names))]
self.names = columns_prefix + self.names
multi_index_named = False
frame.columns = self.names
return frame, multi_index_named
return multi_index_named

def _finalize_index(self, frame: DataFrame, multi_index_named: bool) -> DataFrame:
if self.index_col is not None:
Expand Down Expand Up @@ -227,21 +229,23 @@ def _finalize_dtype(self, frame: DataFrame) -> DataFrame:
raise ValueError(str(err)) from err
return frame

def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
def _finalize_pandas_output(
self, frame: DataFrame, multi_index_named: bool
) -> DataFrame:
"""
Processes data read in based on kwargs.

Parameters
----------
frame: DataFrame
frame : DataFrame
The DataFrame to process.
multi_index_named : bool

Returns
-------
DataFrame
The processed DataFrame.
"""
frame, multi_index_named = self._adjust_column_names(frame)
frame = self._do_date_conversions(frame.columns, frame)
frame = self._finalize_index(frame, multi_index_named)
frame = self._finalize_dtype(frame)
Expand Down Expand Up @@ -299,14 +303,23 @@ def read(self) -> DataFrame:

table = table.cast(new_schema)

multi_index_named = self._adjust_column_names(table)

with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"make_block is deprecated",
DeprecationWarning,
)
frame = arrow_table_to_pandas(
table, dtype_backend=dtype_backend, null_to_int64=True
table,
dtype_backend=dtype_backend,
null_to_int64=True,
dtype=self.dtype,
names=self.names,
)

return self._finalize_pandas_output(frame)
if self.header is None:
frame.columns = self.names

return self._finalize_pandas_output(frame, multi_index_named)
4 changes: 0 additions & 4 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,9 +518,6 @@ def test_dtype_backend_pyarrow(all_parsers, request):
tm.assert_frame_equal(result, expected)


# pyarrow engine failing:
# https://github.com/pandas-dev/pandas/issues/56136
@pytest.mark.usefixtures("pyarrow_xfail")
def test_ea_int_avoid_overflow(all_parsers):
# GH#32134
parser = all_parsers
Expand Down Expand Up @@ -594,7 +591,6 @@ def test_string_inference_object_dtype(all_parsers, dtype, using_infer_string):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand Down
20 changes: 17 additions & 3 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -670,11 +670,14 @@ def test_inf_na_values_with_int_index(all_parsers):
tm.assert_frame_equal(out, expected)


@xfail_pyarrow # mismatched shape
@pytest.mark.parametrize("na_filter", [True, False])
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter):
def test_na_values_with_dtype_str_and_na_filter(all_parsers, na_filter, request):
# see gh-20377
parser = all_parsers
if parser.engine == "pyarrow" and na_filter is False:
mark = pytest.mark.xfail(reason="mismatched shape")
request.applymarker(mark)

data = "a,b,c\n1,,3\n4,5,6"

# na_filter=True --> missing value becomes NaN.
Expand Down Expand Up @@ -798,7 +801,18 @@ def test_bool_and_nan_to_int(all_parsers):
True
False
"""
with pytest.raises(ValueError, match="convert|NoneType"):
msg = (
"cannot safely convert passed user dtype of int(64|32) for "
"<class 'numpy.bool_?'> dtyped data in column 0 due to NA values"
)
if parser.engine == "python":
msg = "Unable to convert column 0 to type int(64|32)"
elif parser.engine == "pyarrow":
msg = (
r"int\(\) argument must be a string, a bytes-like object or a "
"real number, not 'NoneType"
)
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), dtype="int")


Expand Down
Loading