Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
"""
from __future__ import annotations

from collections import abc
from collections import (
abc,
defaultdict,
)
import csv
import sys
from textwrap import fill
Expand Down Expand Up @@ -38,8 +41,10 @@
is_float,
is_integer,
is_list_like,
pandas_dtype,
)

from pandas import Series
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import RangeIndex
from pandas.core.shared_docs import _shared_docs
Expand Down Expand Up @@ -1846,7 +1851,25 @@ def read(self, nrows: int | None = None) -> DataFrame:
else:
new_rows = len(index)

df = DataFrame(col_dict, columns=columns, index=index)
if hasattr(self, "orig_options"):
dtype_arg = self.orig_options.get("dtype", None)
else:
dtype_arg = None

if dtype_arg is None:
dtype = defaultdict(lambda: None) # type: ignore[var-annotated]
elif isinstance(dtype_arg, dict):
dtype = defaultdict(lambda: None)
dtype.update(dtype_arg)
else:
dtype = defaultdict(lambda: dtype_arg)

new_col_dict = {}
for k, v in col_dict.items():
d = dtype[k] if pandas_dtype(dtype[k]) == "object" else None
new_col_dict[k] = Series(v, index=index, dtype=d)

df = DataFrame(new_col_dict, columns=columns, index=index)

self._currow += new_rows
return df
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/common/test_chunksize.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch):
assert result.a.dtype == float


def test_warn_if_chunks_have_mismatched_type(all_parsers):
def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string):
warning_type = None
parser = all_parsers
size = 10000
Expand Down Expand Up @@ -265,7 +265,7 @@ def test_warn_if_chunks_have_mismatched_type(all_parsers):
buf,
)

assert df.a.dtype == object
assert df.a.dtype == object if not using_infer_string else "string[pyarrow_numpy]"


@pytest.mark.parametrize("iterator", [True, False])
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import numpy as np
import pytest

from pandas._config import using_pyarrow_string_dtype

from pandas.errors import (
EmptyDataError,
ParserError,
Expand Down Expand Up @@ -878,6 +880,7 @@ def test_dict_keys_as_names(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't decode")
@xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0
def test_encoding_surrogatepass(all_parsers):
# GH39017
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/common/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def test_multi_index_no_level_names(all_parsers, index_col):

# No index names in headless data.
expected.index.names = [None] * 2
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected, check_column_type=False)


@skip_pyarrow
Expand Down
47 changes: 45 additions & 2 deletions pandas/tests/io/parser/dtypes/test_dtypes_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
IntegerArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
Expand Down Expand Up @@ -458,7 +459,7 @@ def test_dtype_backend_and_dtype(all_parsers):
tm.assert_frame_equal(result, expected)


def test_dtype_backend_string(all_parsers, string_storage):
def test_dtype_backend_string(all_parsers, string_storage, using_infer_string):
# GH#36712
pa = pytest.importorskip("pyarrow")

Expand All @@ -471,7 +472,14 @@ def test_dtype_backend_string(all_parsers, string_storage):
"""
result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable")

if string_storage == "python":
if using_infer_string:
expected = DataFrame(
{
"a": ArrowStringArrayNumpySemantics(pa.array(["a", "b"])),
"b": ArrowStringArrayNumpySemantics(pa.array(["x", None])),
}
)
elif string_storage == "python":
expected = DataFrame(
{
"a": StringArray(np.array(["a", "b"], dtype=np.object_)),
Expand Down Expand Up @@ -574,6 +582,41 @@ def test_string_inference(all_parsers):
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_])
def test_string_inference_object_dtype(all_parsers, dtype):
    """Check that an explicit object dtype is honored under string inference.

    With ``future.infer_string`` enabled, columns for which the caller
    requested an object dtype are expected to come back as ``object``,
    while columns without an explicit dtype are expected to be
    ``string[pyarrow_numpy]``. The column index is expected to be
    ``string[pyarrow_numpy]`` in both scenarios.
    """
    # GH#56047
    pytest.importorskip("pyarrow")

    data = """a,b
x,a
y,a
z,a"""
    parser = all_parsers

    # (dtype argument handed to read_csv, expected dtype of column "b")
    scenarios = [
        # dtype given as a scalar: applies to every column
        (dtype, object),
        # dtype given only for "a": "b" falls back to string inference
        ({"a": dtype}, "string[pyarrow_numpy]"),
    ]

    for dtype_arg, b_dtype in scenarios:
        # Only the read itself runs with the option switched on; the
        # expected frame is built with explicit dtypes outside of it.
        with pd.option_context("future.infer_string", True):
            result = parser.read_csv(StringIO(data), dtype=dtype_arg)

        col_a = pd.Series(["x", "y", "z"], dtype=object)
        col_b = pd.Series(["a", "a", "a"], dtype=b_dtype)
        expected = DataFrame(
            {"a": col_a, "b": col_b},
            columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"),
        )
        tm.assert_frame_equal(result, expected)


def test_accurate_parsing_of_large_integers(all_parsers):
# GH#52505
data = """SYMBOL,MOMENT,ID,ID_DEAL
Expand Down
10 changes: 7 additions & 3 deletions pandas/tests/io/parser/test_c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def error(val: float, actual_val: Decimal) -> Decimal:
assert max(precise_errors) <= max(normal_errors)


def test_usecols_dtypes(c_parser_only):
def test_usecols_dtypes(c_parser_only, using_infer_string):
parser = c_parser_only
data = """\
1,2,3
Expand All @@ -204,8 +204,12 @@ def test_usecols_dtypes(c_parser_only):
dtype={"b": int, "c": float},
)

assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()
if using_infer_string:
assert (result.dtypes == ["string", int, float]).all()
assert (result2.dtypes == ["string", float]).all()
else:
assert (result.dtypes == [object, int, float]).all()
assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
Expand Down
11 changes: 9 additions & 2 deletions pandas/tests/io/parser/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def convert_score(x):


@pytest.mark.parametrize("conv_f", [lambda x: x, str])
def test_converter_index_col_bug(all_parsers, conv_f):
def test_converter_index_col_bug(all_parsers, conv_f, using_infer_string):
# see gh-1835 , GH#40589
parser = all_parsers
data = "A;B\n1;2\n3;4"
Expand All @@ -202,7 +202,14 @@ def test_converter_index_col_bug(all_parsers, conv_f):
StringIO(data), sep=";", index_col="A", converters={"A": conv_f}
)

xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object"))
xp = DataFrame(
{"B": [2, 4]},
index=Index(
["1", "3"],
name="A",
dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
),
)
tm.assert_frame_equal(rs, xp)


Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/io/parser/test_mangle_dupes.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_mangled_unnamed_placeholders(all_parsers):
expected[orig_key] = orig_value
df = parser.read_csv(StringIO(df.to_csv()))

tm.assert_frame_equal(df, expected)
tm.assert_frame_equal(df, expected, check_column_type=False)


@xfail_pyarrow # ValueError: Found non-unique column index
Expand Down
19 changes: 14 additions & 5 deletions pandas/tests/io/parser/test_na_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected):
),
],
)
def test_na_values_keep_default(all_parsers, kwargs, expected, request):
def test_na_values_keep_default(
all_parsers, kwargs, expected, request, using_infer_string
):
data = """\
A,B,C
a,1,one
Expand All @@ -321,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request):
with pytest.raises(ValueError, match=msg):
parser.read_csv(StringIO(data), **kwargs)
return
mark = pytest.mark.xfail()
request.applymarker(mark)
if not using_infer_string or len(kwargs) > 0:
mark = pytest.mark.xfail()
request.applymarker(mark)

result = parser.read_csv(StringIO(data), **kwargs)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -432,22 +435,28 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v
tm.assert_frame_equal(result, expected)


@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case
@pytest.mark.parametrize(
"na_filter,row_data",
[
(True, [[1, "A"], [np.nan, np.nan], [3, "C"]]),
(False, [["1", "A"], ["nan", "B"], ["3", "C"]]),
],
)
def test_na_values_na_filter_override(all_parsers, na_filter, row_data):
def test_na_values_na_filter_override(
all_parsers, na_filter, row_data, request, using_infer_string
):
data = """\
A,B
1,A
nan,B
3,C
"""
parser = all_parsers
if parser.engine == "pyarrow":
if not using_infer_string or not na_filter:
mark = pytest.mark.xfail()
request.applymarker(mark)

result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter)

expected = DataFrame(row_data, columns=["A", "B"])
Expand Down
19 changes: 13 additions & 6 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1751,9 +1751,12 @@ def test_parse_timezone(all_parsers):
"date_string",
["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"],
)
def test_invalid_parse_delimited_date(all_parsers, date_string):
def test_invalid_parse_delimited_date(all_parsers, date_string, using_infer_string):
parser = all_parsers
expected = DataFrame({0: [date_string]}, dtype="object")
expected = DataFrame(
{0: [date_string]},
dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
)
result = parser.read_csv(
StringIO(date_string),
header=None,
Expand Down Expand Up @@ -2021,7 +2024,7 @@ def test_parse_dates_and_keep_original_column(all_parsers):
tm.assert_frame_equal(result, expected)


def test_dayfirst_warnings():
def test_dayfirst_warnings(using_infer_string):
# GH 12585

# CASE 1: valid input
Expand Down Expand Up @@ -2053,7 +2056,11 @@ def test_dayfirst_warnings():

# first in DD/MM/YYYY, second in MM/DD/YYYY
input = "date\n31/12/2014\n03/30/2011"
expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date")
expected = Index(
["31/12/2014", "03/30/2011"],
dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
name="date",
)

# A. use dayfirst=True
res5 = read_csv(
Expand Down Expand Up @@ -2170,7 +2177,7 @@ def test_parse_dates_and_string_dtype(all_parsers):
tm.assert_frame_equal(result, expected)


def test_parse_dot_separated_dates(all_parsers):
def test_parse_dot_separated_dates(all_parsers, using_infer_string):
# https://github.com/pandas-dev/pandas/issues/2586
parser = all_parsers
data = """a,b
Expand All @@ -2179,7 +2186,7 @@ def test_parse_dot_separated_dates(all_parsers):
if parser.engine == "pyarrow":
expected_index = Index(
["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"],
dtype="object",
dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
name="a",
)
warn = None
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/io/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
ArrowStringArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics

from pandas.io.common import urlopen
from pandas.io.parsers import (
Expand Down Expand Up @@ -960,9 +961,13 @@ def test_widths_and_usecols():
tm.assert_frame_equal(result, expected)


def test_dtype_backend(string_storage, dtype_backend):
def test_dtype_backend(string_storage, dtype_backend, using_infer_string):
# GH#50289
if string_storage == "python":
if using_infer_string:
pa = pytest.importorskip("pyarrow")
arr = ArrowStringArrayNumpySemantics(pa.array(["a", "b"]))
arr_na = ArrowStringArrayNumpySemantics(pa.array([None, "a"]))
elif string_storage == "python":
arr = StringArray(np.array(["a", "b"], dtype=np.object_))
arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_))
else:
Expand Down
8 changes: 6 additions & 2 deletions pandas/tests/io/parser/test_upcast.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
IntegerArray,
StringArray,
)
from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics


def test_maybe_upcast(any_real_numpy_dtype):
Expand Down Expand Up @@ -85,15 +86,18 @@ def test_maybe_upcaste_all_nan():


@pytest.mark.parametrize("val", [na_values[np.object_], "c"])
def test_maybe_upcast_object(val, string_storage):
def test_maybe_upcast_object(val, string_storage, using_infer_string):
# GH#36712
pa = pytest.importorskip("pyarrow")

with pd.option_context("mode.string_storage", string_storage):
arr = np.array(["a", "b", val], dtype=np.object_)
result = _maybe_upcast(arr, use_dtype_backend=True)

if string_storage == "python":
if using_infer_string:
exp_val = "c" if val == "c" else None
expected = ArrowStringArrayNumpySemantics(pa.array(["a", "b", exp_val]))
elif string_storage == "python":
exp_val = "c" if val == "c" else NA
expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_))
else:
Expand Down