diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 66990de6d3b89..3b0b1e973974b 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -5,7 +5,10 @@ """ from __future__ import annotations -from collections import abc +from collections import ( + abc, + defaultdict, +) import csv import sys from textwrap import fill @@ -38,8 +41,10 @@ is_float, is_integer, is_list_like, + pandas_dtype, ) +from pandas import Series from pandas.core.frame import DataFrame from pandas.core.indexes.api import RangeIndex from pandas.core.shared_docs import _shared_docs @@ -1846,7 +1851,25 @@ def read(self, nrows: int | None = None) -> DataFrame: else: new_rows = len(index) - df = DataFrame(col_dict, columns=columns, index=index) + if hasattr(self, "orig_options"): + dtype_arg = self.orig_options.get("dtype", None) + else: + dtype_arg = None + + if dtype_arg is None: + dtype = defaultdict(lambda: None) # type: ignore[var-annotated] + elif isinstance(dtype_arg, dict): + dtype = defaultdict(lambda: None) + dtype.update(dtype_arg) + else: + dtype = defaultdict(lambda: dtype_arg) + + new_col_dict = {} + for k, v in col_dict.items(): + d = dtype[k] if pandas_dtype(dtype[k]) == "object" else None + new_col_dict[k] = Series(v, index=index, dtype=d) + + df = DataFrame(new_col_dict, columns=columns, index=index) self._currow += new_rows return df diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index baed74fc212e4..3afa58918cd2a 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -233,7 +233,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers, monkeypatch): assert result.a.dtype == float -def test_warn_if_chunks_have_mismatched_type(all_parsers): +def test_warn_if_chunks_have_mismatched_type(all_parsers, using_infer_string): warning_type = None parser = all_parsers size = 10000 @@ -265,7 +265,7 @@ def 
test_warn_if_chunks_have_mismatched_type(all_parsers): buf, ) - assert df.a.dtype == object + assert df.a.dtype == (object if not using_infer_string or (parser.engine == "c" and parser.low_memory) else "string[pyarrow_numpy]") @pytest.mark.parametrize("iterator", [True, False]) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 558fdb7632102..d4625f11b5a09 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -12,6 +12,8 @@ import numpy as np import pytest +from pandas._config import using_pyarrow_string_dtype + from pandas.errors import ( EmptyDataError, ParserError, @@ -878,6 +880,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't decode") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 038c684c90c9e..4a8cdaed9d9d1 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -106,7 +106,7 @@ def test_multi_index_no_level_names(all_parsers, index_col): # No index names in headless data. 
expected.index.names = [None] * 2 - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_column_type=False) @skip_pyarrow diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 0deafda750904..6fc7c3cb61424 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -21,6 +21,7 @@ IntegerArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" @@ -458,7 +459,7 @@ def test_dtype_backend_and_dtype(all_parsers): tm.assert_frame_equal(result, expected) -def test_dtype_backend_string(all_parsers, string_storage): +def test_dtype_backend_string(all_parsers, string_storage, using_infer_string): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -471,7 +472,14 @@ def test_dtype_backend_string(all_parsers, string_storage): """ result = parser.read_csv(StringIO(data), dtype_backend="numpy_nullable") - if string_storage == "python": + if using_infer_string: + expected = DataFrame( + { + "a": ArrowStringArrayNumpySemantics(pa.array(["a", "b"])), + "b": ArrowStringArrayNumpySemantics(pa.array(["x", None])), + } + ) + elif string_storage == "python": expected = DataFrame( { "a": StringArray(np.array(["a", "b"], dtype=np.object_)), @@ -574,6 +582,41 @@ def test_string_inference(all_parsers): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("dtype", ["O", object, "object", np.object_]) +def test_string_inference_object_dtype(all_parsers, dtype): + # GH#56047 + pytest.importorskip("pyarrow") + + data = """a,b +x,a +y,a +z,a""" + parser = all_parsers + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype=dtype) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", 
"a", "a"], dtype=object), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + with pd.option_context("future.infer_string", True): + result = parser.read_csv(StringIO(data), dtype={"a": dtype}) + + expected = DataFrame( + { + "a": pd.Series(["x", "y", "z"], dtype=object), + "b": pd.Series(["a", "a", "a"], dtype="string[pyarrow_numpy]"), + }, + columns=pd.Index(["a", "b"], dtype="string[pyarrow_numpy]"), + ) + tm.assert_frame_equal(result, expected) + + def test_accurate_parsing_of_large_integers(all_parsers): # GH#52505 data = """SYMBOL,MOMENT,ID,ID_DEAL diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py index 500863dce84ee..75ff8439407be 100644 --- a/pandas/tests/io/parser/test_c_parser_only.py +++ b/pandas/tests/io/parser/test_c_parser_only.py @@ -179,7 +179,7 @@ def error(val: float, actual_val: Decimal) -> Decimal: assert max(precise_errors) <= max(normal_errors) -def test_usecols_dtypes(c_parser_only): +def test_usecols_dtypes(c_parser_only, using_infer_string): parser = c_parser_only data = """\ 1,2,3 @@ -204,8 +204,12 @@ def test_usecols_dtypes(c_parser_only): dtype={"b": int, "c": float}, ) - assert (result.dtypes == [object, int, float]).all() - assert (result2.dtypes == [object, float]).all() + if using_infer_string: + assert (result.dtypes == ["string", int, float]).all() + assert (result2.dtypes == ["string", float]).all() + else: + assert (result.dtypes == [object, int, float]).all() + assert (result2.dtypes == [object, float]).all() def test_disable_bool_parsing(c_parser_only): diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py index 7f3e45324dbd2..111f7f23a9bb4 100644 --- a/pandas/tests/io/parser/test_converters.py +++ b/pandas/tests/io/parser/test_converters.py @@ -185,7 +185,7 @@ def convert_score(x): @pytest.mark.parametrize("conv_f", [lambda x: x, str]) -def 
test_converter_index_col_bug(all_parsers, conv_f): +def test_converter_index_col_bug(all_parsers, conv_f, using_infer_string): # see gh-1835 , GH#40589 parser = all_parsers data = "A;B\n1;2\n3;4" @@ -202,7 +202,14 @@ def test_converter_index_col_bug(all_parsers, conv_f): StringIO(data), sep=";", index_col="A", converters={"A": conv_f} ) - xp = DataFrame({"B": [2, 4]}, index=Index(["1", "3"], name="A", dtype="object")) + xp = DataFrame( + {"B": [2, 4]}, + index=Index( + ["1", "3"], + name="A", + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + ), + ) tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_mangle_dupes.py b/pandas/tests/io/parser/test_mangle_dupes.py index 1d245f81f027c..713cc424449ad 100644 --- a/pandas/tests/io/parser/test_mangle_dupes.py +++ b/pandas/tests/io/parser/test_mangle_dupes.py @@ -138,7 +138,7 @@ def test_mangled_unnamed_placeholders(all_parsers): expected[orig_key] = orig_value df = parser.read_csv(StringIO(df.to_csv())) - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected, check_column_type=False) @xfail_pyarrow # ValueError: Found non-unique column index diff --git a/pandas/tests/io/parser/test_na_values.py b/pandas/tests/io/parser/test_na_values.py index ca106fa772e82..260d49e6367fa 100644 --- a/pandas/tests/io/parser/test_na_values.py +++ b/pandas/tests/io/parser/test_na_values.py @@ -303,7 +303,9 @@ def test_na_value_dict_multi_index(all_parsers, index_col, expected): ), ], ) -def test_na_values_keep_default(all_parsers, kwargs, expected, request): +def test_na_values_keep_default( + all_parsers, kwargs, expected, request, using_infer_string +): data = """\ A,B,C a,1,one @@ -321,8 +323,9 @@ def test_na_values_keep_default(all_parsers, kwargs, expected, request): with pytest.raises(ValueError, match=msg): parser.read_csv(StringIO(data), **kwargs) return - mark = pytest.mark.xfail() - request.applymarker(mark) + if not using_infer_string or len(kwargs) > 0: + mark = 
pytest.mark.xfail() + request.applymarker(mark) result = parser.read_csv(StringIO(data), **kwargs) tm.assert_frame_equal(result, expected) @@ -432,7 +435,6 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v tm.assert_frame_equal(result, expected) -@xfail_pyarrow # mismatched dtypes in both cases, FutureWarning in the True case @pytest.mark.parametrize( "na_filter,row_data", [ @@ -440,7 +442,9 @@ def test_no_keep_default_na_dict_na_values_diff_reprs(all_parsers, col_zero_na_v (False, [["1", "A"], ["nan", "B"], ["3", "C"]]), ], ) -def test_na_values_na_filter_override(all_parsers, na_filter, row_data): +def test_na_values_na_filter_override( + all_parsers, na_filter, row_data, request, using_infer_string +): data = """\ A,B 1,A @@ -448,6 +452,11 @@ def test_na_values_na_filter_override(all_parsers, na_filter, row_data): 3,C """ parser = all_parsers + if parser.engine == "pyarrow": + if not using_infer_string or not na_filter: + mark = pytest.mark.xfail() + request.applymarker(mark) + result = parser.read_csv(StringIO(data), na_values=["B"], na_filter=na_filter) expected = DataFrame(row_data, columns=["A", "B"]) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 113402cda1b9a..eb4a6fc3e66b7 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -1751,9 +1751,12 @@ def test_parse_timezone(all_parsers): "date_string", ["32/32/2019", "02/30/2019", "13/13/2019", "13/2019", "a3/11/2018", "10/11/2o17"], ) -def test_invalid_parse_delimited_date(all_parsers, date_string): +def test_invalid_parse_delimited_date(all_parsers, date_string, using_infer_string): parser = all_parsers - expected = DataFrame({0: [date_string]}, dtype="object") + expected = DataFrame( + {0: [date_string]}, + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + ) result = parser.read_csv( StringIO(date_string), header=None, @@ -2021,7 +2024,7 
@@ def test_parse_dates_and_keep_original_column(all_parsers): tm.assert_frame_equal(result, expected) -def test_dayfirst_warnings(): +def test_dayfirst_warnings(using_infer_string): # GH 12585 # CASE 1: valid input @@ -2053,7 +2056,11 @@ def test_dayfirst_warnings(): # first in DD/MM/YYYY, second in MM/DD/YYYY input = "date\n31/12/2014\n03/30/2011" - expected = Index(["31/12/2014", "03/30/2011"], dtype="object", name="date") + expected = Index( + ["31/12/2014", "03/30/2011"], + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", + name="date", + ) # A. use dayfirst=True res5 = read_csv( @@ -2170,7 +2177,7 @@ def test_parse_dates_and_string_dtype(all_parsers): tm.assert_frame_equal(result, expected) -def test_parse_dot_separated_dates(all_parsers): +def test_parse_dot_separated_dates(all_parsers, using_infer_string): # https://github.com/pandas-dev/pandas/issues/2586 parser = all_parsers data = """a,b @@ -2179,7 +2186,7 @@ def test_parse_dot_separated_dates(all_parsers): if parser.engine == "pyarrow": expected_index = Index( ["27.03.2003 14:55:00.000", "03.08.2003 15:20:00.000"], - dtype="object", + dtype="object" if not using_infer_string else "string[pyarrow_numpy]", name="a", ) warn = None diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 480d579f7f400..8bde6d25d2ad7 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -26,6 +26,7 @@ ArrowStringArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics from pandas.io.common import urlopen from pandas.io.parsers import ( @@ -960,9 +961,13 @@ def test_widths_and_usecols(): tm.assert_frame_equal(result, expected) -def test_dtype_backend(string_storage, dtype_backend): +def test_dtype_backend(string_storage, dtype_backend, using_infer_string): # GH#50289 - if string_storage == "python": + if using_infer_string: + pa = pytest.importorskip("pyarrow") + arr = 
ArrowStringArrayNumpySemantics(pa.array(["a", "b"])) + arr_na = ArrowStringArrayNumpySemantics(pa.array([None, "a"])) + elif string_storage == "python": arr = StringArray(np.array(["a", "b"], dtype=np.object_)) arr_na = StringArray(np.array([pd.NA, "a"], dtype=np.object_)) else: diff --git a/pandas/tests/io/parser/test_upcast.py b/pandas/tests/io/parser/test_upcast.py index bc4c4c2e24e9c..43e6acb37b0ae 100644 --- a/pandas/tests/io/parser/test_upcast.py +++ b/pandas/tests/io/parser/test_upcast.py @@ -16,6 +16,7 @@ IntegerArray, StringArray, ) +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics def test_maybe_upcast(any_real_numpy_dtype): @@ -85,7 +86,7 @@ def test_maybe_upcaste_all_nan(): @pytest.mark.parametrize("val", [na_values[np.object_], "c"]) -def test_maybe_upcast_object(val, string_storage): +def test_maybe_upcast_object(val, string_storage, using_infer_string): # GH#36712 pa = pytest.importorskip("pyarrow") @@ -93,7 +94,10 @@ def test_maybe_upcast_object(val, string_storage): arr = np.array(["a", "b", val], dtype=np.object_) result = _maybe_upcast(arr, use_dtype_backend=True) - if string_storage == "python": + if using_infer_string: + exp_val = "c" if val == "c" else None + expected = ArrowStringArrayNumpySemantics(pa.array(["a", "b", exp_val])) + elif string_storage == "python": exp_val = "c" if val == "c" else NA expected = StringArray(np.array(["a", "b", exp_val], dtype=np.object_)) else: