diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index ffc2690a5efdf..43d264e29aa56 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -247,7 +247,11 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): .. versionadded:: 1.4.0 The 'pyarrow' engine was added as an *experimental* engine, and some features - are unsupported, or may not work correctly, with this engine. + are unsupported, or may not work correctly, with this engine. For example, + pyarrow's ``ParseOptions(newlines_in_values=True)`` allows newline characters + within values when parsing CSV files, but pandas does not currently expose + this option. In that case, use the ``pyarrow.csv`` module directly instead. + See the Examples section below for details. converters : dict of {{Hashable : Callable}}, optional Functions for converting values in specified columns. Keys can either be column labels or column indices. @@ -545,12 +549,26 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): ... parse_dates=[1, 2], ... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}}, ... ) # doctest: +SKIP - >>> df.dtypes # doctest: +SKIP col 1 int64 col 2 datetime64[ns] col 3 datetime64[ns] dtype: object + +The ``pyarrow.csv`` module must be used directly if values in the file contain +newline characters. 
+ +>>> from pyarrow import csv # doctest: +SKIP +>>> parse_options = csv.ParseOptions(newlines_in_values=True) # doctest: +SKIP +>>> table = csv.read_csv("example.csv", parse_options=parse_options) # doctest: +SKIP +>>> df = table.to_pandas() # doctest: +SKIP +>>> df.head() # doctest: +SKIP + text idx +0 ab\ncd 0 +1 ab\ncd 1 +2 ab\ncd 2 +3 ab\ncd 3 +4 ab\ncd 4 """ # noqa: E501 diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 07f84466e3ac2..94f71bf38ec43 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -13,9 +13,11 @@ import pytest +from pandas.compat.pyarrow import pa_version_under18p0 from pandas.errors import ParserError import pandas._testing as tm +from pandas.core.frame import DataFrame from pandas.io.parsers import read_csv import pandas.io.parsers.readers as parsers @@ -150,6 +152,27 @@ def test_pyarrow_engine(self): with pytest.raises(ValueError, match=msg): read_csv(StringIO(data), engine="pyarrow", **kwargs)
+    @pytest.mark.skipif(not pa_version_under18p0, reason="No ParserError raised")
+    def test_pyarrow_newlines_in_values(self, tmp_path):
+        # Values with embedded newlines are not handled by the pyarrow engine;
+        # on pyarrow < 18 this is expected to surface as a ParserError.
+        pytest.importorskip("pyarrow")
+        msg = (
+            "CSV parser got out of sync with chunker. "
+            "This can mean the data file contains cell values spanning multiple "
+            "lines; please consider enabling the option 'newlines_in_values'."
+        )
+        # NOTE(review): presumably the large row count is needed so the file
+        # spans several pyarrow read blocks -- confirm before shrinking it.
+        rows = [{"text": "ab\ncd", "idx": idx} for idx in range(1_000_000)]
+        # Write under pytest's tmp_path so the file is cleaned up even when the
+        # assertion fails, and so parallel workers never share a path.
+        path = tmp_path / "test.csv"
+        DataFrame(rows).to_csv(path, index=False)
+
+        with pytest.raises(ParserError, match=msg):
+            read_csv(path, engine="pyarrow")
+
 def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers): # GH 5686 # GH 54643