Skip to content

Commit 25c2604

Browse files
committed
Added document and a test case for newlines_in_values case.
1 parent 71b395f commit 25c2604

File tree

2 files changed

+35
-2
lines changed

2 files changed

+35
-2
lines changed

pandas/io/parsers/readers.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,11 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
247247
.. versionadded:: 1.4.0
248248
249249
The 'pyarrow' engine was added as an *experimental* engine, and some features
250-
are unsupported, or may not work correctly, with this engine.
250+
are unsupported, or may not work correctly, with this engine. For example,
251+
pyarrow's ``ParseOptions(newlines_in_values=True)`` option, which allows newline
252+
characters inside values when parsing CSV files, is not currently exposed by
253+
pandas. In that case, use the ``pyarrow.csv`` module directly instead. For more
254+
information, see the Examples section.
251255
converters : dict of {{Hashable : Callable}}, optional
252256
Functions for converting values in specified columns. Keys can either
253257
be column labels or column indices.
@@ -545,12 +549,25 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
545549
... parse_dates=[1, 2],
546550
... date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}},
547551
... ) # doctest: +SKIP
548-
549552
>>> df.dtypes # doctest: +SKIP
550553
col 1 int64
551554
col 2 datetime64[ns]
552555
col 3 datetime64[ns]
553556
dtype: object
557+
558+
Use ``pyarrow.csv`` directly if values contain newline characters.
559+
560+
>>> from pyarrow import csv  # doctest: +SKIP
561+
>>> parse_options = csv.ParseOptions(newlines_in_values=True)  # doctest: +SKIP
562+
>>> table = csv.read_csv("./example.csv", parse_options=parse_options)  # doctest: +SKIP
563+
>>> df = table.to_pandas()  # doctest: +SKIP
564+
>>> df.head()  # doctest: +SKIP
565+
text idx
566+
0 ab\ncd 0
567+
1 ab\ncd 1
568+
2 ab\ncd 2
569+
3 ab\ncd 3
570+
4 ab\ncd 4
554571
""" # noqa: E501
555572

556573

pandas/tests/io/parser/test_unsupported.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from pandas.io.parsers import read_csv
2121
import pandas.io.parsers.readers as parsers
22+
from pandas.core.frame import DataFrame
2223

2324
pytestmark = pytest.mark.filterwarnings(
2425
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -150,6 +151,21 @@ def test_pyarrow_engine(self):
150151
with pytest.raises(ValueError, match=msg):
151152
read_csv(StringIO(data), engine="pyarrow", **kwargs)
152153

154+
def test_pyarrow_newlines_in_values(self):
    """Check that ``engine="pyarrow"`` raises on values containing newlines.

    A CSV file whose ``text`` column holds embedded newline characters is
    written to a temporary directory and read back with the pyarrow engine;
    the read is expected to raise ``ValueError`` with pyarrow's
    "out of sync with chunker" message.
    """
    # Function-scope imports keep the test self-contained.
    import re
    import tempfile
    from pathlib import Path

    # The error under test is produced by pyarrow itself, so skip when it
    # is not installed instead of failing with an ImportError.
    pytest.importorskip("pyarrow")

    msg = (
        "CSV parser got out of sync with chunker. "
        "This can mean the data file contains cell values spanning multiple lines; "
        "please consider enabling the option 'newlines_in_values'."
    )
    # NOTE(review): the row count is kept from the original test; presumably
    # the file must be large enough to span several pyarrow read blocks for
    # the chunker error to trigger - confirm before lowering it.
    n_rows = 1_000_000
    df = DataFrame({"text": ["ab\ncd"] * n_rows, "idx": range(n_rows)})

    # A temporary directory guarantees cleanup even when the assertion fails
    # and avoids writing into the current working directory.
    with tempfile.TemporaryDirectory() as tmp_dir:
        path = Path(tmp_dir) / "test.csv"
        df.to_csv(path, index=False)
        # re.escape: the message contains regex metacharacters ('.').
        with pytest.raises(ValueError, match=re.escape(msg)):
            read_csv(path, engine="pyarrow")
168+
153169
def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
154170
# GH 5686
155171
# GH 54643

0 commit comments

Comments
 (0)