Added documentation and a test case for the newlines_in_values case.

wooseogchoi · wooseogchoi · commit 25c260412221 · 2024-09-22T21:58:54.000-04:00
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -247,7 +247,11 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
     .. versionadded:: 1.4.0
 
         The 'pyarrow' engine was added as an *experimental* engine, and some features
-        are unsupported, or may not work correctly, with this engine.
+        are unsupported, or may not work correctly, with this engine. For example,
+        the ``newlines_in_values`` option of pyarrow's ``ParseOptions`` allows
+        newline characters within values when parsing CSV files, but this option
+        is not currently supported by pandas. In that case, use pyarrow's ``csv``
+        module directly instead. For more information, see the example below.
 converters : dict of {{Hashable : Callable}}, optional
     Functions for converting values in specified columns. Keys can either
     be column labels or column indices.
@@ -545,12 +549,25 @@ class _read_shared(TypedDict, Generic[HashableT], total=False):
 ...     parse_dates=[1, 2],
 ...     date_format={{'col 2': '%d/%m/%Y', 'col 3': '%a %d %b %Y'}},
 ... )  # doctest: +SKIP
-
 >>> df.dtypes  # doctest: +SKIP
 col 1             int64
 col 2    datetime64[ns]
 col 3    datetime64[ns]
 dtype: object
+
+When values contain newline characters, use the ``csv`` module of pyarrow directly.
+
+>>> from pyarrow import csv  # doctest: +SKIP
+>>> parse_options = csv.ParseOptions(newlines_in_values=True)  # doctest: +SKIP
+>>> table = csv.read_csv("./example.csv", parse_options=parse_options)  # doctest: +SKIP
+>>> df = table.to_pandas()  # doctest: +SKIP
+>>> df.head()  # doctest: +SKIP
+     text  idx
+0  ab\ncd    0
+1  ab\ncd    1
+2  ab\ncd    2
+3  ab\ncd    3
+4  ab\ncd    4
 """  # noqa: E501
 
 
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -19,6 +19,7 @@
 
 from pandas.io.parsers import read_csv
 import pandas.io.parsers.readers as parsers
+from pandas.core.frame import DataFrame
 
 pytestmark = pytest.mark.filterwarnings(
     "ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
@@ -150,6 +151,21 @@ def test_pyarrow_engine(self):
             with pytest.raises(ValueError, match=msg):
                 read_csv(StringIO(data), engine="pyarrow", **kwargs)
 
+    def test_pyarrow_newlines_in_values(self):
+        # Values spanning multiple lines must raise with engine="pyarrow".
+        msg = (
+            "CSV parser got out of sync with chunker. "
+            "This can mean the data file contains cell values spanning multiple lines; "
+            "please consider enabling the option 'newlines_in_values'."
+        )
+        df = DataFrame({"text": "ab\ncd", "idx": range(1_000_000)})
+        df.to_csv("test.csv", index=False)
+        try:
+            with pytest.raises(ValueError, match=msg):
+                read_csv("test.csv", engine="pyarrow")
+        finally:  # remove the file even when the assertion above fails
+            os.unlink("test.csv")
+
     def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
         # GH 5686
         # GH 54643