diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 448ceffdaa1eb..db3e65c64456a 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -217,6 +217,7 @@ Other enhancements - Added half-year offset classes :class:`HalfYearBegin`, :class:`HalfYearEnd`, :class:`BHalfYearBegin` and :class:`BHalfYearEnd` (:issue:`60928`) - Added support for ``axis=1`` with ``dict`` or :class:`Series` arguments into :meth:`DataFrame.fillna` (:issue:`4514`) - Added support to read and write from and to Apache Iceberg tables with the new :func:`read_iceberg` and :meth:`DataFrame.to_iceberg` functions (:issue:`61383`) +- Allow using pyarrow to serialize :class:`DataFrame` and :class:`Series` to CSV with ``engine="pyarrow"`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` (:issue:`53618`) - Errors occurring during SQL I/O will now throw a generic :class:`.DatabaseError` instead of the raw Exception type from the underlying driver manager library (:issue:`60748`) - Implemented :meth:`Series.str.isascii` and :meth:`Series.str.isascii` (:issue:`59091`) - Improve the resulting dtypes in :meth:`DataFrame.where` and :meth:`DataFrame.mask` with :class:`ExtensionDtype` ``other`` (:issue:`62038`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0154087b18399..6fb04c8d67a4f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3726,6 +3726,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + engine: str = "python", ) -> str: ... @overload @@ -3753,6 +3754,7 @@ def to_csv( decimal: str = ..., errors: OpenFileErrors = ..., storage_options: StorageOptions = ..., + engine: str = "python", ) -> None: ... 
@final @@ -3771,7 +3773,7 @@ def to_csv( header: bool | list[str] = True, index: bool = True, index_label: IndexLabel | None = None, - mode: str = "w", + mode: str | None = None, encoding: str | None = None, compression: CompressionOptions = "infer", quoting: int | None = None, @@ -3784,6 +3786,7 @@ def to_csv( decimal: str = ".", errors: OpenFileErrors = "strict", storage_options: StorageOptions | None = None, + engine: str = "python", ) -> str | None: r""" Write object to a comma-separated values (csv) file. @@ -3816,7 +3819,7 @@ def to_csv( sequence should be given if the object uses MultiIndex. If False do not print fields for index names. Use index_label=False for easier importing in R. - mode : {{'w', 'x', 'a'}}, default 'w' + mode : {{'w', 'x', 'a'}}, default 'w' (Python engine) or 'wb' (Pyarrow engine) Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control the file opening. Typical values include: @@ -3824,6 +3827,9 @@ def to_csv( - 'x', exclusive creation, failing if the file already exists. - 'a', append to the end of file if it exists. + .. note:: + The pyarrow engine can only handle binary buffers. + encoding : str, optional A string representing the encoding to use in the output file, defaults to 'utf-8'. `encoding` is not supported if `path_or_buf` @@ -3871,6 +3877,16 @@ def to_csv( {storage_options} + engine : str, default 'python' + The engine to use. Available options are "pyarrow" or "python". + The pyarrow engine requires the pyarrow library to be installed + and is generally faster than the python engine. + + However, the python engine may be more feature complete than the + pyarrow engine. + + .. 
versionadded:: 3.0.0 + Returns ------- None or str @@ -3934,8 +3950,14 @@ def to_csv( decimal=decimal, ) + if mode is None: + mode = "w" + if engine == "pyarrow": + mode += "b" + return DataFrameRenderer(formatter).to_csv( path_or_buf, + engine=engine, lineterminator=lineterminator, sep=sep, encoding=encoding, diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 762fee465c008..51a55da9bb2cd 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -11,10 +11,13 @@ Sequence, ) import csv as csvlib +import io import os from typing import ( + IO, TYPE_CHECKING, Any, + AnyStr, cast, ) @@ -22,6 +25,7 @@ from pandas._libs import writers as libwriters from pandas._typing import SequenceNotStr +from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import cache_readonly from pandas.core.dtypes.generic import ( @@ -60,6 +64,7 @@ def __init__( self, formatter: DataFrameFormatter, path_or_buf: FilePath | WriteBuffer[str] | WriteBuffer[bytes] = "", + engine: str = "python", sep: str = ",", cols: Sequence[Hashable] | None = None, index_label: IndexLabel | None = None, @@ -81,6 +86,7 @@ def __init__( self.obj = self.fmt.frame self.filepath_or_buffer = path_or_buf + self.engine = engine self.encoding = encoding self.compression: CompressionOptions = compression self.mode = mode @@ -247,6 +253,11 @@ def save(self) -> None: """ Create the writer & save. 
""" + if self.engine == "pyarrow" and ( + "b" not in self.mode or isinstance(self.filepath_or_buffer, io.TextIOBase) + ): + raise ValueError("The pyarrow engine can only open files in binary mode.") + # apply compression and byte/text conversion with get_handle( self.filepath_or_buffer, @@ -255,12 +266,89 @@ def save(self) -> None: errors=self.errors, compression=self.compression, storage_options=self.storage_options, + # pyarrow engine exclusively writes bytes + is_text=self.engine == "python", ) as handles: # Note: self.encoding is irrelevant here - # error: Argument "quoting" to "writer" has incompatible type "int"; - # expected "Literal[0, 1, 2, 3]" + + # This is a mypy bug? + # error: Cannot infer type argument 1 of "_save" of "CSVFormatter" [misc] + self._save(handles.handle) # type: ignore[misc] + + def _save_pyarrow(self, handle: IO[AnyStr]) -> None: + pa = import_optional_dependency("pyarrow") + pa_csv = import_optional_dependency("pyarrow.csv") + + if self.quotechar is not None and self.quotechar != '"': + raise ValueError('The pyarrow engine only supports " as a quotechar.') + + unsupported_options = [ + # each pair is (option value, default, option name) + (self.decimal, ".", "decimal"), + (self.float_format, None, "float_format"), + (self.na_rep, "", "na_rep"), + (self.date_format, None, "date_foramt"), + (self.lineterminator, os.linesep, "lineterminator"), + (self.encoding, None, "encoding"), + (self.errors, "strict", "errors"), + (self.escapechar, None, "escapechar"), + ] + + for opt_val, default, option in unsupported_options: + if opt_val != default: + raise ValueError( + f"The {option} option is not supported with the pyarrow engine." 
+ ) + + # Convert index to column and rename name to empty string + # since we serialize the index as basically a column with no name + # TODO: this won't work for multi-indexes (without names) + obj = self.obj + if self.index: + new_names = [ + label if label is not None else "" for label in self.obj.index.names + ] + obj = self.obj.reset_index(names=new_names) + + table = pa.Table.from_pandas(obj) + + # Map quoting arg to pyarrow equivalents + if self.quoting == csvlib.QUOTE_MINIMAL: + pa_quoting = "needed" + elif self.quotechar is None: + raise TypeError("quotechar must be set if quoting enabled") + elif self.quoting == csvlib.QUOTE_ALL: + # TODO: Is this a 1-1 mapping? + # This doesn't quote nulls, check if Python does this + pa_quoting = "all_valid" + elif self.quoting == csvlib.QUOTE_NONE: + pa_quoting = "none" + else: + raise NotImplementedError( + f"Quoting option {self.quoting} is not supported with engine='pyarrow'" + ) + + kwargs: dict[str, Any] = { + "include_header": self._need_to_save_header, + "batch_size": self.chunksize, + } + kwargs["delimiter"] = self.sep + kwargs["quoting_style"] = pa_quoting + + write_options = pa_csv.WriteOptions(**kwargs) + pa_csv.write_csv(table, handle, write_options) + + def _save(self, handle: IO[AnyStr]) -> None: + if self.engine == "pyarrow": + self._save_pyarrow(handle) + else: self.writer = csvlib.writer( - handles.handle, + # error: Argument of type "IO[AnyStr@_save]" cannot be assigned + # to parameter "csvfile" of type "SupportsWrite[str]" + # in function "writer" + # error: Argument "quoting" to "writer" has incompatible type "int"; + # expected "Literal[0, 1, 2, 3]" + handle, # type: ignore[arg-type] lineterminator=self.lineterminator, delimiter=self.sep, quoting=self.quoting, # type: ignore[arg-type] @@ -268,13 +356,9 @@ def save(self) -> None: escapechar=self.escapechar, quotechar=self.quotechar, ) - - self._save() - - def _save(self) -> None: - if self._need_to_save_header: - self._save_header() - 
self._save_body() + if self._need_to_save_header: + self._save_header() + self._save_body() def _save_header(self) -> None: if not self.has_mi_columns or self._has_aliases: diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d72b6cd89b940..98cd96e5a8336 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -16,7 +16,10 @@ from csv import QUOTE_NONE from decimal import Decimal from functools import partial -from io import StringIO +from io import ( + BytesIO, + StringIO, +) import math import re from shutil import get_terminal_size @@ -24,6 +27,7 @@ TYPE_CHECKING, Any, Final, + Union, cast, ) @@ -980,6 +984,7 @@ def to_string( def to_csv( self, path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, + engine: str = "python", encoding: str | None = None, sep: str = ",", columns: Sequence[Hashable] | None = None, @@ -1003,12 +1008,13 @@ def to_csv( if path_or_buf is None: created_buffer = True - path_or_buf = StringIO() + path_or_buf = StringIO() if engine == "python" else BytesIO() else: created_buffer = False csv_formatter = CSVFormatter( path_or_buf=path_or_buf, + engine=engine, lineterminator=lineterminator, sep=sep, encoding=encoding, @@ -1029,8 +1035,12 @@ def to_csv( csv_formatter.save() if created_buffer: - assert isinstance(path_or_buf, StringIO) + path_or_buf = cast(Union[BytesIO, StringIO], path_or_buf) content = path_or_buf.getvalue() + if isinstance(content, bytes): + # Need to decode into string since the + # pyarrow engine only writes binary data + content = content.decode("utf-8") path_or_buf.close() return content diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index f70875172ccc8..3ed4044c2c68d 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,4 @@ +import contextlib import io import os import sys @@ -16,8 +17,45 @@ import pandas._testing as tm +@pytest.fixture(params=["python", 
"pyarrow"]) +def engine(request): + if request.param == "pyarrow": + pytest.importorskip("pyarrow") + return request.param + + +@pytest.fixture +def pyarrow_xfail(request): + """ + Fixture that xfails a test if the engine is pyarrow. + """ + engine = request.getfixturevalue("engine") + if engine == "pyarrow": + mark = pytest.mark.xfail(reason="pyarrow doesn't support this.") + request.applymarker(mark) + + +def check_raises_if_pyarrow(option, engine): + """ + Returns a context manager that ensures that the pyarrow engine raises an + exception for unsupported options. + """ + if engine == "pyarrow": + raises_if_pyarrow = pytest.raises( + ValueError, + match=f"The {option} option is not supported with the pyarrow engine.", + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + return raises_if_pyarrow + + +xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail") + + class TestToCSV: - def test_to_csv_with_single_column(self, temp_file): + @xfail_pyarrow + def test_to_csv_with_single_column(self, temp_file, engine): # see gh-18676, https://bugs.python.org/issue32255 # # Python's CSV library adds an extraneous '""' @@ -30,7 +68,7 @@ def test_to_csv_with_single_column(self, temp_file): "" 1.0 """ - df1.to_csv(temp_file, header=None, index=None) + df1.to_csv(temp_file, header=None, index=None, engine=engine) with open(temp_file, encoding="utf-8") as f: assert f.read() == expected1 @@ -39,19 +77,19 @@ def test_to_csv_with_single_column(self, temp_file): 1.0 "" """ - df2.to_csv(temp_file, header=None, index=None) + df2.to_csv(temp_file, header=None, index=None, engine=engine) with open(temp_file, encoding="utf-8") as f: assert f.read() == expected2 - def test_to_csv_default_encoding(self, temp_file): + def test_to_csv_default_encoding(self, temp_file, engine): # GH17097 df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]}) - # the default to_csv encoding is uft-8. - df.to_csv(temp_file) + # the default to_csv encoding is utf-8. 
+ df.to_csv(temp_file, engine=engine) tm.assert_frame_equal(pd.read_csv(temp_file, index_col=0), df) - def test_to_csv_quotechar(self, temp_file): + def test_to_csv_quotechar(self, temp_file, engine): df = DataFrame({"col": [1, 2]}) expected = """\ "","col" @@ -59,7 +97,7 @@ def test_to_csv_quotechar(self, temp_file): "1","2" """ - df.to_csv(temp_file, quoting=1) # 1=QUOTE_ALL + df.to_csv(temp_file, quoting=1, engine=engine) # 1=QUOTE_ALL with open(temp_file, encoding="utf-8") as f: assert f.read() == expected @@ -69,12 +107,21 @@ def test_to_csv_quotechar(self, temp_file): $1$,$2$ """ - df.to_csv(temp_file, quoting=1, quotechar="$") - with open(temp_file, encoding="utf-8") as f: - assert f.read() == expected + if engine == "pyarrow": + raises_if_pyarrow = pytest.raises( + ValueError, + match='The pyarrow engine only supports " as a quotechar.', + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + with raises_if_pyarrow: + df.to_csv(temp_file, quoting=1, quotechar="$", engine=engine) + if engine != "pyarrow": + with open(temp_file, encoding="utf-8") as f: + assert f.read() == expected with pytest.raises(TypeError, match="quotechar"): - df.to_csv(temp_file, quoting=1, quotechar=None) + df.to_csv(temp_file, quoting=1, quotechar=None, engine=engine) - def test_to_csv_doublequote(self, temp_file): + def test_to_csv_doublequote(self, temp_file, engine): df = DataFrame({"col": ['a"a', '"bb"']}) expected = '''\ @@ -84,14 +131,15 @@ def test_to_csv_doublequote(self, temp_file): "1","""bb""" ''' - df.to_csv(temp_file, quoting=1, doublequote=True) # QUOTE_ALL + df.to_csv(temp_file, quoting=1, doublequote=True, engine=engine) # QUOTE_ALL with open(temp_file, encoding="utf-8") as f: assert f.read() == expected with pytest.raises(Error, match="escapechar"): - df.to_csv(temp_file, doublequote=False) # no escapechar set + df.to_csv(temp_file, doublequote=False, engine=engine) # no escapechar set - def test_to_csv_escapechar(self, temp_file): + def test_to_csv_escapechar(self, temp_file, engine): + raises_if_pyarrow =
check_raises_if_pyarrow("escapechar", engine) df = DataFrame({"col": ['a"a', '"bb"']}) expected = """\ "","col" @@ -99,9 +147,12 @@ def test_to_csv_escapechar(self, temp_file): "1","\\"bb\\"" """ - df.to_csv(temp_file, quoting=1, doublequote=False, escapechar="\\") - with open(temp_file, encoding="utf-8") as f: - assert f.read() == expected + with raises_if_pyarrow: + df.to_csv( + temp_file, quoting=1, doublequote=False, escapechar="\\", engine=engine + ) + with open(temp_file, encoding="utf-8") as f: + assert f.read() == expected df = DataFrame({"col": ["a,a", ",bb,"]}) expected = """\ @@ -110,106 +161,154 @@ def test_to_csv_escapechar(self, temp_file): 1,\\,bb\\, """ - df.to_csv(temp_file, quoting=3, escapechar="\\") # QUOTE_NONE - with open(temp_file, encoding="utf-8") as f: - assert f.read() == expected + with raises_if_pyarrow: + df.to_csv( + temp_file, quoting=3, escapechar="\\", engine=engine + ) # QUOTE_NONE + with open(temp_file, encoding="utf-8") as f: + assert f.read() == expected - def test_csv_to_string(self): + @xfail_pyarrow + def test_csv_to_string(self, engine): df = DataFrame({"col": [1, 2]}) expected_rows = [",col", "0,1", "1,2"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv() == expected + assert df.to_csv(engine=engine) == expected - def test_to_csv_decimal(self): + @xfail_pyarrow + def test_to_csv_decimal(self, engine): # see gh-781 + raises_if_pyarrow = check_raises_if_pyarrow("decimal", engine) df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]}) expected_rows = [",col1,col2,col3", "0,1,a,10.1"] expected_default = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv() == expected_default + # This assert fails for the pyarrow engine since it quotes strings + # and the Python engine doesn't + assert df.to_csv(engine=engine) == expected_default expected_rows = [";col1;col2;col3", "0;1;a;10,1"] expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows) - assert 
df.to_csv(decimal=",", sep=";") == expected_european_excel + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, decimal=",", sep=";") + == expected_european_excel + ) expected_rows = [",col1,col2,col3", "0,1,a,10.10"] expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(float_format="%.2f") == expected_float_format_default + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, float_format="%.2f") + == expected_float_format_default + ) expected_rows = [";col1;col2;col3", "0;1;a;10,10"] expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows) - assert ( - df.to_csv(decimal=",", sep=";", float_format="%.2f") - == expected_float_format - ) + with raises_if_pyarrow: + assert ( + df.to_csv(engine=engine, decimal=",", sep=";", float_format="%.2f") + == expected_float_format + ) # see gh-11553: testing if decimal is taken into account for '0.0' df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False, decimal="^") == expected + with raises_if_pyarrow: + assert df.to_csv(engine=engine, index=False, decimal="^") == expected # same but for an index - assert df.set_index("a").to_csv(decimal="^") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, decimal="^") == expected # same for a multi-index - assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, decimal="^") == expected + ) - def test_to_csv_float_format(self): + def test_to_csv_float_format(self, engine): # testing if float_format is taken into account for the index # GH 11553 + raises_if_pyarrow = check_raises_if_pyarrow("float_format", engine) df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1}) expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"] expected = 
tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(float_format="%.2f") == expected + with raises_if_pyarrow: + assert ( + df.set_index("a").to_csv(engine=engine, float_format="%.2f") == expected + ) # same for a multi-index - assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, float_format="%.2f") + == expected + ) - def test_to_csv_na_rep(self): + def test_to_csv_na_rep(self, engine): # see gh-11553 # # Testing if NaN values are correctly represented in the index. + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) df = DataFrame({"a": [0, np.nan], "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) # now with an index containing only NaNs df = DataFrame({"a": np.nan, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "_,0,2", "_,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected - assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) # check if na_rep parameter does not break anything when no NaN df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]}) expected_rows = ["a,b,c", "0,0,2", "0,1,3"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.set_index("a").to_csv(na_rep="_") == expected - assert 
df.set_index(["a", "b"]).to_csv(na_rep="_") == expected + with raises_if_pyarrow: + assert df.set_index("a").to_csv(engine=engine, na_rep="_") == expected + with raises_if_pyarrow: + assert ( + df.set_index(["a", "b"]).to_csv(engine=engine, na_rep="_") == expected + ) - csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ") - expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) - assert expected == csv + with raises_if_pyarrow: + csv = pd.Series(["a", pd.NA, "c"]).to_csv(engine=engine, na_rep="ZZZZZ") + expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) + assert expected == csv - def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype): + def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype, engine): # GH 29975 # Make sure full na_rep shows up when a dtype is provided + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"]) - csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( - na_rep="ZZZZZ" - ) - assert expected == csv - - def test_to_csv_date_format(self): + with raises_if_pyarrow: + csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv( + engine=engine, na_rep="ZZZZZ" + ) + assert expected == csv + + @xfail_pyarrow + def test_to_csv_date_format(self, engine): # GH 10209 + raises_if_pyarrow = check_raises_if_pyarrow("date_format", engine) df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")}) df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="D")}) @@ -222,7 +321,7 @@ def test_to_csv_date_format(self): "4,2013-01-01 00:00:04", ] expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv() == expected_default_sec + assert df_sec.to_csv(engine=engine) == expected_default_sec expected_rows = [ ",A", @@ -233,7 +332,11 @@ def test_to_csv_date_format(self): "4,2013-01-05 00:00:00", ] expected_ymdhms_day = 
tm.convert_rows_list_to_csv_str(expected_rows) - assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day + with raises_if_pyarrow: + assert ( + df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S", engine=engine) + == expected_ymdhms_day + ) expected_rows = [ ",A", @@ -244,7 +347,10 @@ def test_to_csv_date_format(self): "4,2013-01-01", ] expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec + with raises_if_pyarrow: + assert ( + df_sec.to_csv(date_format="%Y-%m-%d", engine=engine) == expected_ymd_sec + ) expected_rows = [ ",A", @@ -255,8 +361,13 @@ def test_to_csv_date_format(self): "4,2013-01-05", ] expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows) - assert df_day.to_csv() == expected_default_day - assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day + with raises_if_pyarrow: + assert df_day.to_csv(engine=engine) == expected_default_day + with raises_if_pyarrow: + assert ( + df_day.to_csv(date_format="%Y-%m-%d", engine=engine) + == expected_default_day + ) # see gh-7791 # @@ -269,9 +380,14 @@ def test_to_csv_date_format(self): expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows) df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"]) - assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec - - def test_to_csv_different_datetime_formats(self): + with raises_if_pyarrow: + assert ( + df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d", engine=engine) + == expected_ymd_sec + ) + + @xfail_pyarrow + def test_to_csv_different_datetime_formats(self, engine): # GH#21734 df = DataFrame( { @@ -285,14 +401,16 @@ def test_to_csv_different_datetime_formats(self): "1970-01-01,1970-01-01 01:00:00", ] expected = tm.convert_rows_list_to_csv_str(expected_rows) - assert df.to_csv(index=False) == expected + assert df.to_csv(index=False, engine=engine) == expected - def 
test_to_csv_date_format_in_categorical(self): + @xfail_pyarrow + def test_to_csv_date_format_in_categorical(self, engine): # GH#40754 + raises_if_pyarrow = check_raises_if_pyarrow("date_format", engine) ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d")) ser = ser.astype("category") expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""']) - assert ser.to_csv(index=False) == expected + assert ser.to_csv(index=False, engine=engine) == expected ser = pd.Series( pd.date_range( @@ -300,39 +418,47 @@ def test_to_csv_date_format_in_categorical(self): ).append(pd.DatetimeIndex([pd.NaT])) ) ser = ser.astype("category") - assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected + with raises_if_pyarrow: + assert ( + ser.to_csv(index=False, engine=engine, date_format="%Y-%m-%d") + == expected + ) - def test_to_csv_float_ea_float_format(self): + def test_to_csv_float_ea_float_format(self, engine): # GH#45991 + raises_if_pyarrow = check_raises_if_pyarrow("float_format", engine) df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) df["a"] = df["a"].astype("Float64") - result = df.to_csv(index=False, float_format="%.5f") - expected = tm.convert_rows_list_to_csv_str( - ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"] - ) - assert result == expected - - def test_to_csv_float_ea_no_float_format(self): + with raises_if_pyarrow: + result = df.to_csv(index=False, engine=engine, float_format="%.5f") + expected = tm.convert_rows_list_to_csv_str( + ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"] + ) + assert result == expected + + @xfail_pyarrow + def test_to_csv_float_ea_no_float_format(self, engine): # GH#45991 df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"}) df["a"] = df["a"].astype("Float64") - result = df.to_csv(index=False) + result = df.to_csv(index=False, engine=engine) expected = tm.convert_rows_list_to_csv_str( ["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"] ) assert result == expected - def 
test_to_csv_multi_index(self): + @xfail_pyarrow + def test_to_csv_multi_index(self, engine): # see gh-6618 df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]])) exp_rows = [",1", ",2", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp df = DataFrame( [1], @@ -342,22 +468,23 @@ def test_to_csv_multi_index(self): exp_rows = [",,1", ",,2", "1,2,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["1", "2", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]])) exp_rows = [",foo", ",bar", "0,1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv() == exp + assert df.to_csv(engine=engine) == exp exp_rows = ["foo", "bar", "1"] exp = tm.convert_rows_list_to_csv_str(exp_rows) - assert df.to_csv(index=False) == exp + assert df.to_csv(index=False, engine=engine) == exp + @xfail_pyarrow @pytest.mark.parametrize( "ind,expected", [ @@ -373,15 +500,17 @@ def test_to_csv_multi_index(self): ), ], ) - def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series): + def test_to_csv_single_level_multi_index( + self, ind, expected, frame_or_series, engine + ): # see gh-19589 obj = frame_or_series(pd.Series([1], ind, name="data")) - - result = obj.to_csv(lineterminator="\n", header=True) + result = obj.to_csv(lineterminator="\n", header=True, engine=engine) assert result == expected - def test_to_csv_string_array_ascii(self, temp_file): + def test_to_csv_string_array_ascii(self, temp_file, engine): # GH 10813 + raises_if_pyarrow = 
check_raises_if_pyarrow("encoding", engine) str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) expected_ascii = """\ @@ -389,11 +518,13 @@ def test_to_csv_string_array_ascii(self, temp_file): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - df.to_csv(temp_file, encoding="ascii") - with open(temp_file, encoding="utf-8") as f: - assert f.read() == expected_ascii + with raises_if_pyarrow: + df.to_csv(temp_file, encoding="ascii", engine=engine) + with open(temp_file, encoding="utf-8") as f: + assert f.read() == expected_ascii - def test_to_csv_string_array_utf8(self, temp_file): + @xfail_pyarrow + def test_to_csv_string_array_utf8(self, temp_file, engine): # GH 10813 str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}] df = DataFrame(str_array) @@ -402,12 +533,14 @@ def test_to_csv_string_array_utf8(self, temp_file): 0,"['foo', 'bar']" 1,"['baz', 'qux']" """ - df.to_csv(temp_file, encoding="utf-8") + df.to_csv(temp_file, encoding="utf-8", engine=engine) with open(temp_file, encoding="utf-8") as f: assert f.read() == expected_utf8 - def test_to_csv_string_with_lf(self, temp_file): + @xfail_pyarrow + def test_to_csv_string_with_lf(self, temp_file, engine): # GH 20353 + raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]} df = DataFrame(data) @@ -423,25 +556,29 @@ def test_to_csv_string_with_lf(self, temp_file): + b'3,"g\nh\n\ni"' + os_linesep ) - df.to_csv(temp_file, index=False) + df.to_csv(temp_file, index=False, engine=engine) with open(temp_file, "rb") as f: assert f.read() == expected_noarg # case 2: LF as line terminator expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n' - df.to_csv(temp_file, lineterminator="\n", index=False) - with open(temp_file, "rb") as f: - assert f.read() == expected_lf + with raises_if_pyarrow: + df.to_csv(temp_file, lineterminator="\n", index=False, engine=engine) + with open(temp_file, "rb") as 
f: + assert f.read() == expected_lf # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n' - df.to_csv(temp_file, lineterminator="\r\n", index=False) - with open(temp_file, "rb") as f: - assert f.read() == expected_crlf + with raises_if_pyarrow: + df.to_csv(temp_file, lineterminator="\r\n", index=False, engine=engine) + with open(temp_file, "rb") as f: + assert f.read() == expected_crlf - def test_to_csv_string_with_crlf(self, temp_file): + @xfail_pyarrow + def test_to_csv_string_with_crlf(self, temp_file, engine): # GH 20353 + raises_if_pyarrow = check_raises_if_pyarrow("lineterminator", engine) data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]} df = DataFrame(data) # case 1: The default line terminator(=os.linesep)(PR 21406) @@ -456,37 +593,41 @@ def test_to_csv_string_with_crlf(self, temp_file): + b'3,"g\r\nh\r\n\r\ni"' + os_linesep ) - df.to_csv(temp_file, index=False) + df.to_csv(temp_file, index=False, engine=engine) with open(temp_file, "rb") as f: assert f.read() == expected_noarg # case 2: LF as line terminator expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n' - df.to_csv(temp_file, lineterminator="\n", index=False) - with open(temp_file, "rb") as f: - assert f.read() == expected_lf + with raises_if_pyarrow: + df.to_csv(temp_file, lineterminator="\n", index=False, engine=engine) + with open(temp_file, "rb") as f: + assert f.read() == expected_lf # case 3: CRLF as line terminator # 'lineterminator' should not change inner element expected_crlf = ( b'int,str_crlf\r\n1,abc\r\n2,"d\r\nef"\r\n3,"g\r\nh\r\n\r\ni"\r\n' ) - df.to_csv(temp_file, lineterminator="\r\n", index=False) - with open(temp_file, "rb") as f: - assert f.read() == expected_crlf + with raises_if_pyarrow: + df.to_csv(temp_file, lineterminator="\r\n", index=False, engine=engine) + with open(temp_file, "rb") as f: + assert f.read() == expected_crlf 
- def test_to_csv_stdout_file(self, capsys): + @xfail_pyarrow + def test_to_csv_stdout_file(self, capsys, engine): # GH 21561 df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"]) expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"] expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows) - df.to_csv(sys.stdout, encoding="ascii") + df.to_csv(sys.stdout, encoding="ascii", engine=engine) captured = capsys.readouterr() assert captured.out == expected_ascii assert not sys.stdout.closed + @xfail_pyarrow @pytest.mark.xfail( compat.is_platform_windows(), reason=( @@ -495,7 +636,7 @@ def test_to_csv_stdout_file(self, capsys): "(https://docs.python.org/3/library/csv.html#csv.writer)" ), ) - def test_to_csv_write_to_open_file(self, temp_file): + def test_to_csv_write_to_open_file(self, temp_file, engine): # GH 21696 df = DataFrame({"a": ["x", "y", "z"]}) expected = """\ @@ -506,20 +647,38 @@ def test_to_csv_write_to_open_file(self, temp_file): """ with open(temp_file, "w", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None) + if engine == "pyarrow": + raise_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open files in binary mode.", + ) + else: + raise_if_pyarrow = contextlib.nullcontext() + with raise_if_pyarrow: + df.to_csv(f, header=None, index=None, engine=engine) with open(temp_file, encoding="utf-8") as f: assert f.read() == expected - def test_to_csv_write_to_open_file_with_newline_py3(self, temp_file): + @xfail_pyarrow + def test_to_csv_write_to_open_file_with_newline_py3(self, temp_file, engine): # see gh-21696 # see gh-20353 df = DataFrame({"a": ["x", "y", "z"]}) expected_rows = ["x", "y", "z"] expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows) + # TODO: Open in bytes mode for pyarrow with open(temp_file, "w", newline="", encoding="utf-8") as f: f.write("manual header\n") - df.to_csv(f, header=None, index=None) + if engine == 
"pyarrow": + raise_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open files in binary mode.", + ) + else: + raise_if_pyarrow = contextlib.nullcontext() + with raise_if_pyarrow: + df.to_csv(f, header=None, index=None, engine=engine) with open(temp_file, "rb") as f: assert f.read() == bytes(expected, "utf-8") @@ -533,6 +692,7 @@ def test_to_csv_compression( to_infer, compression_to_extension, temp_file, + engine, ): # see gh-15008 compression = compression_only @@ -543,11 +703,11 @@ def test_to_csv_compression( read_compression = "infer" if read_infer else compression path_ext = str(temp_file) + "." + compression_to_extension[compression] - df.to_csv(path_ext, compression=to_compression) + df.to_csv(path_ext, compression=to_compression, engine=engine) result = pd.read_csv(path_ext, index_col=0, compression=read_compression) tm.assert_frame_equal(result, df) - def test_to_csv_compression_dict(self, compression_only, temp_file): + def test_to_csv_compression_dict(self, compression_only, temp_file, engine): # GH 26023 method = compression_only df = DataFrame({"ABC": [1]}) @@ -557,28 +717,30 @@ def test_to_csv_compression_dict(self, compression_only, temp_file): }.get(method, method) path = str(temp_file) + "." 
+ extension - df.to_csv(path, compression={"method": method}) + df.to_csv(path, compression={"method": method}, engine=engine) read_df = pd.read_csv(path, index_col=0) tm.assert_frame_equal(read_df, df) - def test_to_csv_compression_dict_no_method_raises(self, temp_file): + def test_to_csv_compression_dict_no_method_raises(self, temp_file, engine): # GH 26023 df = DataFrame({"ABC": [1]}) compression = {"some_option": True} msg = "must have key 'method'" with pytest.raises(ValueError, match=msg): - df.to_csv(temp_file, compression=compression) + df.to_csv(temp_file, compression=compression, engine=engine) @pytest.mark.parametrize("compression", ["zip", "infer"]) @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"]) - def test_to_csv_zip_arguments(self, compression, archive_name, temp_file): + def test_to_csv_zip_arguments(self, compression, archive_name, temp_file, engine): # GH 26023 df = DataFrame({"ABC": [1]}) path = str(temp_file) + ".zip" df.to_csv( - path, compression={"method": compression, "archive_name": archive_name} + path, + compression={"method": compression, "archive_name": archive_name}, + engine=engine, ) with ZipFile(path) as zp: assert len(zp.filelist) == 1 @@ -595,33 +757,38 @@ def test_to_csv_zip_arguments(self, compression, archive_name, temp_file): ("archive.zip", "archive"), ], ) - def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname): + def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname, engine): # GH 39465 df = DataFrame({"ABC": [1]}) path = tmp_path / filename - df.to_csv(path, compression="zip") + df.to_csv(path, compression="zip", engine=engine) with ZipFile(path) as zp: assert len(zp.filelist) == 1 archived_file = zp.filelist[0].filename assert archived_file == expected_arcname @pytest.mark.parametrize("df_new_type", ["Int64"]) - def test_to_csv_na_rep_long_string(self, df_new_type): + def test_to_csv_na_rep_long_string(self, df_new_type, engine): # see gh-25099 + 
raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) df = DataFrame({"c": [pd.NA] * 3}) df = df.astype(df_new_type) expected_rows = ["c", "mynull", "mynull", "mynull"] expected = tm.convert_rows_list_to_csv_str(expected_rows) - result = df.to_csv(index=False, na_rep="mynull", encoding="ascii") + with raises_if_pyarrow: + result = df.to_csv( + index=False, na_rep="mynull", encoding="ascii", engine=engine + ) - assert expected == result + assert expected == result - def test_to_csv_timedelta_precision(self): + @xfail_pyarrow + def test_to_csv_timedelta_precision(self, engine): # GH 6783 s = pd.Series([1, 1]).astype("timedelta64[ns]") buf = io.StringIO() - s.to_csv(buf) + s.to_csv(buf, engine=engine) result = buf.getvalue() expected_rows = [ ",0", @@ -631,32 +798,39 @@ def test_to_csv_timedelta_precision(self): expected = tm.convert_rows_list_to_csv_str(expected_rows) assert result == expected - def test_na_rep_truncated(self): + def test_na_rep_truncated(self, engine): # https://github.com/pandas-dev/pandas/issues/31447 - result = pd.Series(range(8, 12)).to_csv(na_rep="-") - expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"]) - assert result == expected - - result = pd.Series([True, False]).to_csv(na_rep="nan") - expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) - assert result == expected - - result = pd.Series([1.1, 2.2]).to_csv(na_rep=".") - expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) - assert result == expected + raises_if_pyarrow = check_raises_if_pyarrow("na_rep", engine) + with raises_if_pyarrow: + result = pd.Series(range(8, 12)).to_csv(na_rep="-", engine=engine) + expected = tm.convert_rows_list_to_csv_str( + [",0", "0,8", "1,9", "2,10", "3,11"] + ) + assert result == expected + + with raises_if_pyarrow: + result = pd.Series([True, False]).to_csv(na_rep="nan", engine=engine) + expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"]) + assert result == expected 
+ + with raises_if_pyarrow: + result = pd.Series([1.1, 2.2]).to_csv(na_rep=".", engine=engine) + expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"]) + assert result == expected @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"]) - def test_to_csv_errors(self, errors, temp_file): + def test_to_csv_errors(self, errors, temp_file, engine): # GH 22610 + raises_if_pyarrow = check_raises_if_pyarrow("errors", engine) data = ["\ud800foo"] - ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) - - ser.to_csv(temp_file, errors=errors) + with raises_if_pyarrow: + ser = pd.Series(data, index=Index(data, dtype=object), dtype=object) + ser.to_csv(temp_file, errors=errors, engine=engine) # No use in reading back the data as it is not the same anymore # due to the error handling @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_binary_handle(self, mode, temp_file): + def test_to_csv_binary_handle(self, mode, temp_file, engine): """ Binary file objects should work (if 'mode' contains a 'b') or even without it in most cases. @@ -670,48 +844,71 @@ def test_to_csv_zip_arguments(self, compression, archive_name, temp_file): ) with open(temp_file, mode="w+b") as handle: - df.to_csv(handle, mode=mode) - tm.assert_frame_equal(df, pd.read_csv(temp_file, index_col=0)) + if engine == "pyarrow" and mode == "w": + raises_if_pyarrow = pytest.raises( + ValueError, + match="The pyarrow engine can only open files in binary mode.", + ) + else: + raises_if_pyarrow = contextlib.nullcontext() + with raises_if_pyarrow: + df.to_csv(handle, mode=mode, engine=engine) + if not (engine == "pyarrow" and mode == "w"): + tm.assert_frame_equal(df, pd.read_csv(temp_file, index_col=0)) @pytest.mark.parametrize("mode", ["wb", "w"]) - def test_to_csv_encoding_binary_handle(self, mode, temp_file): + def test_to_csv_encoding_binary_handle(self, mode, temp_file, engine, request): """ Binary file objects should honor a specified encoding. 
GH 23854 and GH 13068 with binary handles """ + + if mode == "w" and engine == "pyarrow": + mark = pytest.mark.xfail( + reason="pyarrow doesn't support non-binary handles." + ) + request.applymarker(mark) + + raises_if_pyarrow = check_raises_if_pyarrow("encoding", engine) # example from GH 23854 content = "a, b, 🐟".encode("utf-8-sig") buffer = io.BytesIO(content) df = pd.read_csv(buffer, encoding="utf-8-sig") buffer = io.BytesIO() - df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False) - buffer.seek(0) # tests whether file handle wasn't closed - assert buffer.getvalue().startswith(content) + with raises_if_pyarrow: + df.to_csv( + buffer, mode=mode, encoding="utf-8-sig", index=False, engine=engine + ) + buffer.seek(0) # tests whether file handle wasn't closed + assert buffer.getvalue().startswith(content) # example from GH 13068 with open(temp_file, "w+b") as handle: - DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig") + with raises_if_pyarrow: + DataFrame().to_csv( + handle, mode=mode, encoding="utf-8-sig", engine=engine + ) - handle.seek(0) - assert handle.read().startswith(b'\xef\xbb\xbf""') + handle.seek(0) + assert handle.read().startswith(b'\xef\xbb\xbf""') -def test_to_csv_iterative_compression_name(compression, temp_file): +def test_to_csv_iterative_compression_name(compression, temp_file, engine): # GH 38714 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), columns=Index(list("ABCD")), index=Index([f"i-{i}" for i in range(30)]), ) - df.to_csv(temp_file, compression=compression, chunksize=1) + df.to_csv(temp_file, compression=compression, chunksize=1, engine=engine) tm.assert_frame_equal( pd.read_csv(temp_file, compression=compression, index_col=0), df ) -def test_to_csv_iterative_compression_buffer(compression): +def test_to_csv_iterative_compression_buffer(compression, engine): # GH 38714 df = DataFrame( 1.1 * np.arange(120).reshape((30, 4)), @@ -719,7 +916,7 @@ def test_to_csv_iterative_compression_buffer(compression): 
index=Index([f"i-{i}" for i in range(30)]), ) with io.BytesIO() as buffer: - df.to_csv(buffer, compression=compression, chunksize=1) + df.to_csv(buffer, compression=compression, chunksize=1, engine=engine) buffer.seek(0) tm.assert_frame_equal( pd.read_csv(buffer, compression=compression, index_col=0), df