
Commit c4f14de

Do not write index by default when exporting a dataset (#5583)
1 parent 939b233 commit c4f14de

File tree

8 files changed: +62 −55 lines changed


setup.py

Lines changed: 1 addition & 1 deletion
@@ -170,6 +170,7 @@
     "lz4",
     "py7zr",
     "rarfile>=4.0",
+    "sqlalchemy<2.0.0",
     "s3fs>=2021.11.1;python_version<'3.8'", # aligned with fsspec[http]>=2021.11.1; test only on python 3.7 for now
     "tensorflow>=2.3,!=2.6.0,!=2.6.1; sys_platform != 'darwin' or platform_machine != 'arm64'",
     "tensorflow-macos; sys_platform == 'darwin' and platform_machine == 'arm64'",
@@ -196,7 +197,6 @@
     "scipy",
     "sentencepiece", # for bleurt
     "seqeval",
-    "sqlalchemy<2.0.0",
     "spacy>=3.0.0",
     "tldextract",
     # to speed up pip backtracking

src/datasets/arrow_dataset.py

Lines changed: 23 additions & 24 deletions
@@ -4597,7 +4597,6 @@ def to_csv(
         path_or_buf: Union[PathLike, BinaryIO],
         batch_size: Optional[int] = None,
         num_proc: Optional[int] = None,
-        index: bool = False,
         **to_csv_kwargs,
     ) -> int:
         """Exports the dataset to csv
@@ -4613,20 +4612,18 @@ def to_csv(
                 use multiprocessing. `batch_size` in this case defaults to
                 `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
                 value if you have sufficient compute power.
-            index (`bool`, default `False`): Write row names (index).
+            **to_csv_kwargs (additional keyword arguments):
+                Parameters to pass to pandas's [`pandas.DataFrame.to_csv`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html).

                 <Changed version="2.10.0">

-                Now, `index` defaults to `False`.
+                Now, `index` defaults to `False` if not specified.

-                If you would like to write the index, set it to `True` and also set a name for the index column by
+                If you would like to write the index, pass `index=True` and also set a name for the index column by
                 passing `index_label`.

                 </Changed>

-            **to_csv_kwargs (additional keyword arguments):
-                Parameters to pass to pandas's `pandas.DataFrame.to_csv`.
-
         Returns:
             `int`: The number of characters or bytes written.
@@ -4639,9 +4636,7 @@ def to_csv(
         # Dynamic import to avoid circular dependency
         from .io.csv import CsvDatasetWriter

-        return CsvDatasetWriter(
-            self, path_or_buf, batch_size=batch_size, num_proc=num_proc, index=index, **to_csv_kwargs
-        ).write()
+        return CsvDatasetWriter(self, path_or_buf, batch_size=batch_size, num_proc=num_proc, **to_csv_kwargs).write()

     def to_dict(self, batch_size: Optional[int] = None, batched: bool = False) -> Union[dict, Iterator[dict]]:
         """Returns the dataset as a Python dict. Can also return a generator for large datasets.
@@ -4699,22 +4694,17 @@ def to_json(
                 use multiprocessing. `batch_size` in this case defaults to
                 `datasets.config.DEFAULT_MAX_BATCH_SIZE` but feel free to make it 5x or 10x of the default
                 value if you have sufficient compute power.
-            lines (`bool`, defaults to `True`):
-                Whether output JSON lines format.
-                Only possible if `orient="records"`. It will throw ValueError with `orient` different from
-                `"records"`, since the others are not list-like.
-            orient (`str`, defaults to `"records"`):
-                Format of the JSON:
-
-                - `"records"`: list like `[{column -> value}, … , {column -> value}]`
-                - `"split"`: dict like `{"index" -> [index], "columns" -> [columns], "data" -> [values]}`
-                - `"index"`: dict like `{index -> {column -> value}}`
-                - `"columns"`: dict like `{column -> {index -> value}}`
-                - `"values"`: just the values array
-                - `"table"`: dict like `{"schema": {schema}, "data": {data}}`
             **to_json_kwargs (additional keyword arguments):
                 Parameters to pass to pandas's [`pandas.DataFrame.to_json`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html).

+                <Changed version="2.11.0">
+
+                Now, `index` defaults to `False` if `orient` is `"split"` or `"table"`.
+
+                If you would like to write the index, pass `index=True`.
+
+                </Changed>
+
         Returns:
             `int`: The number of characters or bytes written.
@@ -4817,7 +4807,16 @@ def to_sql(
                 Size of the batch to load in memory and write at once.
                 Defaults to `datasets.config.DEFAULT_MAX_BATCH_SIZE`.
             **sql_writer_kwargs (additional keyword arguments):
-                Parameters to pass to pandas's [`Dataframe.to_sql`].
+                Parameters to pass to pandas's [`pandas.DataFrame.to_sql`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_sql.html).
+
+                <Changed version="2.11.0">
+
+                Now, `index` defaults to `False` if not specified.
+
+                If you would like to write the index, pass `index=True` and also set a name for the index column by
+                passing `index_label`.
+
+                </Changed>

         Returns:
             `int`: The number of records written.
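A minimal sketch of the export behavior these updated docstrings describe; the example dataset, column names, and output paths below are placeholders, not part of the commit:

    from datasets import Dataset

    ds = Dataset.from_dict({"id": [0, 1], "text": ["foo", "bar"]})

    ds.to_csv("out.csv")  # no index column is written (new default)
    ds.to_csv("out.csv", index=True, index_label="idx")  # opt back in and name the index column
    ds.to_json("out.jsonl")  # records/lines output; the 2.11.0 index default applies to orient="split"/"table"
    ds.to_sql("data", "sqlite:///out.sqlite")  # index=False by default; needs sqlalchemy installed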

src/datasets/io/csv.py

Lines changed: 8 additions & 7 deletions
@@ -86,29 +86,30 @@ def __init__(

     def write(self) -> int:
         _ = self.to_csv_kwargs.pop("path_or_buf", None)
+        header = self.to_csv_kwargs.pop("header", True)
         index = self.to_csv_kwargs.pop("index", False)

         if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
             with open(self.path_or_buf, "wb+") as buffer:
-                written = self._write(file_obj=buffer, index=index, **self.to_csv_kwargs)
+                written = self._write(file_obj=buffer, header=header, index=index, **self.to_csv_kwargs)
         else:
-            written = self._write(file_obj=self.path_or_buf, index=index, **self.to_csv_kwargs)
+            written = self._write(file_obj=self.path_or_buf, header=header, index=index, **self.to_csv_kwargs)
         return written

     def _batch_csv(self, args):
-        offset, header, to_csv_kwargs = args
+        offset, header, index, to_csv_kwargs = args

         batch = query_table(
             table=self.dataset.data,
             key=slice(offset, offset + self.batch_size),
             indices=self.dataset._indices,
         )
         csv_str = batch.to_pandas().to_csv(
-            path_or_buf=None, header=header if (offset == 0) else False, **to_csv_kwargs
+            path_or_buf=None, header=header if (offset == 0) else False, index=index, **to_csv_kwargs
         )
         return csv_str.encode(self.encoding)

-    def _write(self, file_obj: BinaryIO, header: bool = True, **to_csv_kwargs) -> int:
+    def _write(self, file_obj: BinaryIO, header, index, **to_csv_kwargs) -> int:
         """Writes the pyarrow table as CSV to a binary file handle.

         Caller is responsible for opening and closing the handle.
@@ -122,7 +123,7 @@ def _write(self, file_obj: BinaryIO, header: bool = True, **to_csv_kwargs) -> int:
                 disable=not logging.is_progress_bar_enabled(),
                 desc="Creating CSV from Arrow format",
             ):
-                csv_str = self._batch_csv((offset, header, to_csv_kwargs))
+                csv_str = self._batch_csv((offset, header, index, to_csv_kwargs))
                 written += file_obj.write(csv_str)

         else:
@@ -131,7 +132,7 @@ def _write(self, file_obj: BinaryIO, header: bool = True, **to_csv_kwargs) -> int:
                 for csv_str in logging.tqdm(
                     pool.imap(
                         self._batch_csv,
-                        [(offset, header, to_csv_kwargs) for offset in range(0, num_rows, batch_size)],
+                        [(offset, header, index, to_csv_kwargs) for offset in range(0, num_rows, batch_size)],
                     ),
                     total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,
                     unit="ba",
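For orientation, this is roughly the per-batch pattern the CSV writer now follows: `write()` pops `header` and `index` once and threads them through every batch, so only the first chunk can emit the header row while the index setting applies uniformly. The snippet below is a standalone sketch with a plain pandas DataFrame standing in for each Arrow batch; the helper name is illustrative only.

    import pandas as pd

    def batch_to_csv_str(df: pd.DataFrame, offset: int, header: bool, index: bool) -> str:
        # Only the first batch (offset == 0) may emit the header row;
        # the index flag is applied to every batch.
        return df.to_csv(path_or_buf=None, header=header if offset == 0 else False, index=index)

    df = pd.DataFrame({"a": [1, 2, 3]})
    print(batch_to_csv_str(df, offset=0, header=True, index=False))  # "a\n1\n2\n3\n"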

src/datasets/io/json.py

Lines changed: 13 additions & 7 deletions
@@ -92,33 +92,38 @@ def __init__(
     def write(self) -> int:
         _ = self.to_json_kwargs.pop("path_or_buf", None)
         orient = self.to_json_kwargs.pop("orient", "records")
-        lines = self.to_json_kwargs.pop("lines", True)
+        lines = self.to_json_kwargs.pop("lines", True if orient == "records" else False)
+        index = self.to_json_kwargs.pop("index", False if orient in ["split", "table"] else True)
         compression = self.to_json_kwargs.pop("compression", None)

         if compression not in [None, "infer", "gzip", "bz2", "xz"]:
             raise NotImplementedError(f"`datasets` currently does not support {compression} compression")

         if isinstance(self.path_or_buf, (str, bytes, os.PathLike)):
             with fsspec.open(self.path_or_buf, "wb", compression=compression) as buffer:
-                written = self._write(file_obj=buffer, orient=orient, lines=lines, **self.to_json_kwargs)
+                written = self._write(file_obj=buffer, orient=orient, lines=lines, index=index, **self.to_json_kwargs)
         else:
             if compression:
                 raise NotImplementedError(
                     f"The compression parameter is not supported when writing to a buffer, but compression={compression}"
                     " was passed. Please provide a local path instead."
                 )
-            written = self._write(file_obj=self.path_or_buf, orient=orient, lines=lines, **self.to_json_kwargs)
+            written = self._write(
+                file_obj=self.path_or_buf, orient=orient, lines=lines, index=index, **self.to_json_kwargs
+            )
         return written

     def _batch_json(self, args):
-        offset, orient, lines, to_json_kwargs = args
+        offset, orient, lines, index, to_json_kwargs = args

         batch = query_table(
             table=self.dataset.data,
             key=slice(offset, offset + self.batch_size),
             indices=self.dataset._indices,
         )
-        json_str = batch.to_pandas().to_json(path_or_buf=None, orient=orient, lines=lines, **to_json_kwargs)
+        json_str = batch.to_pandas().to_json(
+            path_or_buf=None, orient=orient, lines=lines, index=index, **to_json_kwargs
+        )
         if not json_str.endswith("\n"):
             json_str += "\n"
         return json_str.encode(self.encoding)
@@ -128,6 +133,7 @@ def _write(
         file_obj: BinaryIO,
         orient,
         lines,
+        index,
         **to_json_kwargs,
     ) -> int:
         """Writes the pyarrow table as JSON lines to a binary file handle.
@@ -143,15 +149,15 @@ def _write(
                 disable=not logging.is_progress_bar_enabled(),
                 desc="Creating json from Arrow format",
             ):
-                json_str = self._batch_json((offset, orient, lines, to_json_kwargs))
+                json_str = self._batch_json((offset, orient, lines, index, to_json_kwargs))
                 written += file_obj.write(json_str)
         else:
             num_rows, batch_size = len(self.dataset), self.batch_size
             with multiprocessing.Pool(self.num_proc) as pool:
                 for json_str in logging.tqdm(
                     pool.imap(
                         self._batch_json,
-                        [(offset, orient, lines, to_json_kwargs) for offset in range(0, num_rows, batch_size)],
+                        [(offset, orient, lines, index, to_json_kwargs) for offset in range(0, num_rows, batch_size)],
                     ),
                     total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,
                     unit="ba",
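The new defaults here are derived from `orient`: JSON Lines output is only meaningful for `orient="records"`, and pandas accepts `index=False` only for the `"split"` and `"table"` orients, so those are the only orients that drop the index by default. A small standalone sketch of that defaulting logic (the helper name is illustrative, not part of the codebase):

    def json_export_defaults(to_json_kwargs: dict) -> dict:
        # Mirror of the defaulting added in JsonDatasetWriter.write().
        kwargs = dict(to_json_kwargs)
        orient = kwargs.setdefault("orient", "records")
        kwargs.setdefault("lines", orient == "records")  # JSON Lines only for "records"
        kwargs.setdefault("index", orient not in ("split", "table"))  # drop index where pandas allows it
        return kwargs

    print(json_export_defaults({}))                   # {'orient': 'records', 'lines': True, 'index': True}
    print(json_export_defaults({"orient": "split"}))  # {'orient': 'split', 'lines': False, 'index': False}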

src/datasets/io/sql.py

Lines changed: 7 additions & 6 deletions
@@ -77,23 +77,24 @@ def __init__(
     def write(self) -> int:
         _ = self.to_sql_kwargs.pop("sql", None)
         _ = self.to_sql_kwargs.pop("con", None)
+        index = self.to_sql_kwargs.pop("index", False)

-        written = self._write(**self.to_sql_kwargs)
+        written = self._write(index=index, **self.to_sql_kwargs)
         return written

     def _batch_sql(self, args):
-        offset, to_sql_kwargs = args
+        offset, index, to_sql_kwargs = args
         to_sql_kwargs = {**to_sql_kwargs, "if_exists": "append"} if offset > 0 else to_sql_kwargs
         batch = query_table(
             table=self.dataset.data,
             key=slice(offset, offset + self.batch_size),
             indices=self.dataset._indices,
         )
         df = batch.to_pandas()
-        num_rows = df.to_sql(self.name, self.con, **to_sql_kwargs)
+        num_rows = df.to_sql(self.name, self.con, index=index, **to_sql_kwargs)
         return num_rows or len(df)

-    def _write(self, **to_sql_kwargs) -> int:
+    def _write(self, index, **to_sql_kwargs) -> int:
         """Writes the pyarrow table as SQL to a database.

         Caller is responsible for opening and closing the SQL connection.
@@ -107,14 +108,14 @@ def _write(self, **to_sql_kwargs) -> int:
                 disable=not logging.is_progress_bar_enabled(),
                 desc="Creating SQL from Arrow format",
             ):
-                written += self._batch_sql((offset, to_sql_kwargs))
+                written += self._batch_sql((offset, index, to_sql_kwargs))
         else:
             num_rows, batch_size = len(self.dataset), self.batch_size
             with multiprocessing.Pool(self.num_proc) as pool:
                 for num_rows in logging.tqdm(
                     pool.imap(
                         self._batch_sql,
-                        [(offset, to_sql_kwargs) for offset in range(0, num_rows, batch_size)],
+                        [(offset, index, to_sql_kwargs) for offset in range(0, num_rows, batch_size)],
                     ),
                     total=(num_rows // batch_size) + 1 if num_rows % batch_size else num_rows // batch_size,
                     unit="ba",
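The SQL writer keeps the same batching scheme: the first batch uses the caller's `if_exists` value and later batches append, with a single `index` flag shared by all batches. A rough standalone sketch of that pattern, using an in-memory SQLite connection as a stand-in for the real target database:

    import sqlite3

    import pandas as pd

    con = sqlite3.connect(":memory:")
    batches = [pd.DataFrame({"a": [1, 2]}), pd.DataFrame({"a": [3, 4]})]

    for batch_num, df in enumerate(batches):
        # First batch uses the caller's if_exists (here the pandas default "fail",
        # which creates the table); later batches always append.
        extra = {"if_exists": "append"} if batch_num > 0 else {}
        df.to_sql("data", con, index=False, **extra)  # index=False is the new default

    print(pd.read_sql("SELECT * FROM data", con))  # four rows, no index column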

tests/io/test_json.py

Lines changed: 2 additions & 2 deletions
@@ -188,7 +188,7 @@ def test_dataset_to_json_lines(self, lines, load_json_function, dataset):
         "orient, container, keys, len_at",
         [
             ("records", list, {"tokens", "labels", "answers", "id"}, None),
-            ("split", dict, {"index", "columns", "data"}, "data"),
+            ("split", dict, {"columns", "data"}, "data"),
             ("index", dict, set("0123456789"), None),
             ("columns", dict, {"tokens", "labels", "answers", "id"}, "tokens"),
             ("values", list, None, None),
@@ -227,7 +227,7 @@ def test_dataset_to_json_lines_multiproc(self, lines, load_json_function, dataset):
         "orient, container, keys, len_at",
         [
             ("records", list, {"tokens", "labels", "answers", "id"}, None),
-            ("split", dict, {"index", "columns", "data"}, "data"),
+            ("split", dict, {"columns", "data"}, "data"),
             ("index", dict, set("0123456789"), None),
             ("columns", dict, {"tokens", "labels", "answers", "id"}, "tokens"),
             ("values", list, None, None),

tests/io/test_sql.py

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@ def test_dataset_to_sql(sqlite_path, tmp_path):
     cache_dir = tmp_path / "cache"
     output_sqlite_path = os.path.join(cache_dir, "tmp.sql")
     dataset = SqlDatasetReader("dataset", "sqlite:///" + sqlite_path, cache_dir=cache_dir).read()
-    SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, index=False, num_proc=1).write()
+    SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, num_proc=1).write()

     original_sql = iter_sql_file(sqlite_path)
     expected_sql = iter_sql_file(output_sqlite_path)
@@ -80,7 +80,7 @@ def test_dataset_to_sql_multiproc(sqlite_path, tmp_path):
     cache_dir = tmp_path / "cache"
     output_sqlite_path = os.path.join(cache_dir, "tmp.sql")
     dataset = SqlDatasetReader("dataset", "sqlite:///" + sqlite_path, cache_dir=cache_dir).read()
-    SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, index=False, num_proc=2).write()
+    SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, num_proc=2).write()

     original_sql = iter_sql_file(sqlite_path)
     expected_sql = iter_sql_file(output_sqlite_path)
@@ -95,4 +95,4 @@ def test_dataset_to_sql_invalidproc(sqlite_path, tmp_path):
     output_sqlite_path = os.path.join(cache_dir, "tmp.sql")
     dataset = SqlDatasetReader("dataset", "sqlite:///" + sqlite_path, cache_dir=cache_dir).read()
     with pytest.raises(ValueError):
-        SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, index=False, num_proc=0).write()
+        SqlDatasetWriter(dataset, "dataset", "sqlite:///" + output_sqlite_path, num_proc=0).write()

tests/test_arrow_dataset.py

Lines changed: 5 additions & 5 deletions
@@ -2259,7 +2259,7 @@ def test_to_sql(self, in_memory):
            # Destination specified as database URI string
            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:
                file_path = os.path.join(tmp_dir, "test_path.sqlite")
-                _ = dset.to_sql("data", "sqlite:///" + file_path, index=False)
+                _ = dset.to_sql("data", "sqlite:///" + file_path)

                self.assertTrue(os.path.isfile(file_path))
                sql_dset = pd.read_sql("data", "sqlite:///" + file_path)
@@ -2273,7 +2273,7 @@ def test_to_sql(self, in_memory):

                file_path = os.path.join(tmp_dir, "test_path.sqlite")
                with contextlib.closing(sqlite3.connect(file_path)) as con:
-                    _ = dset.to_sql("data", con, index=False, if_exists="replace")
+                    _ = dset.to_sql("data", con, if_exists="replace")

                self.assertTrue(os.path.isfile(file_path))
                sql_dset = pd.read_sql("data", "sqlite:///" + file_path)
@@ -2284,7 +2284,7 @@ def test_to_sql(self, in_memory):
            # Test writing to a database in chunks
            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:
                file_path = os.path.join(tmp_dir, "test_path.sqlite")
-                _ = dset.to_sql("data", "sqlite:///" + file_path, batch_size=1, index=False, if_exists="replace")
+                _ = dset.to_sql("data", "sqlite:///" + file_path, batch_size=1, if_exists="replace")

                self.assertTrue(os.path.isfile(file_path))
                sql_dset = pd.read_sql("data", "sqlite:///" + file_path)
@@ -2296,7 +2296,7 @@ def test_to_sql(self, in_memory):
            with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:
                dset = dset.select(range(0, len(dset), 2)).shuffle()
                file_path = os.path.join(tmp_dir, "test_path.sqlite")
-                _ = dset.to_sql("data", "sqlite:///" + file_path, index=False, if_exists="replace")
+                _ = dset.to_sql("data", "sqlite:///" + file_path, if_exists="replace")

                self.assertTrue(os.path.isfile(file_path))
                sql_dset = pd.read_sql("data", "sqlite:///" + file_path)
@@ -2307,7 +2307,7 @@ def test_to_sql(self, in_memory):
            # With array features
            with self._create_dummy_dataset(in_memory, tmp_dir, array_features=True) as dset:
                file_path = os.path.join(tmp_dir, "test_path.sqlite")
-                _ = dset.to_sql("data", "sqlite:///" + file_path, index=False, if_exists="replace")
+                _ = dset.to_sql("data", "sqlite:///" + file_path, if_exists="replace")

                self.assertTrue(os.path.isfile(file_path))
                sql_dset = pd.read_sql("data", "sqlite:///" + file_path)
