
Commit a2a83a8

Flatten dataset on the fly in save_to_disk (#5588)
1 parent c4f14de commit a2a83a8
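
In short: `save_to_disk` no longer calls `flatten_indices()` upfront when the dataset carries an indices mapping (e.g. after `select()` or `shuffle()`); each shard now resolves its indices on the fly while it is written. A minimal sketch of the scenario this affects (the toy data and output path below are illustrative, not part of the commit):

```python
from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(1000))})
subset = ds.select(range(0, 1000, 2))  # keeps an indices mapping instead of copying the data

# Before this commit, save_to_disk() first materialized the selection via flatten_indices();
# with this commit, the indices are resolved shard by shard during the write itself.
subset.save_to_disk("even_rows_dataset")
```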

File tree

1 file changed: +6 −14 lines changed


src/datasets/arrow_dataset.py

Lines changed: 6 additions & 14 deletions
@@ -1345,8 +1345,6 @@ def save_to_disk(
         if self.list_indexes():
             raise ValueError("please remove all the indexes using `dataset.drop_index` before saving a dataset")
 
-        dataset = self.flatten_indices(num_proc=num_proc) if self._indices is not None else self
-
         if is_local:
             Path(dataset_path).resolve().mkdir(parents=True, exist_ok=True)
             parent_cache_files_paths = {
@@ -1360,7 +1358,7 @@ def save_to_disk(
 
         # Get json serializable state
         state = {
-            key: dataset.__dict__[key]
+            key: self.__dict__[key]
             for key in [
                 "_fingerprint",
                 "_format_columns",
@@ -1369,7 +1367,7 @@ def save_to_disk(
                 "_output_all_columns",
             ]
         }
-        state["_split"] = str(dataset.split) if dataset.split is not None else dataset.split
+        state["_split"] = str(self.split) if self.split is not None else self.split
         state["_data_files"] = [
             {"filename": f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"} for shard_idx in range(num_shards)
         ]
@@ -1381,20 +1379,20 @@ def save_to_disk(
                     str(e) + f"\nThe format kwargs must be JSON serializable, but key '{k}' isn't."
                 ) from None
         # Get json serializable dataset info
-        dataset_info = asdict(dataset._info)
+        dataset_info = asdict(self._info)
 
         shards_done = 0
         pbar = logging.tqdm(
             disable=not logging.is_progress_bar_enabled(),
             unit=" examples",
-            total=len(dataset),
+            total=len(self),
             leave=False,
             desc=f"Saving the dataset ({shards_done}/{num_shards} shards)",
         )
         kwargs_per_job = (
             {
                 "job_id": shard_idx,
-                "shard": dataset.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
+                "shard": self.shard(num_shards=num_shards, index=shard_idx, contiguous=True),
                 "fpath": path_join(dataset_path, f"data-{shard_idx:05d}-of-{num_shards:05d}.arrow"),
                 "storage_options": storage_options,
             }
@@ -1439,12 +1437,6 @@ def save_to_disk(
     def _save_to_disk_single(job_id: int, shard: "Dataset", fpath: str, storage_options: Optional[dict]):
         batch_size = config.DEFAULT_MAX_BATCH_SIZE
 
-        if shard._indices is not None:
-            raise ValueError(
-                "`_save_to_disk_single` only support shards with flattened indices. "
-                "Please call ds.flatten_indices() before saving to disk."
-            )
-
         num_examples_progress_update = 0
         writer = ArrowWriter(
             features=shard.features,
@@ -1454,7 +1446,7 @@ def _save_to_disk_single(job_id: int, shard: "Dataset", fpath: str, storage_opti
         )
         try:
             _time = time.time()
-            for pa_table in table_iter(shard.data, batch_size=batch_size):
+            for pa_table in shard.with_format("arrow").iter(batch_size):
                 writer.write_table(pa_table)
                 num_examples_progress_update += len(pa_table)
                 if time.time() > _time + config.PBAR_REFRESH_TIME_INTERVAL:
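
For context: the removed `table_iter(shard.data, ...)` call iterated the raw Arrow table and therefore only worked on shards whose indices were already flattened (hence the removed guard above), while `shard.with_format("arrow").iter(batch_size)` goes through the dataset's formatting layer and applies any indices mapping as it batches. A minimal sketch of that iteration pattern (the toy dataset is illustrative only):

```python
from datasets import Dataset

ds = Dataset.from_dict({"x": list(range(10))}).select([9, 3, 5])  # non-trivial indices mapping

for pa_table in ds.with_format("arrow").iter(batch_size=2):
    # Each batch arrives as a pyarrow.Table with the indices mapping already applied.
    print(pa_table.column("x").to_pylist())
```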

0 commit comments
