
Commit c5ca1d8

lhoestq and mariosasko authored
Add writer_batch_size for ArrowBasedBuilder (#5565)
* add writer_batch_size to ArrowBasedBuilder

* style

* Update src/datasets/builder.py
  Co-authored-by: Mario Šaško <[email protected]>

---------

Co-authored-by: Mario Šaško <[email protected]>
1 parent 778d4e1 commit c5ca1d8

File tree

1 file changed: +19 additions, -23 deletions


src/datasets/builder.py

Lines changed: 19 additions & 23 deletions
@@ -251,6 +251,11 @@ class DatasetBuilder:
             `os.path.join(data_dir, "**")` as `data_files`.
             For builders that require manual download, it must be the path to the local directory containing the
             manually downloaded data.
+        writer_batch_size (`int`, *optional*):
+            Batch size used by the ArrowWriter.
+            It defines the number of samples that are kept in memory before writing them
+            and also the length of the arrow chunks.
+            None means that the ArrowWriter will use its default value.
         name (`str`): Configuration name for the dataset.

         <Deprecated version="2.3.0">
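For context, a quick sketch of how the new parameter is meant to be used from the loading API. `load_dataset` forwards extra keyword arguments to the builder, so this should apply to packaged Arrow-based builders like csv/json/parquet (the "my_data.csv" file name is a placeholder):

from datasets import load_dataset

# Smaller write batches lower peak memory during dataset preparation,
# at the cost of more (shorter) Arrow chunks in the cached dataset.
ds = load_dataset("csv", data_files="my_data.csv", writer_batch_size=100)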
@@ -276,6 +281,12 @@ class DatasetBuilder:
     # Optional default config name to be used when name is None
     DEFAULT_CONFIG_NAME = None

+    # Default batch size used by the ArrowWriter
+    # It defines the number of samples that are kept in memory before writing them
+    # and also the length of the arrow chunks
+    # None means that the ArrowWriter will use its default value
+    DEFAULT_WRITER_BATCH_SIZE = None
+
     def __init__(
         self,
         cache_dir: Optional[str] = None,
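A subclass can also tune this new class-level default, which `__init__` falls back to when no `writer_batch_size` is passed. A minimal sketch (the `MyImageDataset` builder is hypothetical; small defaults make sense when individual samples are large):

import datasets

class MyImageDataset(datasets.GeneratorBasedBuilder):
    # Large samples (e.g. decoded images) -> keep fewer of them in memory per write.
    DEFAULT_WRITER_BATCH_SIZE = 100

    def _info(self):
        ...

    def _split_generators(self, dl_manager):
        ...

    def _generate_examples(self, **kwargs):
        ...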
@@ -288,6 +299,7 @@ def __init__(
         repo_id: Optional[str] = None,
         data_files: Optional[Union[str, list, dict, DataFilesDict]] = None,
         data_dir: Optional[str] = None,
+        writer_batch_size: Optional[int] = None,
         name="deprecated",
         **config_kwargs,
     ):
@@ -303,6 +315,7 @@ def __init__(
         self.base_path = base_path
         self.use_auth_token = use_auth_token
         self.repo_id = repo_id
+        self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE

         if data_files is not None and not isinstance(data_files, DataFilesDict):
             data_files = DataFilesDict.from_local_or_remote(
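One detail worth noting about the added line: the fallback uses `or` rather than an explicit `is None` check, so any falsy value is replaced by the class default:

# self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
#   writer_batch_size=1000 -> 1000
#   writer_batch_size=None -> DEFAULT_WRITER_BATCH_SIZE
#   writer_batch_size=0    -> DEFAULT_WRITER_BATCH_SIZE too, since 0 is falsy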
@@ -1384,23 +1397,6 @@ class GeneratorBasedBuilder(DatasetBuilder):
     (`_split_generators`). See the method docstrings for details.
     """

-    # GeneratorBasedBuilder should have dummy data for tests by default
-    test_dummy_data = True
-
-    # Default batch size used by the ArrowWriter
-    # It defines the number of samples that are kept in memory before writing them
-    # and also the length of the arrow chunks
-    # None means that the ArrowWriter will use its default value
-    DEFAULT_WRITER_BATCH_SIZE = None
-
-    def __init__(self, *args, writer_batch_size=None, **kwargs):
-        super().__init__(*args, **kwargs)
-        # Batch size used by the ArrowWriter
-        # It defines the number of samples that are kept in memory before writing them
-        # and also the length of the arrow chunks
-        # None means that the ArrowWriter will use its default value
-        self._writer_batch_size = writer_batch_size or self.DEFAULT_WRITER_BATCH_SIZE
-
     @abc.abstractmethod
     def _generate_examples(self, **kwargs):
        """Default function generating examples for each `SplitGenerator`.
@@ -1662,9 +1658,6 @@ def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> E
 class ArrowBasedBuilder(DatasetBuilder):
     """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""

-    # ArrowBasedBuilder should have dummy data for tests by default
-    test_dummy_data = True
-
     @abc.abstractmethod
     def _generate_tables(self, **kwargs):
         """Default function generating examples for each `SplitGenerator`.
@@ -1853,6 +1846,7 @@ def _prepare_split_single(
             writer = writer_class(
                 features=self.info.features,
                 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                writer_batch_size=self._writer_batch_size,
                 storage_options=self._fs.storage_options,
                 embed_local_files=embed_local_files,
             )
@@ -1869,6 +1863,7 @@ def _prepare_split_single(
             writer = writer_class(
                 features=writer._features,
                 path=fpath.replace("SSSSS", f"{shard_id:05d}").replace("JJJJJ", f"{job_id:05d}"),
+                writer_batch_size=self._writer_batch_size,
                 storage_options=self._fs.storage_options,
                 embed_local_files=embed_local_files,
             )
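Since the writer flushes every `writer_batch_size` examples, the setting should also be visible afterwards as the chunk length of the cached Arrow data. A rough way to check, assuming `ds.data` exposes the underlying pyarrow table's `to_batches()` (the file name is a placeholder, and the exact chunk layout can vary at flush/shard boundaries):

from datasets import load_dataset

ds = load_dataset("csv", data_files="my_data.csv", split="train", writer_batch_size=100)
# Most record batches should contain at most 100 rows.
print([len(batch) for batch in ds.data.to_batches()])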
@@ -1907,9 +1902,6 @@ class MissingBeamOptions(ValueError):
 class BeamBasedBuilder(DatasetBuilder):
     """Beam-based Builder."""

-    # BeamBasedBuilder does not have dummy data for tests yet
-    test_dummy_data = False
-
     def __init__(self, *args, beam_runner=None, beam_options=None, **kwargs):
         self._beam_runner = beam_runner
         self._beam_options = beam_options
@@ -1988,6 +1980,10 @@ def _download_and_prepare(self, dl_manager, verification_mode, **prepare_splits_
                 "`DirectRunner` (you may run out of memory). \nExample of usage: "
                 f"\n\t`{usage_example}`"
             )
+        if self._writer_batch_size is not None:
+            logger.warning(
+                "`writer_batch_size` is not supported for beam pipelines yet. Using the default chunk size for writing."
+            )

        # Beam type checking assumes transforms multiple outputs are of same type,
        # which is not our case. Plus it doesn't handle correctly all types, so we
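In other words, a `BeamBasedBuilder` accepts the argument (it reaches `DatasetBuilder.__init__`) but ignores it for now and only logs the warning above. A sketch of the observable behavior ("wikipedia" is used here purely as an example of a beam-based dataset at the time of this commit):

from datasets import load_dataset_builder

builder = load_dataset_builder("wikipedia", "20220301.en", writer_batch_size=100)
# builder.download_and_prepare(...) would log:
#   `writer_batch_size` is not supported for beam pipelines yet. Using the default chunk size for writing.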
