Skip to content

Commit 8223a15

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Add a ShardBasedBuilder that creates shards directly.
In certain cases, users have data available in different shards and they want to keep the same number of shards and in each shard the same order of examples (or they don't care about the ordering). In that case, our current dataset builder classes are much slower than necessary. The `ShardBasedBuilder` allows users to create dataset builders that process source data shard by shard. It can be run with or without Beam. In case of Beam, the resulting Beam pipeline is significantly simpler and therefore faster. PiperOrigin-RevId: 678137550
1 parent fc31737 commit 8223a15

File tree

5 files changed

+365
-24
lines changed

5 files changed

+365
-24
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 106 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,14 @@
1919

2020
import abc
2121
import collections
22-
from collections.abc import Sequence
22+
from collections.abc import Iterable, Iterator, Mapping, Sequence
2323
import dataclasses
2424
import functools
2525
import inspect
2626
import json
2727
import os
2828
import sys
29-
from typing import Any, ClassVar, Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
29+
from typing import Any, ClassVar, Dict, List, Optional, Tuple, Type, Union
3030

3131
from absl import logging
3232
from etils import epy
@@ -1445,6 +1445,17 @@ def builder_configs(cls) -> dict[str, BuilderConfig]:
14451445
)
14461446
return config_dict
14471447

1448+
def _get_filename_template(
1449+
self, split_name: str
1450+
) -> naming.ShardedFileTemplate:
1451+
"""Returns a filename template for the given split."""
1452+
return naming.ShardedFileTemplate(
1453+
split=split_name,
1454+
dataset_name=self.name,
1455+
data_dir=self.data_path,
1456+
filetype_suffix=self.info.file_format.file_suffix, # pytype: disable=attribute-error
1457+
)
1458+
14481459

14491460
class FileReaderBuilder(DatasetBuilder):
14501461
"""Base class for datasets reading files.
@@ -1675,17 +1686,6 @@ def _example_writer(self) -> writer_lib.ExampleWriter:
16751686
"""
16761687
return writer_lib.ExampleWriter(file_format=self.info.file_format)
16771688

1678-
def _get_filename_template(
1679-
self, split_name: str
1680-
) -> naming.ShardedFileTemplate:
1681-
"""Returns a filename template for the given split."""
1682-
return naming.ShardedFileTemplate(
1683-
split=split_name,
1684-
dataset_name=self.name,
1685-
data_dir=self.data_path,
1686-
filetype_suffix=self.info.file_format.file_suffix, # pytype: disable=attribute-error
1687-
)
1688-
16891689
def _generate_splits(
16901690
self,
16911691
dl_manager: download.DownloadManager,
@@ -1852,6 +1852,99 @@ def read_tfrecord_beam(
18521852
)
18531853

18541854

1855+
class ShardBasedBuilder(FileReaderBuilder):
1856+
"""Base class for datasets with data generated shard by shard.
1857+
1858+
Like `GeneratorBasedBuilder`, this base class can be used to create datasets.
1859+
However, `ShardBasedBuilder` gives strict control over the number of shards
1860+
and what data ends up in what shard.
1861+
1862+
This is useful for datasets where you want to keep the same ordering as the
1863+
original data source, and/or where you want to keep the same sharding as the
1864+
original data source.
1865+
1866+
You have to implement the `_shard_iterators_per_split` method, which returns
1867+
a mapping from split name to a list of `ExampleGeneratorFn` functions that
1868+
return an example iterator. The signature of the function is `Callable[[],
1869+
Iterator[KeyExample]]` where `KeyExample` is a tuple of (key, example) where
1870+
key is a unique key for the example and example is a dict of features.
1871+
1872+
Note that a `ExampleGeneratorFn` can also be a class that implements a
1873+
`__call__` method that returns a `Iterator[KeyExample]`.
1874+
1875+
Also note that shuffling is not supported. Also, the following fields in
1876+
`DownloadConfig` are not supported:
1877+
- `ignore_duplicates`
1878+
- `max_examples_per_split`
1879+
- `shard_config`
1880+
"""
1881+
1882+
def _download_and_prepare(
1883+
self,
1884+
dl_manager: download.DownloadManager,
1885+
download_config: download.DownloadConfig | None = None,
1886+
) -> None:
1887+
download_config = download_config or download.DownloadConfig()
1888+
1889+
split_builder = split_builder_lib.SplitBuilder(
1890+
split_dict=self.info.splits,
1891+
features=self.info.features,
1892+
dataset_size=self.info.dataset_size,
1893+
beam_options=download_config.beam_options,
1894+
beam_runner=download_config.beam_runner,
1895+
example_writer=self._example_writer(),
1896+
# The following options are ignored by `ShardBasedBuilder`.
1897+
ignore_duplicates=None,
1898+
max_examples_per_split=None,
1899+
shard_config=None,
1900+
)
1901+
1902+
shard_iterators_per_split = self._shard_iterators_per_split(dl_manager)
1903+
split_info_futures = []
1904+
for split_name, example_gen_per_shard in shard_iterators_per_split.items():
1905+
logging.info("Generating split %s", split_name)
1906+
split_info_future = split_builder.submit_shard_based_generation(
1907+
split_name=split_name,
1908+
example_gen_per_shard=example_gen_per_shard,
1909+
filename_template=self._get_filename_template(split_name=split_name),
1910+
)
1911+
split_info_futures.append(split_info_future)
1912+
1913+
# Update the info object with the splits.
1914+
split_infos: list[splits_lib.SplitInfo] = [
1915+
future.result() for future in split_info_futures
1916+
]
1917+
split_dict = splits_lib.SplitDict(split_infos)
1918+
self.info.set_splits(split_dict)
1919+
1920+
@abc.abstractmethod
1921+
@utils.docs.do_not_doc_in_subclasses
1922+
@utils.docs.doc_private
1923+
def _shard_iterators_per_split(
1924+
self, dl_manager: download.DownloadManager
1925+
) -> Mapping[str, Sequence[split_builder_lib.ExampleGeneratorFn]]:
1926+
"""Returns a mapping from split name to example generators per shard.
1927+
1928+
The example generators are functions with signature `Callable[[],
1929+
Iterator[KeyExample]]` that take no parameters and return
1930+
an iterator of tuples of (key, example). The order of the example generators
1931+
is the order in which the shards will be written.
1932+
1933+
Args:
1934+
dl_manager: `tfds.download.DownloadManager` used to download/extract the
1935+
data.
1936+
"""
1937+
raise NotImplementedError()
1938+
1939+
def _example_writer(self) -> writer_lib.ExampleWriter:
1940+
"""Returns an example writer.
1941+
1942+
If datasets should be written to a custom storage, e.g., a database, then
1943+
implement a custom `ExampleWriter` and inject it here.
1944+
"""
1945+
return writer_lib.ExampleWriter(file_format=self.info.file_format)
1946+
1947+
18551948
@utils.docs.deprecated
18561949
class BeamBasedBuilder(GeneratorBasedBuilder):
18571950
"""Beam based Builder.

tensorflow_datasets/core/dataset_builder_beam_test.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16-
"""Tests for tensorflow_datasets.core.dataset_builder."""
17-
16+
import functools
1817
import pathlib
1918
from typing import Callable
2019
from unittest import mock
@@ -102,6 +101,31 @@ def _generate_examples(self, examples, num_examples):
102101
return examples
103102

104103

104+
class ShardBuilderBeam(dataset_builder.ShardBasedBuilder):
105+
VERSION = utils.Version('0.0.1')
106+
107+
def _info(self):
108+
return dataset_info.DatasetInfo(
109+
builder=self,
110+
features=features.FeaturesDict({'x': np.int64}),
111+
)
112+
113+
def _shard_iterators_per_split(self, dl_manager):
114+
del dl_manager
115+
116+
def gen_examples(start: int, end: int):
117+
for i in range(start, end):
118+
yield i, {'x': i}
119+
120+
return {
121+
'train': [
122+
functools.partial(gen_examples, start=0, end=10),
123+
functools.partial(gen_examples, start=10, end=20),
124+
],
125+
'test': [functools.partial(gen_examples, start=100, end=110)],
126+
}
127+
128+
105129
def _gen_example(x):
106130
return (
107131
x,
@@ -198,6 +222,26 @@ def _assert_values_equal(nested_lhs, nested_rhs):
198222
np.testing.assert_array_equal(lhs, rhs)
199223

200224

225+
@pytest.mark.parametrize(
226+
'make_dl_config',
227+
[
228+
make_default_config,
229+
],
230+
)
231+
def test_beam_shard_builder_dataset(
232+
tmp_path: pathlib.Path,
233+
make_dl_config: Callable[[], download.DownloadConfig],
234+
):
235+
builder = ShardBuilderBeam(data_dir=tmp_path, version='0.0.1')
236+
builder.download_and_prepare(
237+
file_format='array_record', download_config=make_dl_config()
238+
)
239+
actual_train_data = list(builder.as_data_source(split='train'))
240+
assert actual_train_data == [{'x': i} for i in range(20)]
241+
actual_test_data = list(builder.as_data_source(split='test'))
242+
assert actual_test_data == [{'x': i} for i in range(100, 110)]
243+
244+
201245
def test_read_tfrecord_beam():
202246
builder = DummyBeamDataset()
203247
with mock.patch.object(

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515

1616
"""Tests for tensorflow_datasets.core.dataset_builder."""
1717

18+
from collections.abc import Iterator, Mapping, Sequence
1819
import dataclasses
20+
import functools
1921
import os
2022
import tempfile
2123
from unittest import mock
@@ -37,9 +39,11 @@
3739
from tensorflow_datasets.core import load
3840
from tensorflow_datasets.core import naming
3941
from tensorflow_datasets.core import read_only_builder
42+
from tensorflow_datasets.core import split_builder
4043
from tensorflow_datasets.core import splits as splits_lib
4144
from tensorflow_datasets.core import utils
4245
from tensorflow_datasets.core.data_sources import array_record
46+
from tensorflow_datasets.core.download import download_manager
4347
from tensorflow_datasets.core.utils import file_utils
4448
from tensorflow_datasets.core.utils import read_config as read_config_lib
4549
from tensorflow_datasets.testing.dummy_config_based_datasets.dummy_ds_1 import dummy_ds_1_dataset_builder
@@ -147,6 +151,50 @@ def _split_generators(self, _):
147151
return {"all": self._generate_examples(range(5))}
148152

149153

154+
class ShardBuilder(dataset_builder.ShardBasedBuilder):
155+
VERSION = utils.Version("0.0.1")
156+
BUILDER_CONFIGS = [DummyBuilderConfig(name="cfg1")]
157+
158+
def _info(self):
159+
return dataset_info.DatasetInfo(
160+
builder=self,
161+
features=features.FeaturesDict({"x": np.int64}),
162+
)
163+
164+
def _shard_iterators_per_split(
165+
self, dl_manager: download_manager.DownloadManager
166+
) -> Mapping[str, Sequence[Iterator[split_builder.KeyExample]]]:
167+
del dl_manager
168+
169+
def gen_examples(
170+
start: int, end: int
171+
) -> Iterator[split_builder.KeyExample]:
172+
for i in range(start, end):
173+
yield i, {"x": i}
174+
175+
return {
176+
# train split has 2 shards
177+
"train": [
178+
functools.partial(gen_examples, start=0, end=10),
179+
functools.partial(gen_examples, start=10, end=20),
180+
],
181+
"test": [functools.partial(gen_examples, start=100, end=110)],
182+
}
183+
184+
185+
class ShardBuilderTest(testing.TestCase):
186+
187+
def test_download_and_prepare(self):
188+
with testing.tmp_dir(self.get_temp_dir()) as tmp_dir:
189+
builder = ShardBuilder(data_dir=tmp_dir, config="cfg1", version="0.0.1")
190+
builder.download_and_prepare(file_format="array_record")
191+
actual_data = list(builder.as_data_source(split="train"))
192+
self.assertEqual(
193+
actual_data,
194+
[{"x": i} for i in range(20)],
195+
)
196+
197+
150198
class GetBuilderDatadirPathTest(testing.TestCase):
151199

152200
def test_builder_data_dir_path_is_correct(self):

0 commit comments

Comments
 (0)