Skip to content

Commit fb68321

Browse files
author
The TensorFlow Datasets Authors
committed
Add a beam writer that doesn't shuffle
PiperOrigin-RevId: 691542774
1 parent 1940fa6 commit fb68321

14 files changed

+409
-136
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -623,6 +623,10 @@ def download_and_prepare(
623623
data_path = self.data_path
624624
data_exists = data_path.exists()
625625

626+
# Saving nondeterministic_order in the DatasetInfo for documentation.
627+
if download_config.nondeterministic_order:
628+
self.info.set_nondeterministic_order(True)
629+
626630
if download_config.download_mode == UPDATE_DATASET_INFO:
627631
self._update_dataset_info()
628632
return
@@ -1427,11 +1431,13 @@ def _get_filename_template(
14271431
self, split_name: str
14281432
) -> naming.ShardedFileTemplate:
14291433
"""Returns a filename template for the given split."""
1434+
if self.info.file_format is None:
1435+
raise ValueError("File format is not set!")
14301436
return naming.ShardedFileTemplate(
14311437
split=split_name,
14321438
dataset_name=self.name,
14331439
data_dir=self.data_path,
1434-
filetype_suffix=self.info.file_format.file_suffix, # pytype: disable=attribute-error
1440+
filetype_suffix=self.info.file_format.file_suffix,
14351441
)
14361442

14371443

@@ -1729,6 +1735,7 @@ def _generate_splits(
17291735
generator=generator,
17301736
filename_template=filename_template,
17311737
disable_shuffling=self.info.disable_shuffling,
1738+
nondeterministic_order=download_config.nondeterministic_order,
17321739
)
17331740
split_info_futures.append(future)
17341741

tensorflow_datasets/core/dataset_builder_beam_test.py

Lines changed: 39 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -39,14 +39,16 @@ class DummyBeamDataset(dataset_builder.GeneratorBasedBuilder):
3939
'valid_725': 725,
4040
}
4141

42+
FEATURE_DICT = features.FeaturesDict({
43+
'image': features.Image(shape=(16, 16, 1)),
44+
'label': features.ClassLabel(names=['dog', 'cat']),
45+
'id': tf.int32,
46+
})
47+
4248
def _info(self):
4349
return dataset_info.DatasetInfo(
4450
builder=self,
45-
features=features.FeaturesDict({
46-
'image': features.Image(shape=(16, 16, 1)),
47-
'label': features.ClassLabel(names=['dog', 'cat']),
48-
'id': tf.int32,
49-
}),
51+
features=self.FEATURE_DICT,
5052
supervised_keys=('x', 'x'),
5153
metadata=dataset_info.BeamMetadataDict(),
5254
)
@@ -71,6 +73,18 @@ def _generate_examples(self, num_examples):
7173
return examples
7274

7375

76+
class UnshuffledDummyBeamDataset(DummyBeamDataset):
77+
78+
def _info(self) -> dataset_info.DatasetInfo:
79+
return dataset_info.DatasetInfo(
80+
builder=self,
81+
features=self.FEATURE_DICT,
82+
supervised_keys=('x', 'x'),
83+
metadata=dataset_info.BeamMetadataDict(),
84+
disable_shuffling=True,
85+
)
86+
87+
7488
class CommonPipelineDummyBeamDataset(DummyBeamDataset):
7589
EXPECTED_METADATA = {
7690
'label_sum_1000': 500,
@@ -151,12 +165,21 @@ def _compute_mean(examples):
151165
)
152166

153167

168+
def get_id(ex):
169+
return ex['id']
170+
171+
154172
def make_default_config():
155173
return download.DownloadConfig()
156174

157175

158176
@pytest.mark.parametrize(
159-
'dataset_cls', [DummyBeamDataset, CommonPipelineDummyBeamDataset]
177+
'dataset_cls',
178+
[
179+
DummyBeamDataset,
180+
CommonPipelineDummyBeamDataset,
181+
UnshuffledDummyBeamDataset,
182+
],
160183
)
161184
@pytest.mark.parametrize(
162185
'make_dl_config',
@@ -178,29 +201,23 @@ def test_beam_datasets(
178201
assert data_path.exists() # Dataset has been generated
179202

180203
# Check number of shards/generated files
181-
_test_shards(
182-
data_path,
183-
pattern='%s-test.tfrecord-{:05}-of-{:05}' % dataset_name,
184-
# Liquid sharding is not guaranteed to always use the same number.
185-
num_shards=builder.info.splits['test'].num_shards,
186-
)
187-
_test_shards(
188-
data_path,
189-
pattern='%s-train.tfrecord-{:05}-of-{:05}' % dataset_name,
190-
num_shards=1,
191-
)
204+
for split in ['test', 'train']:
205+
_test_shards(
206+
data_path,
207+
pattern='%s-%s.tfrecord-{:05}-of-{:05}' % (dataset_name, split),
208+
num_shards=builder.info.splits[split].num_shards,
209+
)
192210

193211
ds = dataset_utils.as_numpy(builder.as_dataset())
194212

195-
def get_id(ex):
196-
return ex['id']
197-
213+
test_examples = list(ds['test'])
214+
train_examples = list(ds['train'])
198215
_assert_values_equal(
199-
sorted(list(ds['test']), key=get_id),
216+
sorted(test_examples, key=get_id),
200217
sorted([_gen_example(i)[1] for i in range(725)], key=get_id),
201218
)
202219
_assert_values_equal(
203-
sorted(list(ds['train']), key=get_id),
220+
sorted(train_examples, key=get_id),
204221
sorted([_gen_example(i)[1] for i in range(1000)], key=get_id),
205222
)
206223

tensorflow_datasets/core/dataset_info.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ def __init__(
186186
features: feature_lib.FeatureConnector | None = None,
187187
supervised_keys: SupervisedKeysType | None = None,
188188
disable_shuffling: bool = False,
189+
nondeterministic_order: bool = False,
189190
homepage: str | None = None,
190191
citation: str | None = None,
191192
metadata: Metadata | None = None,
@@ -228,7 +229,11 @@ def __init__(
228229
229230
Note that selecting features in nested `tfds.features.FeaturesDict`
230231
objects is not supported.
231-
disable_shuffling: `bool`, specify whether to shuffle the examples.
232+
disable_shuffling: `bool`, specifies whether to disable shuffling of the examples.
233+
nondeterministic_order: `bool`, if True and the dataset uses beam, it will
234+
use `NoShuffleBeamWriter` which does not ensure deterministic
235+
shuffling when writing examples to disk. This might result in quicker
236+
dataset preparation.
232237
homepage: `str`, optional, the homepage for this dataset.
233238
citation: `str`, optional, the citation to use for this dataset.
234239
metadata: `tfds.core.Metadata`, additional object which will be
@@ -268,6 +273,7 @@ def __init__(
268273
version=str(self._identity.version),
269274
release_notes=self._identity.release_notes,
270275
disable_shuffling=disable_shuffling,
276+
nondeterministic_order=nondeterministic_order,
271277
config_name=self._identity.config_name,
272278
config_description=self._identity.config_description,
273279
config_tags=self._identity.config_tags,
@@ -342,6 +348,7 @@ def from_proto(
342348
features=features,
343349
supervised_keys=supervised_keys,
344350
disable_shuffling=proto.disable_shuffling,
351+
nondeterministic_order=proto.nondeterministic_order,
345352
citation=proto.citation,
346353
license=proto.redistribution_info.license,
347354
split_dict=splits_lib.SplitDict.from_proto(
@@ -400,6 +407,13 @@ def release_notes(self) -> dict[str, str] | None:
400407
def disable_shuffling(self) -> bool:
401408
return self.as_proto.disable_shuffling
402409

410+
@property
411+
def nondeterministic_order(self) -> bool:
412+
return self._info_proto.nondeterministic_order
413+
414+
def set_nondeterministic_order(self, nondeterministic_order: bool) -> None:
415+
self._info_proto.nondeterministic_order = nondeterministic_order
416+
403417
@property
404418
def homepage(self) -> str:
405419
urls = self.as_proto.location.urls
@@ -923,6 +937,7 @@ def __repr__(self):
923937
("features", _indent(repr(self.features))),
924938
("supervised_keys", self.supervised_keys),
925939
("disable_shuffling", self.disable_shuffling),
940+
("nondeterministic_order", self.nondeterministic_order),
926941
("splits", splits),
927942
("citation", _indent(f'"""{self.citation}"""')),
928943
# Proto add a \n that we strip.
@@ -940,6 +955,7 @@ def __getstate__(self):
940955
"features": self.features,
941956
"supervised_keys": self.supervised_keys,
942957
"disable_shuffling": self.disable_shuffling,
958+
"nondeterministic_order": self.nondeterministic_order,
943959
"homepage": self.homepage,
944960
"citation": self.citation,
945961
"metadata": self.metadata,
@@ -956,6 +972,7 @@ def __setstate__(self, state):
956972
features=state["features"],
957973
supervised_keys=state["supervised_keys"],
958974
disable_shuffling=state["disable_shuffling"],
975+
nondeterministic_order=state["nondeterministic_order"],
959976
homepage=state["homepage"],
960977
citation=state["citation"],
961978
metadata=state["metadata"],

tensorflow_datasets/core/dataset_info_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -818,6 +818,7 @@ def test_get_split_info_from_proto_unavailable_format(self):
818818
}),
819819
supervised_keys=('image', 'label'),
820820
disable_shuffling=False,
821+
nondeterministic_order=False,
821822
splits={
822823
'test': <SplitInfo num_examples=20, num_shards=1>,
823824
'train': <SplitInfo num_examples=20, num_shards=1>,

tensorflow_datasets/core/download/download_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ class DownloadConfig:
108108
used.
109109
ignore_duplicates: whether to ignore duplicated examples with the same key.
110110
If there are multiple examples with the same key, the first one is kept.
111+
nondeterministic_order: If True, it will not ensure deterministic ordering
112+
when writing examples to disk in the case of beam datasets. This might
113+
result in quicker dataset preparation.
111114
"""
112115

113116
extract_dir: epath.PathLike | None = None
@@ -126,6 +129,7 @@ class DownloadConfig:
126129
min_shard_size: int = shard_utils.DEFAULT_MIN_SHARD_SIZE
127130
max_shard_size: int = shard_utils.DEFAULT_MAX_SHARD_SIZE
128131
ignore_duplicates: bool = False
132+
nondeterministic_order: bool = False
129133

130134
def get_shard_config(self) -> shard_utils.ShardConfig:
131135
return shard_utils.ShardConfig(

tensorflow_datasets/core/file_adapters.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,14 +26,17 @@
2626
from typing import Any, ClassVar, Type, TypeVar
2727

2828
from etils import epy
29+
from tensorflow_datasets.core.utils.lazy_imports_utils import apache_beam as beam
2930
from tensorflow_datasets.core.utils.lazy_imports_utils import array_record_module
3031
from tensorflow_datasets.core.utils.lazy_imports_utils import parquet as pq
3132
from tensorflow_datasets.core.utils.lazy_imports_utils import pyarrow as pa
3233
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
3334

35+
3436
with epy.lazy_imports():
3537
# pylint: disable=g-import-not-at-top
3638
from etils import epath
39+
from tensorflow_datasets.core import naming
3740
from tensorflow_datasets.core.utils import file_utils
3841
from tensorflow_datasets.core.utils import type_utils
3942

@@ -167,6 +170,23 @@ def deserialize(cls, raw_example: bytes) -> Any:
167170
"""
168171
return tf.train.Example.FromString(raw_example)
169172

173+
@classmethod
174+
def beam_sink(
175+
cls,
176+
filename_template: naming.ShardedFileTemplate,
177+
num_shards: int | None = None,
178+
) -> beam.PTransform:
179+
"""Returns a Beam sink for writing examples in the given file format."""
180+
raise NotImplementedError()
181+
182+
@classmethod
183+
def num_examples(cls, filename: epath.PathLike) -> int:
184+
"""Returns the number of examples in the given file."""
185+
n = 0
186+
for _ in cls.make_tf_data(filename):
187+
n += 1
188+
return n
189+
170190

171191
class TfRecordFileAdapter(FileAdapter):
172192
"""File adapter for TFRecord file format."""
@@ -205,6 +225,20 @@ def write_examples(
205225
writer.write(serialized_example)
206226
writer.flush()
207227

228+
@classmethod
229+
def beam_sink(
230+
cls,
231+
filename_template: naming.ShardedFileTemplate,
232+
num_shards: int | None = None,
233+
) -> beam.PTransform:
234+
"""Returns a Beam sink for writing examples in the given file format."""
235+
file_path_prefix = filename_template.sharded_filepaths_pattern(
236+
num_shards=num_shards, use_at_notation=True
237+
).removesuffix('@*')
238+
return beam.io.WriteToTFRecord(
239+
file_path_prefix=file_path_prefix, num_shards=num_shards
240+
)
241+
208242

209243
class RiegeliFileAdapter(FileAdapter):
210244
"""File adapter for Riegeli file format."""

0 commit comments

Comments
 (0)