Skip to content

Commit a38292f

Browse files
author
The TensorFlow Datasets Authors
committed
Add a beam writer that doesn't shuffle
PiperOrigin-RevId: 691657172
1 parent fb68321 commit a38292f

14 files changed

+136
-409
lines changed

tensorflow_datasets/core/dataset_builder.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -623,10 +623,6 @@ def download_and_prepare(
623623
data_path = self.data_path
624624
data_exists = data_path.exists()
625625

626-
# Saving nondeterministic_order in the DatasetInfo for documentation.
627-
if download_config.nondeterministic_order:
628-
self.info.set_nondeterministic_order(True)
629-
630626
if download_config.download_mode == UPDATE_DATASET_INFO:
631627
self._update_dataset_info()
632628
return
@@ -1431,13 +1427,11 @@ def _get_filename_template(
14311427
self, split_name: str
14321428
) -> naming.ShardedFileTemplate:
14331429
"""Returns a filename template for the given split."""
1434-
if self.info.file_format is None:
1435-
raise ValueError("File format is not set!")
14361430
return naming.ShardedFileTemplate(
14371431
split=split_name,
14381432
dataset_name=self.name,
14391433
data_dir=self.data_path,
1440-
filetype_suffix=self.info.file_format.file_suffix,
1434+
filetype_suffix=self.info.file_format.file_suffix, # pytype: disable=attribute-error
14411435
)
14421436

14431437

@@ -1735,7 +1729,6 @@ def _generate_splits(
17351729
generator=generator,
17361730
filename_template=filename_template,
17371731
disable_shuffling=self.info.disable_shuffling,
1738-
nondeterministic_order=download_config.nondeterministic_order,
17391732
)
17401733
split_info_futures.append(future)
17411734

tensorflow_datasets/core/dataset_builder_beam_test.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,14 @@ class DummyBeamDataset(dataset_builder.GeneratorBasedBuilder):
3939
'valid_725': 725,
4040
}
4141

42-
FEATURE_DICT = features.FeaturesDict({
43-
'image': features.Image(shape=(16, 16, 1)),
44-
'label': features.ClassLabel(names=['dog', 'cat']),
45-
'id': tf.int32,
46-
})
47-
4842
def _info(self):
4943
return dataset_info.DatasetInfo(
5044
builder=self,
51-
features=self.FEATURE_DICT,
45+
features=features.FeaturesDict({
46+
'image': features.Image(shape=(16, 16, 1)),
47+
'label': features.ClassLabel(names=['dog', 'cat']),
48+
'id': tf.int32,
49+
}),
5250
supervised_keys=('x', 'x'),
5351
metadata=dataset_info.BeamMetadataDict(),
5452
)
@@ -73,18 +71,6 @@ def _generate_examples(self, num_examples):
7371
return examples
7472

7573

76-
class UnshuffledDummyBeamDataset(DummyBeamDataset):
77-
78-
def _info(self) -> dataset_info.DatasetInfo:
79-
return dataset_info.DatasetInfo(
80-
builder=self,
81-
features=self.FEATURE_DICT,
82-
supervised_keys=('x', 'x'),
83-
metadata=dataset_info.BeamMetadataDict(),
84-
disable_shuffling=True,
85-
)
86-
87-
8874
class CommonPipelineDummyBeamDataset(DummyBeamDataset):
8975
EXPECTED_METADATA = {
9076
'label_sum_1000': 500,
@@ -165,21 +151,12 @@ def _compute_mean(examples):
165151
)
166152

167153

168-
def get_id(ex):
169-
return ex['id']
170-
171-
172154
def make_default_config():
173155
return download.DownloadConfig()
174156

175157

176158
@pytest.mark.parametrize(
177-
'dataset_cls',
178-
[
179-
DummyBeamDataset,
180-
CommonPipelineDummyBeamDataset,
181-
UnshuffledDummyBeamDataset,
182-
],
159+
'dataset_cls', [DummyBeamDataset, CommonPipelineDummyBeamDataset]
183160
)
184161
@pytest.mark.parametrize(
185162
'make_dl_config',
@@ -201,23 +178,29 @@ def test_beam_datasets(
201178
assert data_path.exists() # Dataset has been generated
202179

203180
# Check number of shards/generated files
204-
for split in ['test', 'train']:
205-
_test_shards(
206-
data_path,
207-
pattern='%s-%s.tfrecord-{:05}-of-{:05}' % (dataset_name, split),
208-
num_shards=builder.info.splits[split].num_shards,
209-
)
181+
_test_shards(
182+
data_path,
183+
pattern='%s-test.tfrecord-{:05}-of-{:05}' % dataset_name,
184+
# Liquid sharding is not guaranteed to always use the same number.
185+
num_shards=builder.info.splits['test'].num_shards,
186+
)
187+
_test_shards(
188+
data_path,
189+
pattern='%s-train.tfrecord-{:05}-of-{:05}' % dataset_name,
190+
num_shards=1,
191+
)
210192

211193
ds = dataset_utils.as_numpy(builder.as_dataset())
212194

213-
test_examples = list(ds['test'])
214-
train_examples = list(ds['train'])
195+
def get_id(ex):
196+
return ex['id']
197+
215198
_assert_values_equal(
216-
sorted(test_examples, key=get_id),
199+
sorted(list(ds['test']), key=get_id),
217200
sorted([_gen_example(i)[1] for i in range(725)], key=get_id),
218201
)
219202
_assert_values_equal(
220-
sorted(train_examples, key=get_id),
203+
sorted(list(ds['train']), key=get_id),
221204
sorted([_gen_example(i)[1] for i in range(1000)], key=get_id),
222205
)
223206

tensorflow_datasets/core/dataset_info.py

Lines changed: 1 addition & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,6 @@ def __init__(
186186
features: feature_lib.FeatureConnector | None = None,
187187
supervised_keys: SupervisedKeysType | None = None,
188188
disable_shuffling: bool = False,
189-
nondeterministic_order: bool = False,
190189
homepage: str | None = None,
191190
citation: str | None = None,
192191
metadata: Metadata | None = None,
@@ -229,11 +228,7 @@ def __init__(
229228
230229
Note that selecting features in nested `tfds.features.FeaturesDict`
231230
objects is not supported.
232-
disable_shuffling: `bool`, specifies whether to shuffle the examples.
233-
nondeterministic_order: `bool`, if True and the dataset uses beam, it will
234-
use `NoShuffleBeamWriter` which does not assure deterministic
235-
shuffling when writing examples to disk. This might result in quicker
236-
dataset preparation.
231+
disable_shuffling: `bool`, specifies whether to shuffle the examples.
237232
homepage: `str`, optional, the homepage for this dataset.
238233
citation: `str`, optional, the citation to use for this dataset.
239234
metadata: `tfds.core.Metadata`, additional object which will be
@@ -273,7 +268,6 @@ def __init__(
273268
version=str(self._identity.version),
274269
release_notes=self._identity.release_notes,
275270
disable_shuffling=disable_shuffling,
276-
nondeterministic_order=nondeterministic_order,
277271
config_name=self._identity.config_name,
278272
config_description=self._identity.config_description,
279273
config_tags=self._identity.config_tags,
@@ -348,7 +342,6 @@ def from_proto(
348342
features=features,
349343
supervised_keys=supervised_keys,
350344
disable_shuffling=proto.disable_shuffling,
351-
nondeterministic_order=proto.nondeterministic_order,
352345
citation=proto.citation,
353346
license=proto.redistribution_info.license,
354347
split_dict=splits_lib.SplitDict.from_proto(
@@ -407,13 +400,6 @@ def release_notes(self) -> dict[str, str] | None:
407400
def disable_shuffling(self) -> bool:
408401
return self.as_proto.disable_shuffling
409402

410-
@property
411-
def nondeterministic_order(self) -> bool:
412-
return self._info_proto.nondeterministic_order
413-
414-
def set_nondeterministic_order(self, nondeterministic_order: bool) -> None:
415-
self._info_proto.nondeterministic_order = nondeterministic_order
416-
417403
@property
418404
def homepage(self) -> str:
419405
urls = self.as_proto.location.urls
@@ -937,7 +923,6 @@ def __repr__(self):
937923
("features", _indent(repr(self.features))),
938924
("supervised_keys", self.supervised_keys),
939925
("disable_shuffling", self.disable_shuffling),
940-
("nondeterministic_order", self.nondeterministic_order),
941926
("splits", splits),
942927
("citation", _indent(f'"""{self.citation}"""')),
943928
# Proto add a \n that we strip.
@@ -955,7 +940,6 @@ def __getstate__(self):
955940
"features": self.features,
956941
"supervised_keys": self.supervised_keys,
957942
"disable_shuffling": self.disable_shuffling,
958-
"nondeterministic_order": self.nondeterministic_order,
959943
"homepage": self.homepage,
960944
"citation": self.citation,
961945
"metadata": self.metadata,
@@ -972,7 +956,6 @@ def __setstate__(self, state):
972956
features=state["features"],
973957
supervised_keys=state["supervised_keys"],
974958
disable_shuffling=state["disable_shuffling"],
975-
nondeterministic_order=state["nondeterministic_order"],
976959
homepage=state["homepage"],
977960
citation=state["citation"],
978961
metadata=state["metadata"],

tensorflow_datasets/core/dataset_info_test.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,6 @@ def test_get_split_info_from_proto_unavailable_format(self):
818818
}),
819819
supervised_keys=('image', 'label'),
820820
disable_shuffling=False,
821-
nondeterministic_order=False,
822821
splits={
823822
'test': <SplitInfo num_examples=20, num_shards=1>,
824823
'train': <SplitInfo num_examples=20, num_shards=1>,

tensorflow_datasets/core/download/download_manager.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,6 @@ class DownloadConfig:
108108
used.
109109
ignore_duplicates: whether to ignore duplicated examples with the same key.
110110
If there are multiple examples with the same key, the first one is kept.
111-
nondeterministic_order: If True, it will not assure deterministic ordering
112-
when writing examples to disk in the case of beam datasets. This might
113-
result in quicker dataset preparation.
114111
"""
115112

116113
extract_dir: epath.PathLike | None = None
@@ -129,7 +126,6 @@ class DownloadConfig:
129126
min_shard_size: int = shard_utils.DEFAULT_MIN_SHARD_SIZE
130127
max_shard_size: int = shard_utils.DEFAULT_MAX_SHARD_SIZE
131128
ignore_duplicates: bool = False
132-
nondeterministic_order: bool = False
133129

134130
def get_shard_config(self) -> shard_utils.ShardConfig:
135131
return shard_utils.ShardConfig(

tensorflow_datasets/core/file_adapters.py

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,14 @@
2626
from typing import Any, ClassVar, Type, TypeVar
2727

2828
from etils import epy
29-
from tensorflow_datasets.core.utils.lazy_imports_utils import apache_beam as beam
3029
from tensorflow_datasets.core.utils.lazy_imports_utils import array_record_module
3130
from tensorflow_datasets.core.utils.lazy_imports_utils import parquet as pq
3231
from tensorflow_datasets.core.utils.lazy_imports_utils import pyarrow as pa
3332
from tensorflow_datasets.core.utils.lazy_imports_utils import tensorflow as tf
3433

35-
3634
with epy.lazy_imports():
3735
# pylint: disable=g-import-not-at-top
3836
from etils import epath
39-
from tensorflow_datasets.core import naming
4037
from tensorflow_datasets.core.utils import file_utils
4138
from tensorflow_datasets.core.utils import type_utils
4239

@@ -170,23 +167,6 @@ def deserialize(cls, raw_example: bytes) -> Any:
170167
"""
171168
return tf.train.Example.FromString(raw_example)
172169

173-
@classmethod
174-
def beam_sink(
175-
cls,
176-
filename_template: naming.ShardedFileTemplate,
177-
num_shards: int | None = None,
178-
) -> beam.PTransform:
179-
"""Returns a Beam sink for writing examples in the given file format."""
180-
raise NotImplementedError()
181-
182-
@classmethod
183-
def num_examples(cls, filename: epath.PathLike) -> int:
184-
"""Returns the number of examples in the given file."""
185-
n = 0
186-
for _ in cls.make_tf_data(filename):
187-
n += 1
188-
return n
189-
190170

191171
class TfRecordFileAdapter(FileAdapter):
192172
"""File adapter for TFRecord file format."""
@@ -225,20 +205,6 @@ def write_examples(
225205
writer.write(serialized_example)
226206
writer.flush()
227207

228-
@classmethod
229-
def beam_sink(
230-
cls,
231-
filename_template: naming.ShardedFileTemplate,
232-
num_shards: int | None = None,
233-
) -> beam.PTransform:
234-
"""Returns a Beam sink for writing examples in the given file format."""
235-
file_path_prefix = filename_template.sharded_filepaths_pattern(
236-
num_shards=num_shards, use_at_notation=True
237-
).removesuffix('@*')
238-
return beam.io.WriteToTFRecord(
239-
file_path_prefix=file_path_prefix, num_shards=num_shards
240-
)
241-
242208

243209
class RiegeliFileAdapter(FileAdapter):
244210
"""File adapter for Riegeli file format."""

0 commit comments

Comments
 (0)