
Commit 3e5515f

Author: The TensorFlow Datasets Authors
Add a beam writer that doesn't shuffle
PiperOrigin-RevId: 693459753
1 parent 8ab25ad commit 3e5515f

File tree: 2 files changed, +144 −4 lines

tensorflow_datasets/core/writer.py

Lines changed: 94 additions & 0 deletions
@@ -717,3 +717,97 @@ def finalize(self) -> tuple[list[int], int]:
       split_info_path.unlink()
 
     return self._split_info["shard_lengths"], self._split_info["total_size"]
+
+
+class NoShuffleBeamWriter:
+  """Writes examples from a Beam collection to sharded files, without shuffling."""
+
+  _OUTPUT_TAG_BUCKETS_LEN_SIZE = "tag_buckets_len_size"
+
+  def __init__(
+      self,
+      serializer: example_serializer.Serializer,
+      filename_template: naming.ShardedFileTemplate,
+      file_format: file_adapters.FileFormat,
+  ):
+    """Init NoShuffleBeamWriter.
+
+    Args:
+      serializer: class that can serialize examples.
+      filename_template: template to format sharded filenames.
+      file_format: the file format to use.
+    """
+    self._original_state = dict(
+        serializer=serializer,
+        filename_template=filename_template,
+        file_format=file_format,
+    )
+    self._file_format = file_format
+    self._file_adapter = file_adapters.ADAPTER_FOR_FORMAT[self._file_format]
+    self._filename_template = filename_template
+    self._serializer = serializer
+
+  @functools.lru_cache()
+  def _get_counter(self, name: str, namespace: str = "BeamWriter"):
+    return beam.metrics.Metrics.counter(namespace, name)
+
+  def inc_counter(self, name: str, value: int = 1) -> None:
+    self._get_counter(name).inc(value)
+
+  def __getstate__(self):
+    return self._original_state
+
+  def __setstate__(self, state):
+    self.__init__(**state)
+
+  def _serialize_example(
+      self,
+      key_example: tuple[hashing.HashKey, Example],
+  ) -> bytes:
+    """Returns the serialized example; the key is dropped."""
+    _, example = key_example
+    self.inc_counter(name="serialized_examples")
+    return self._serializer.serialize_example(example)
+
+  def write_from_pcollection(self, examples_pcollection):
+    """Writes the (key, example) PCollection to sharded files."""
+    return (
+        examples_pcollection
+        | "Serialize" >> beam.Map(self._serialize_example)
+        | "Write"
+        >> self._file_adapter.beam_sink(
+            filename_template=self._filename_template
+        )
+    )
+
+  def finalize(self) -> tuple[list[int], int]:
+    """Returns the computed shard_lengths and total_size.
+
+    Returns:
+      A list of length <number of shards> containing the number of examples
+      stored in each shard, and the total size of the files (in bytes).
+    """
+    logging.info("Finalizing writer for %s", self._filename_template.split)
+    # We don't know the number of shards, the length of each shard, nor the
+    # total size, so we compute them here.
+    length_per_shard = {}
+    total_size_bytes = 0
+    prefix = epath.Path(self._filename_template.filepath_prefix())
+    for shard in self._filename_template.data_dir.glob(f"{prefix.name}*"):
+      length = self._file_adapter.num_examples(shard)
+      length_per_shard[shard] = length
+      total_size_bytes += shard.stat().length
+    shard_lengths: list[int] = []
+    for _, length in sorted(length_per_shard.items()):
+      shard_lengths.append(length)
+    logging.info(
+        "Found %d shards with a total size of %d bytes.",
+        len(shard_lengths),
+        total_size_bytes,
+    )
+
+    return shard_lengths, total_size_bytes
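
For orientation, here is a minimal end-to-end sketch of driving the new writer. It mirrors the test added below; the default Beam runner, the DummySerializer, and the /tmp/foo output directory are illustrative assumptions, not part of this commit.

    # Sketch only: mirrors NoShuffleBeamWriterTest below. The runner,
    # serializer, and output directory are illustrative assumptions.
    import apache_beam as beam
    from etils import epath
    from tensorflow_datasets import testing
    from tensorflow_datasets.core import file_adapters
    from tensorflow_datasets.core import naming
    from tensorflow_datasets.core import writer as writer_lib

    file_format = file_adapters.FileFormat.TFRECORD
    filename_template = naming.ShardedFileTemplate(
        dataset_name='foo',
        split='train',
        filetype_suffix=file_format.file_suffix,
        data_dir=epath.Path('/tmp/foo'),  # Illustrative output directory.
    )
    writer = writer_lib.NoShuffleBeamWriter(
        serializer=testing.DummySerializer('dummy specs'),
        filename_template=filename_template,
        file_format=file_format,
    )
    # `beam.Create` cannot infer element types, so type checking is
    # disabled (as in the test below).
    options = beam.options.pipeline_options.PipelineOptions(
        pipeline_type_check=False
    )
    with beam.Pipeline(options=options) as pipeline:
      pcollection = pipeline | 'Start' >> beam.Create(
          [(i, str(i).encode('utf-8')) for i in range(10)]
      )
      writer.write_from_pcollection(pcollection)
    # finalize() globs the written shards to compute their lengths and size.
    shard_lengths, total_size = writer.finalize()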

tensorflow_datasets/core/writer_test.py

Lines changed: 50 additions & 4 deletions
@@ -13,21 +13,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-"""Tests for tensorflow_datasets.core.writer."""
-
 import json
 import os
+import tempfile
 from typing import Optional
 from unittest import mock
 
 from absl.testing import parameterized
+import apache_beam as beam
 from etils import epath
 import tensorflow as tf
 from tensorflow_datasets import testing
 from tensorflow_datasets.core import dataset_utils
 from tensorflow_datasets.core import example_parser
 from tensorflow_datasets.core import file_adapters
-from tensorflow_datasets.core import lazy_imports_lib
 from tensorflow_datasets.core import naming
 from tensorflow_datasets.core import writer as writer_lib
 from tensorflow_datasets.core.utils import shard_utils

@@ -409,6 +408,10 @@ def test_too_small_split(self):
     self._write(to_write=to_write)
 
 
+def _get_runner() -> beam.runners.PipelineRunner:
+  return beam.runners.DirectRunner()
+
+
 class TfrecordsWriterBeamTest(testing.TestCase):
   NUM_SHARDS = 3
   RECORDS_TO_WRITE = [(i, str(i).encode('utf-8')) for i in range(10)]

@@ -455,7 +458,6 @@ def _write(
     shard_config = shard_config or shard_utils.ShardConfig(
         num_shards=self.NUM_SHARDS
     )
-    beam = lazy_imports_lib.lazy_imports.apache_beam
     writer = writer_lib.BeamWriter(
         serializer=testing.DummySerializer('dummy specs'),
         filename_template=filename_template,

@@ -581,6 +583,50 @@ def test_write_tfrecord_sorted_by_key_with_holes(self):
     self.assertEmpty(all_indices)
 
 
+class NoShuffleBeamWriterTest(parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('tfrecord', file_adapters.FileFormat.TFRECORD),
+  )
+  def test_write_beam(self, file_format: file_adapters.FileFormat):
+    with tempfile.TemporaryDirectory() as tmp_dir:
+      tmp_dir = epath.Path(tmp_dir)
+      filename_template = naming.ShardedFileTemplate(
+          dataset_name='foo',
+          split='train',
+          filetype_suffix=file_format.file_suffix,
+          data_dir=tmp_dir,
+      )
+      writer = writer_lib.NoShuffleBeamWriter(
+          serializer=testing.DummySerializer('dummy specs'),
+          filename_template=filename_template,
+          file_format=file_format,
+      )
+      to_write = [(i, str(i).encode('utf-8')) for i in range(10)]
+      # We need to disable type checking, as `beam.Create` is not capable of
+      # inferring the type of the PCollection elements.
+      options = beam.options.pipeline_options.PipelineOptions(
+          pipeline_type_check=False
+      )
+      with beam.Pipeline(options=options, runner=_get_runner()) as pipeline:
+
+        @beam.ptransform_fn
+        def _build_pcollection(pipeline):
+          pcollection = pipeline | 'Start' >> beam.Create(to_write)
+          return writer.write_from_pcollection(pcollection)
+
+        _ = pipeline | 'test' >> _build_pcollection()  # pylint: disable=no-value-for-parameter
+      shard_lengths, total_size = writer.finalize()
+      self.assertNotEmpty(shard_lengths)
+      self.assertEqual(sum(shard_lengths), 10)
+      self.assertGreater(total_size, 10)
+      files = list(tmp_dir.iterdir())
+      self.assertGreaterEqual(len(files), 1)
+      for f in files:
+        self.assertIn(file_format.file_suffix, f.name)
+
+
 class CustomExampleWriter(writer_lib.ExampleWriter):
 
   def __init__(self):
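
As a quick sanity check outside the assertions above, the written shards can be read back and counted with TensorFlow. This is a sketch assuming the TFRECORD format and the illustrative /tmp/foo directory from the earlier example; the glob pattern assumes the default ShardedFileTemplate naming.

    import tensorflow as tf

    # Count records across all shards produced by the sketch above.
    files = tf.io.gfile.glob('/tmp/foo/foo-train.tfrecord*')
    num_records = sum(1 for _ in tf.data.TFRecordDataset(files))
    assert num_records == 10  # One record per example written.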
