Commit 6f96eaa

Add test to verify correct shard computation with overlapping splits
1 parent 132b530 commit 6f96eaa
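
The two split names used in the new test overlap: 'train' is a prefix of 'train-b', so shard files written for 'train-b' can be picked up by a filename pattern that was only meant to match the 'train' shards, inflating the shard lengths computed at finalization. Below is a minimal sketch of that failure mode, assuming TFDS-style shard names of the form {name}-{split}.{suffix}-NNNNN-of-NNNNN; the template and the glob patterns are illustrative assumptions, not the writer's actual matching logic.

import fnmatch

# Hypothetical shard files for both splits, following the assumed
# '{name}-{split}.{suffix}-NNNNN-of-NNNNN' naming scheme.
files = [
    'foo-train-b.tfrecord-00000-of-00001',
    'foo-train.tfrecord-00000-of-00001',
]

# A loose pattern for the 'train' split also catches the 'train-b'
# shard, which would double the example count computed for 'train':
assert fnmatch.filter(files, 'foo-train*') == files

# Anchoring the pattern on the filetype suffix keeps the splits apart:
assert fnmatch.filter(files, 'foo-train.tfrecord*') == [
    'foo-train.tfrecord-00000-of-00001'
]

This is also why the loop in the diff writes 'train-b' before 'train': the overlapping shards are already on disk when the 'train' writer is finalized.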

File tree

1 file changed (+34, -33)


tensorflow_datasets/core/writer_test.py

Lines changed: 34 additions & 33 deletions
@@ -592,39 +592,40 @@ def test_write_beam(self, file_format: file_adapters.FileFormat):
 
     with tempfile.TemporaryDirectory() as tmp_dir:
       tmp_dir = epath.Path(tmp_dir)
-      filename_template = naming.ShardedFileTemplate(
-          dataset_name='foo',
-          split='train',
-          filetype_suffix=file_format.file_suffix,
-          data_dir=tmp_dir,
-      )
-      writer = writer_lib.NoShuffleBeamWriter(
-          serializer=testing.DummySerializer('dummy specs'),
-          filename_template=filename_template,
-          file_format=file_format,
-      )
-      to_write = [(i, str(i).encode('utf-8')) for i in range(10)]
-      # Here we need to disable type check as `beam.Create` is not capable of
-      # inferring the type of the PCollection elements.
-      options = beam.options.pipeline_options.PipelineOptions(
-          pipeline_type_check=False
-      )
-      with beam.Pipeline(options=options, runner=_get_runner()) as pipeline:
-
-        @beam.ptransform_fn
-        def _build_pcollection(pipeline):
-          pcollection = pipeline | 'Start' >> beam.Create(to_write)
-          return writer.write_from_pcollection(pcollection)
-
-        _ = pipeline | 'test' >> _build_pcollection()  # pylint: disable=no-value-for-parameter
-      shard_lengths, total_size = writer.finalize()
-      self.assertNotEmpty(shard_lengths)
-      self.assertEqual(sum(shard_lengths), 10)
-      self.assertGreater(total_size, 10)
-      files = list(tmp_dir.iterdir())
-      self.assertGreaterEqual(len(files), 1)
-      for f in files:
-        self.assertIn(file_format.file_suffix, f.name)
+      for split in ('train-b', 'train'):
+        filename_template = naming.ShardedFileTemplate(
+            dataset_name='foo',
+            split=split,
+            filetype_suffix=file_format.file_suffix,
+            data_dir=tmp_dir,
+        )
+        writer = writer_lib.NoShuffleBeamWriter(
+            serializer=testing.DummySerializer('dummy specs'),
+            filename_template=filename_template,
+            file_format=file_format,
+        )
+        to_write = [(i, str(i).encode('utf-8')) for i in range(10)]
+        # Here we need to disable type check as `beam.Create` is not capable
+        # of inferring the type of the PCollection elements.
+        options = beam.options.pipeline_options.PipelineOptions(
+            pipeline_type_check=False
+        )
+        with beam.Pipeline(options=options, runner=_get_runner()) as pipeline:
+
+          @beam.ptransform_fn
+          def _build_pcollection(pipeline):
+            pcollection = pipeline | 'Start' >> beam.Create(to_write)
+            return writer.write_from_pcollection(pcollection)
+
+          _ = pipeline | 'test' >> _build_pcollection()  # pylint: disable=no-value-for-parameter
+        shard_lengths, total_size = writer.finalize()
+        self.assertNotEmpty(shard_lengths)
+        self.assertEqual(sum(shard_lengths), 10)
+        self.assertGreater(total_size, 10)
+        files = list(tmp_dir.iterdir())
+        self.assertGreaterEqual(len(files), 1)
+        for f in files:
+          self.assertIn(file_format.file_suffix, f.name)
 
 
 class CustomExampleWriter(writer_lib.ExampleWriter):
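
The iteration order in the loop is what makes the test meaningful: 'train-b' is written and finalized first, so its shard already sits in the shared tmp_dir when the 'train' writer runs, and assertEqual(sum(shard_lengths), 10) would see 20 if the writer counted it too. A rough simulation of the state the second iteration sees, under the same assumed filename template as above (prefix matching on '{name}-{split}.' stands in for whatever the writer actually does):

from pathlib import Path
import tempfile

with tempfile.TemporaryDirectory() as d:
  tmp_dir = Path(d)
  # The 'train-b' shard from the first iteration is already present
  # when the second iteration writes the 'train' shard.
  (tmp_dir / 'foo-train-b.tfrecord-00000-of-00001').write_bytes(b'x' * 10)
  (tmp_dir / 'foo-train.tfrecord-00000-of-00001').write_bytes(b'x' * 10)

  # Shard discovery for 'train' must skip the 'train-b' file.
  train_shards = [
      f for f in tmp_dir.iterdir() if f.name.startswith('foo-train.')
  ]
  assert len(train_shards) == 1  # not 2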
class CustomExampleWriter(writer_lib.ExampleWriter):
