Fix incompatibility between MultiSplitInfo/SubSplitInfo and EvenSplit

The TensorFlow Datasets Authors · The TensorFlow Datasets Authors · commit 7c3d7a6336db · 2025-02-19T05:27:31.000-08:00
PiperOrigin-RevId: 728628832
diff --git a/tensorflow_datasets/core/splits.py b/tensorflow_datasets/core/splits.py
@@ -536,11 +536,18 @@ def _file_instructions_for_split(
     )
     return []
   to = split_info.num_examples if instruction.to is None else instruction.to
+  if isinstance(split_info, (SubSplitInfo, MultiSplitInfo)):
+    examples_in_shards = [
+        f.examples_in_shard for f in split_info.file_instructions
+    ]
+  else:
+    examples_in_shards = None
   return shard_utils.get_file_instructions(
       from_=instruction.from_ or 0,
       to=to,
       filenames=[os.fspath(fp) for fp in split_info.filepaths],
       shard_lengths=split_info.shard_lengths,
+      examples_in_shards=examples_in_shards,
   )
 
 
diff --git a/tensorflow_datasets/core/splits_test.py b/tensorflow_datasets/core/splits_test.py
@@ -217,6 +217,44 @@ def split_info_for(name: str, shard_lengths, template) -> splits.SplitInfo:
     assert merged.get('test').split_infos == [split_info_a2, split_info_b2]
     assert merged.get('banana').split_infos == [split_info_a3]
 
+  def test_multi_split_sub_split(self):
+    split_info = splits.MultiSplitInfo(
+        name='train',
+        split_infos=[
+            splits.SubSplitInfo(
+                name='train[:2]',
+                file_instructions=[
+                    shard_utils.FileInstruction(
+                        filename='/a/file-00000-of-00001',
+                        skip=0,
+                        take=2,
+                        examples_in_shard=10,
+                    )
+                ],
+            ),
+            splits.SubSplitInfo(
+                name='train[:10]',
+                file_instructions=[
+                    shard_utils.FileInstruction(
+                        filename='/b/file-00000-of-00001',
+                        skip=0,
+                        take=10,
+                        examples_in_shard=20,
+                    )
+                ],
+            ),
+        ],
+    )
+    split_dict = splits.SplitDict([split_info])
+    sub_split = split_dict['train[:2]']
+    self.assertEqual(sub_split.name, 'train[:2]')
+    self.assertLen(sub_split.file_instructions, 1)
+    file_instruction = sub_split.file_instructions[0]
+    self.assertEqual(file_instruction.filename, '/a/file-00000-of-00001')
+    self.assertEqual(file_instruction.skip, 0)
+    self.assertEqual(file_instruction.take, 2)
+    self.assertEqual(file_instruction.examples_in_shard, 10)
+
 
 class SplitsTest(testing.TestCase):
 
diff --git a/tensorflow_datasets/core/utils/shard_utils.py b/tensorflow_datasets/core/utils/shard_utils.py
@@ -216,6 +216,7 @@ def get_file_instructions(
     to: int,
     filenames: Sequence[str],
     shard_lengths: Sequence[int],
+    examples_in_shards: Sequence[int] | None = None,
 ) -> list[FileInstruction]:
   """Returns a list of files (+skip/take) to read [from_:to] items from shards.
 
@@ -225,14 +226,18 @@ def get_file_instructions(
     filenames: list of strings or ints, the filenames of the shards. Not really
       used, but to place in result.
     shard_lengths: the number of elements in every shard.
+    examples_in_shards: the number of examples in every shard, reported as
+      `examples_in_shard` in the result. Defaults to `shard_lengths`.
 
   Returns:
     list of dict(filename, skip, take).
   """
   index_start = 0  # Beginning (included) of moving window.
   index_end = 0  # End (excluded) of moving window.
   file_instructions = []
-  for filename, length in zip(filenames, shard_lengths):
+  for shard_index, (filename, length) in enumerate(
+      zip(filenames, shard_lengths)
+  ):
     if not length:
       continue  # Empty shard - can happen with temporary buckets.
     index_end += length
@@ -241,9 +246,18 @@ def get_file_instructions(
       take = to - index_start - skip if to < index_end else -1
       if take == 0:
         continue
+      if examples_in_shards is not None:
+        examples_in_shard = examples_in_shards[shard_index]
+        if take == -1 and examples_in_shard != length:
+          take = length
+      else:
+        examples_in_shard = length
       file_instructions.append(
           FileInstruction(
-              filename=filename, skip=skip, take=take, examples_in_shard=length
+              filename=filename,
+              skip=skip,
+              take=take,
+              examples_in_shard=examples_in_shard,
           )
       )
     index_start += length