Introduce a ConvertConfig dataclass to not have to pass around so many parameters

tomvdw · The TensorFlow Datasets Authors · commit 7d88e04c36fa · 2024-10-21T04:55:26.000-07:00
Also introduce an option to not fail the entire conversion pipeline when a single shard has an error.

PiperOrigin-RevId: 688089477
diff --git a/tensorflow_datasets/core/naming.py b/tensorflow_datasets/core/naming.py
@@ -691,6 +691,17 @@ def sharded_filepaths_pattern(
       replacement = '*'
     return _replace_shard_pattern(os.fspath(a_filepath), replacement)
 
+  def glob_pattern(self, num_shards: int | None = None) -> str:
+    """Returns a glob pattern matching the shard files of this template."""
+    if num_shards is not None:
+      # Build shard 0's path, then wildcard its zero-padded index, e.g.
+      # `dataset_name-split.fileformat-*-of-00042`.
+      shard_zero = self.sharded_filepath(shard_index=0, num_shards=num_shards)
+      wildcard_name = re.sub(r'0{5,}-of-', '*-of-', shard_zero.name)
+      return os.fspath(shard_zero.parent / wildcard_name)
+    # Unknown shard count, e.g. `dataset_name-split.fileformat*`.
+    return self.sharded_filepaths_pattern(num_shards=None)
+
   def sharded_filenames(self, num_shards: int) -> list[str]:
     return [path.name for path in self.sharded_filepaths(num_shards=num_shards)]
 
diff --git a/tensorflow_datasets/core/naming_test.py b/tensorflow_datasets/core/naming_test.py
@@ -516,6 +516,19 @@ def test_sharded_file_template_shard_index():
   )
 
 
+def test_glob_pattern():
+  """Checks the glob pattern with and without a known number of shards."""
+  file_template = naming.ShardedFileTemplate(
+      data_dir=epath.Path('/data'),
+      split='train',
+      dataset_name='ds',
+      filetype_suffix='tfrecord',
+  )
+  assert file_template.glob_pattern() == '/data/ds-train.tfrecord*'
+  fixed_count_pattern = file_template.glob_pattern(num_shards=42)
+  assert fixed_count_pattern == '/data/ds-train.tfrecord-*-of-00042'
+
+
 def test_sharded_file_template_sharded_filepath_shard_x_of_y():
   builder_dir = epath.Path('/my/path')
   template_explicit = naming.ShardedFileTemplate(
diff --git a/tensorflow_datasets/core/splits.py b/tensorflow_datasets/core/splits.py
@@ -120,6 +120,38 @@ def __post_init__(self):
     # Normalize bytes
     super().__setattr__('num_bytes', units.Size(self.num_bytes))
 
+  def get_available_shards(
+      self,
+      data_dir: epath.Path | None = None,
+      file_format: file_adapters.FileFormat | None = None,
+      strict_matching: bool = True,
+  ) -> list[epath.Path]:
+    """Lists the shard files of this split that are found by globbing.
+
+    Args:
+      data_dir: Directory to glob in. Defaults to the data directory of the
+        filename template.
+      file_format: File format whose suffix replaces the one in the filename
+        template. Defaults to the file format of the filename template.
+      strict_matching: If True, only match shard files whose name carries this
+        split's number of shards. Otherwise the shard part is wildcarded.
+
+    Raises:
+      ValueError: If the split has no filename template.
+    """
+    template = self.filename_template
+    if not template:
+      raise ValueError(f'Filename template for split {self.name} is empty.')
+    if file_format:
+      template = template.replace(filetype_suffix=file_format.file_suffix)
+    # NOTE(review): both patterns appear to be full paths built from the
+    # template's own data dir; confirm that overriding `data_dir` works.
+    if strict_matching:
+      pattern = template.glob_pattern(num_shards=self.num_shards)
+    else:
+      pattern = template.sharded_filepaths_pattern(num_shards=None)
+    return list((data_dir or template.data_dir).glob(pattern))
+
   @classmethod
   def from_proto(
       cls,
@@ -382,7 +414,7 @@ class Split(str):
   """
 
   def __repr__(self) -> str:
-    return '{}({})'.format(type(self).__name__, super(Split, self).__repr__())  # pytype: disable=wrong-arg-types
+    return f'{type(self).__name__}({super().__repr__()})'  # -> Split('train')
 
 
 Split.TRAIN = Split('train')
@@ -735,7 +767,9 @@ def _str_to_relative_instruction(spec: str) -> AbstractSplit:
   else:  # split='train[x:y]' or split='train[x]'
     slices = [_SLICE_RE.match(x) for x in split_selector.split(':')]
     # Make sure all slices are valid, and at least one is not empty
-    if not all(slices) or not any(x.group(0) for x in slices):  # pytype: disable=attribute-error  # re-none
+    if not all(slices) or not any(
+        x.group(0) for x in slices if x is not None
+    ):  # re-none
       raise ValueError(err_msg)
     if len(slices) == 1:  # split='train[x]'
       (from_match,) = slices
diff --git a/tensorflow_datasets/core/splits_test.py b/tensorflow_datasets/core/splits_test.py
@@ -15,6 +15,8 @@
 
 """Tests for the Split API."""
 
+import os
+from etils import epath
 from tensorflow_datasets import testing
 from tensorflow_datasets.core import naming
 from tensorflow_datasets.core import proto
@@ -669,6 +671,32 @@ def test_file_spec_missing_template(self):
           file_format=tfds.core.file_adapters.FileFormat.TFRECORD
       )
 
+  def test_get_available_shards(self):
+    """Checks strict vs. wildcard shard matching; other splits are ignored."""
+    tmp_dir = epath.Path(self.tmp_dir)
+    train_shard1 = tmp_dir / 'ds-train.tfrecord-00000-of-00002'
+    train_shard1.touch()
+    train_shard_incorrect = tmp_dir / 'ds-train.tfrecord-00000-of-12345'
+    train_shard_incorrect.touch()
+    (tmp_dir / 'ds-test.tfrecord-00000-of-00001').touch()
+    split_info = splits.SplitInfo(
+        name='train',
+        shard_lengths=[1, 2],
+        num_bytes=42,
+        filename_template=_filename_template(
+            split='train', data_dir=os.fspath(tmp_dir), dataset_name='ds'
+        ),
+    )
+    # Glob results have no guaranteed order, so compare ignoring order.
+    self.assertCountEqual(
+        [train_shard1, train_shard_incorrect],
+        split_info.get_available_shards(tmp_dir, strict_matching=False),
+    )
+    self.assertEqual(
+        [train_shard1],
+        split_info.get_available_shards(tmp_dir, strict_matching=True),
+    )
+
 
 if __name__ == '__main__':
   testing.test_main()
diff --git a/tensorflow_datasets/scripts/cli/convert_format_utils.py b/tensorflow_datasets/scripts/cli/convert_format_utils.py
diff --git a/tensorflow_datasets/scripts/cli/convert_format_utils_test.py b/tensorflow_datasets/scripts/cli/convert_format_utils_test.py