Skip to content

Commit b7d054a

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Add option to specify multiple dataset folders to convert_format
PiperOrigin-RevId: 647245196
1 parent 1fd44fc commit b7d054a

File tree

3 files changed

+64
-17
lines changed

3 files changed

+64
-17
lines changed

tensorflow_datasets/scripts/cli/convert_format.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,17 @@
1818
Example usage:
1919
```
2020
tfds convert_format \
21-
--dataset_dir=/data/dataset/config/1.2.3 \
21+
--dataset_version_dir=/data/dataset/config/1.2.3 \
2222
--out_file_format=array_record \
2323
--out_dir=/data_array_record/dataset/config/1.2.3 \
2424
--use_beam=True
2525
```
2626
"""
2727

2828
import argparse
29-
import pathlib
29+
from collections.abc import Sequence
3030

31+
from etils import epath
3132
from tensorflow_datasets.core import file_adapters
3233
from tensorflow_datasets.scripts.cli import convert_format_utils
3334

@@ -57,7 +58,10 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
5758
type=str,
5859
help=(
5960
'Path where the dataset to be converted is located. Should include'
60-
' config and version.'
61+
' config and version. Can also be a comma-separated list of paths. If'
62+
' multiple paths are specified, `--out_dir` should not be specified,'
63+
' since each dataset will be converted in the same directory as the'
64+
' input dataset.'
6165
),
6266
required=False,
6367
)
@@ -70,14 +74,14 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
7074
)
7175
parser.add_argument(
7276
'--out_dir',
73-
type=pathlib.Path,
77+
type=str,
7478
help=(
7579
'Path where the converted dataset will be stored. Should include the'
7680
' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
7781
' specified, the converted shards will be stored in the same'
7882
' directory as the input dataset.'
7983
),
80-
default=None,
84+
default='',
8185
required=False,
8286
)
8387
parser.add_argument(
@@ -109,13 +113,23 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
109113
help='Converts a dataset from one file format to another format.',
110114
)
111115
add_parser_arguments(parser)
116+
117+
def _parse_dataset_version_dir(
118+
dataset_version_dir: str | None,
119+
) -> Sequence[epath.Path] | None:
120+
if not dataset_version_dir:
121+
return None
122+
return [epath.Path(path) for path in dataset_version_dir.split(',')]
123+
112124
parser.set_defaults(
113125
subparser_fn=lambda args: convert_format_utils.convert_dataset(
114-
out_dir=args.out_dir,
126+
out_dir=args.out_dir if args.out_dir else None,
115127
out_file_format=args.out_file_format,
116128
dataset_dir=args.dataset_dir or None,
117129
root_data_dir=args.root_data_dir or None,
118-
dataset_version_dir=args.dataset_version_dir or None,
130+
dataset_version_dir=_parse_dataset_version_dir(
131+
args.dataset_version_dir
132+
),
119133
overwrite=args.overwrite,
120134
use_beam=args.use_beam,
121135
num_workers=args.num_workers,

tensorflow_datasets/scripts/cli/convert_format_utils.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
r"""Library to convert a dataset from one file format to another."""
1717

18-
from collections.abc import Iterable, Iterator, Mapping
18+
from collections.abc import Iterable, Iterator, Mapping, Sequence
1919
import dataclasses
2020
import functools
2121
import os
@@ -491,7 +491,9 @@ def convert_dataset(
491491
out_file_format: str | file_adapters.FileFormat,
492492
root_data_dir: epath.PathLike | None = None,
493493
dataset_dir: epath.PathLike | None = None,
494-
dataset_version_dir: epath.PathLike | None = None,
494+
dataset_version_dir: (
495+
epath.PathLike | Sequence[epath.PathLike] | None
496+
) = None,
495497
overwrite: bool = False,
496498
use_beam: bool = False,
497499
num_workers: int = 8,
@@ -511,7 +513,10 @@ def convert_dataset(
511513
their own configs and versions.
512514
dataset_dir: folder that contains a single dataset with all its configs and
513515
versions.
514-
dataset_version_dir: folder that contains a single dataset version.
516+
dataset_version_dir: a single or list of folders that each contains a single
517+
dataset version. If multiple folders are specified, `out_dir` should be
518+
`None`, since each dataset will be converted in the same folder as the
519+
input dataset.
515520
overwrite: whether to overwrite folders in `out_dir` if they already exist.
516521
use_beam: whether to use Beam to convert datasets. Useful for big datasets.
517522
num_workers: number of workers to use when not using Beam. If `use_beam` is
@@ -548,9 +553,23 @@ def convert_dataset(
548553
overwrite=overwrite,
549554
)
550555
elif dataset_version_dir:
551-
if out_dir is None:
552-
out_dir = dataset_version_dir
553-
from_to_dirs = {epath.Path(dataset_version_dir): epath.Path(out_dir)}
556+
if isinstance(dataset_version_dir, str):
557+
dataset_version_dir = [dataset_version_dir]
558+
559+
if len(dataset_version_dir) > 1 and out_dir is not None:
560+
raise ValueError(
561+
'If multiple dataset version dirs are specified, `out_dir` must be'
562+
' `None`, since each dataset will be converted in the same folder as'
563+
' the input dataset.'
564+
)
565+
566+
from_to_dirs = {}
567+
for path in dataset_version_dir:
568+
if out_dir is None:
569+
from_to_dirs[epath.Path(path)] = epath.Path(path)
570+
else:
571+
from_to_dirs[epath.Path(path)] = epath.Path(out_dir)
572+
554573
_convert_dataset_dirs(
555574
from_to_dirs=from_to_dirs,
556575
out_file_format=out_file_format,

tensorflow_datasets/scripts/convert_format.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,15 @@
5656
),
5757
default=None,
5858
)
59-
_DATASET_VERSION_DIR = flags.DEFINE_string(
59+
_DATASET_VERSION_DIR = flags.DEFINE_list(
6060
'dataset_version_dir',
6161
required=False,
6262
help=(
6363
'Path where the dataset to be converted is located. Should include'
64-
' config and version.'
64+
' config and version. Can also be a comma-separated list of paths. If'
65+
' multiple paths are specified, `--out_dir` should not be specified,'
66+
' since each dataset will be converted in the same directory as the'
67+
' input dataset.'
6568
),
6669
default=None,
6770
)
@@ -76,10 +79,12 @@
7679

7780
_OUT_DIR = flags.DEFINE_string(
7881
'out_dir',
79-
required=True,
82+
required=False,
8083
help=(
8184
'Path where the converted dataset will be stored. Should include the'
82-
' config and version, e.g. `/data/dataset_name/config/1.2.3`.'
85+
' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
86+
' specified, the converted shards will be stored in the same directory'
87+
' as the input dataset.'
8388
),
8489
default=None,
8590
)
@@ -90,6 +95,13 @@
9095
help='Whether to use beam to convert the dataset.',
9196
)
9297

98+
_NUM_WORKERS = flags.DEFINE_integer(
99+
'num_workers',
100+
default=8,
101+
help='Number of workers to use if `use_beam` is `False`.',
102+
)
103+
104+
93105
_OVERWRITE = flags.DEFINE_bool(
94106
'overwrite',
95107
default=False,
@@ -98,6 +110,7 @@
98110

99111

100112
def main(_):
113+
101114
convert_format_utils.convert_dataset(
102115
root_data_dir=_ROOT_DATA_DIR.value,
103116
dataset_dir=_DATASET_DIR.value,
@@ -106,6 +119,7 @@ def main(_):
106119
out_dir=_OUT_DIR.value,
107120
use_beam=_USE_BEAM.value,
108121
overwrite=_OVERWRITE.value,
122+
num_workers=_NUM_WORKERS.value,
109123
)
110124

111125

0 commit comments

Comments (0)