Use simple_parsing for the convert_format CLI command.

fineguy · The TensorFlow Datasets Authors · commit 44239d85006b · 2025-08-07T00:59:55.000-07:00
PiperOrigin-RevId: 792034867
diff --git a/tensorflow_datasets/scripts/cli/convert_format.py b/tensorflow_datasets/scripts/cli/convert_format.py
@@ -26,94 +26,74 @@
 """
 
 import argparse
-from collections.abc import Sequence
+import dataclasses
+import typing
 
 from etils import epath
+import simple_parsing
 from tensorflow_datasets.core import file_adapters
 from tensorflow_datasets.scripts.cli import convert_format_utils
 
 
-def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
-  """Add arguments for `convert_format` subparser."""
-  parser.add_argument(
-      '--root_data_dir',
-      type=str,
-      help=(
-          'Root data dir that contains all datasets. All datasets and all their'
-          ' configs and versions that are in this folder will be converted.'
-      ),
-      required=False,
-  )
-  parser.add_argument(
-      '--dataset_dir',
-      type=str,
-      help=(
-          'Path where the dataset to be converted is located. Converts all'
-          ' configs and versions in this folder.'
-      ),
-      required=False,
-  )
-  parser.add_argument(
-      '--dataset_version_dir',
-      type=str,
-      help=(
-          'Path where the dataset to be converted is located. Should include'
-          ' config and version. Can also be a comma-separated list of paths. If'
-          ' multiple paths are specified, `--out_dir` should not be specified,'
-          ' since each dataset will be converted in the same directory as the'
-          ' input dataset.'
-      ),
-      required=False,
-  )
-  parser.add_argument(
-      '--out_file_format',
-      type=str,
-      choices=[file_format.value for file_format in file_adapters.FileFormat],
-      help='File format to convert the dataset to.',
-      required=True,
-  )
-  parser.add_argument(
-      '--out_dir',
-      type=str,
-      help=(
-          'Path where the converted dataset will be stored. Should include the'
-          ' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
-          ' specified, the converted shards will be stored in the same'
-          ' directory as the input dataset.'
-      ),
-      default='',
-      required=False,
-  )
-  parser.add_argument(
-      '--overwrite',
-      action='store_true',
-      help='Whether to overwrite the output directory if it already exists.',
-  )
-  parser.add_argument(
-      '--use_beam',
-      action='store_true',
-      help='Use beam to convert the dataset.',
-  )
-  parser.add_argument(
-      '--num_workers',
-      type=int,
-      default=8,
-      help=(
-          'Number of workers to use when not using Beam. If `--use_beam` is'
-          ' set, this flag is ignored. If `--num_workers=1`, the conversion'
-          ' will be done sequentially.'
-      ),
+@dataclasses.dataclass(frozen=True, kw_only=True)
+class Args:
+  """CLI arguments for converting datasets from one file format to another.
+
+  Attributes:
+    root_data_dir: Root data dir that contains all datasets. All datasets and
+      all their configs and versions that are in this folder will be converted.
+    dataset_dir: Path where the dataset to be converted is located. Converts all
+      configs and versions in this folder.
+    dataset_version_dir: Path where the dataset to be converted is located.
+      Should include config and version. Can also be a comma-separated list of
+      paths. If multiple paths are specified, `--out_dir` should not be
+      specified, since each dataset will be converted in the same directory as
+      the input dataset.
+    out_file_format: File format to convert the dataset to.
+    out_dir: Path where the converted dataset will be stored. Datasets will be
+      stored with the same folder structure as the input folder. If `None`, the
+      converted shards will be stored in the same folder as the input datasets.
+    overwrite: Whether to overwrite the output directory if it already exists.
+    use_beam: Use beam to convert the dataset.
+    num_workers: Number of workers to use when not using Beam. If `--use_beam`
+      is set, this flag is ignored. If `--num_workers=1`, the conversion will be
+      done sequentially.
+    only_log_errors: If set, errors during the conversion will be logged as
+      errors and will not crash the conversion. If you are converting a large
+      number of datasets, you might want to set this flag to true.
+  """
+
+  root_data_dir: epath.Path | None = None
+  dataset_dir: epath.Path | None = None
+  dataset_version_dir: list[epath.Path] = simple_parsing.field(
+      default_factory=list,
+      type=lambda dataset_version_dirs_str: [
+          epath.Path(path) for path in dataset_version_dirs_str.split(',')
+      ],
+      nargs='?',
   )
-  parser.add_argument(
-      '--only_log_errors',
-      action='store_true',
-      default=False,
-      help=(
-          'If set, errors during the conversion will be logged as errors and'
-          ' will not crash the conversion. If you are converting a large number'
-          ' of datasets, you might want to set this flag to true.'
-      ),
+  out_file_format: str = simple_parsing.choice(
+      *(file_format.value for file_format in file_adapters.FileFormat),
   )
+  out_dir: epath.Path | None = None
+  overwrite: bool = False
+  use_beam: bool = False
+  num_workers: int = 8
+  only_log_errors: bool = False
+
+  def execute(self) -> None:
+    """Converts a dataset from one file format to another."""
+    convert_format_utils.convert_dataset(
+        out_dir=self.out_dir,
+        out_file_format=self.out_file_format,
+        dataset_dir=self.dataset_dir,
+        root_data_dir=self.root_data_dir,
+        dataset_version_dir=self.dataset_version_dir,
+        overwrite=self.overwrite,
+        use_beam=self.use_beam,
+        num_workers=self.num_workers,
+        fail_on_error=not self.only_log_errors,
+    )
 
 
 def register_subparser(parsers: argparse._SubParsersAction) -> None:
@@ -122,27 +102,6 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
       'convert_format',
       help='Converts a dataset from one file format to another format.',
   )
-  add_parser_arguments(parser)
-
-  def _parse_dataset_version_dir(
-      dataset_version_dir: str | None,
-  ) -> Sequence[epath.Path] | None:
-    if not dataset_version_dir:
-      return None
-    return [epath.Path(path) for path in dataset_version_dir.split(',')]
-
-  parser.set_defaults(
-      subparser_fn=lambda args: convert_format_utils.convert_dataset(
-          out_dir=epath.Path(args.out_dir) if args.out_dir else None,
-          out_file_format=args.out_file_format,
-          dataset_dir=args.dataset_dir or None,
-          root_data_dir=args.root_data_dir or None,
-          dataset_version_dir=_parse_dataset_version_dir(
-              args.dataset_version_dir
-          ),
-          overwrite=args.overwrite,
-          use_beam=args.use_beam,
-          num_workers=args.num_workers,
-          fail_on_error=not args.only_log_errors,
-      )
-  )
+  parser = typing.cast(simple_parsing.ArgumentParser, parser)
+  parser.add_arguments(Args, dest='args')
+  parser.set_defaults(subparser_fn=lambda args: args.args.execute())