Skip to content

Commit 44239d8

Browse files
fineguy and The TensorFlow Datasets Authors
authored and committed
Use simple_parsing for convert_format cli command.
PiperOrigin-RevId: 792034867
1 parent b7007e5 commit 44239d8

File tree

1 file changed

+63
-104
lines changed

1 file changed

+63
-104
lines changed

tensorflow_datasets/scripts/cli/convert_format.py

Lines changed: 63 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -26,94 +26,74 @@
2626
"""
2727

2828
import argparse
29-
from collections.abc import Sequence
29+
import dataclasses
30+
import typing
3031

3132
from etils import epath
33+
import simple_parsing
3234
from tensorflow_datasets.core import file_adapters
3335
from tensorflow_datasets.scripts.cli import convert_format_utils
3436

3537

36-
def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
37-
"""Add arguments for `convert_format` subparser."""
38-
parser.add_argument(
39-
'--root_data_dir',
40-
type=str,
41-
help=(
42-
'Root data dir that contains all datasets. All datasets and all their'
43-
' configs and versions that are in this folder will be converted.'
44-
),
45-
required=False,
46-
)
47-
parser.add_argument(
48-
'--dataset_dir',
49-
type=str,
50-
help=(
51-
'Path where the dataset to be converted is located. Converts all'
52-
' configs and versions in this folder.'
53-
),
54-
required=False,
55-
)
56-
parser.add_argument(
57-
'--dataset_version_dir',
58-
type=str,
59-
help=(
60-
'Path where the dataset to be converted is located. Should include'
61-
' config and version. Can also be a comma-separated list of paths. If'
62-
' multiple paths are specified, `--out_dir` should not be specified,'
63-
' since each dataset will be converted in the same directory as the'
64-
' input dataset.'
65-
),
66-
required=False,
67-
)
68-
parser.add_argument(
69-
'--out_file_format',
70-
type=str,
71-
choices=[file_format.value for file_format in file_adapters.FileFormat],
72-
help='File format to convert the dataset to.',
73-
required=True,
74-
)
75-
parser.add_argument(
76-
'--out_dir',
77-
type=str,
78-
help=(
79-
'Path where the converted dataset will be stored. Should include the'
80-
' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
81-
' specified, the converted shards will be stored in the same'
82-
' directory as the input dataset.'
83-
),
84-
default='',
85-
required=False,
86-
)
87-
parser.add_argument(
88-
'--overwrite',
89-
action='store_true',
90-
help='Whether to overwrite the output directory if it already exists.',
91-
)
92-
parser.add_argument(
93-
'--use_beam',
94-
action='store_true',
95-
help='Use beam to convert the dataset.',
96-
)
97-
parser.add_argument(
98-
'--num_workers',
99-
type=int,
100-
default=8,
101-
help=(
102-
'Number of workers to use when not using Beam. If `--use_beam` is'
103-
' set, this flag is ignored. If `--num_workers=1`, the conversion'
104-
' will be done sequentially.'
105-
),
38+
@dataclasses.dataclass(frozen=True, kw_only=True)
class Args:
  """CLI arguments for converting datasets from one file format to another.

  Attributes:
    root_data_dir: Root data dir that contains all datasets. All datasets and
      all their configs and versions that are in this folder will be converted.
    dataset_dir: Path where the dataset to be converted is located. Converts all
      configs and versions in this folder.
    dataset_version_dir: Path where the dataset to be converted is located.
      Should include config and version. Can also be a comma-separated list of
      paths. If multiple paths are specified, `--out_dir` should not be
      specified, since each dataset will be converted in the same directory as
      the input dataset.
    out_file_format: File format to convert the dataset to.
    out_dir: Path where the converted dataset will be stored. Datasets will be
      stored with the same folder structure as the input folder. If `None`, the
      converted shards will be stored in the same folder as the input datasets.
    overwrite: Whether to overwrite the output directory if it already exists.
    use_beam: Use beam to convert the dataset.
    num_workers: Number of workers to use when not using Beam. If `--use_beam`
      is set, this flag is ignored. If `--num_workers=1`, the conversion will be
      done sequentially.
    only_log_errors: If set, errors during the conversion will be logged as
      errors and will not crash the conversion. If you are converting a large
      number of datasets, you might want to set this flag to true.
  """

  root_data_dir: epath.Path | None = None
  dataset_dir: epath.Path | None = None
  # The `type` callable turns the raw CLI string into a list of paths by
  # splitting on commas; the empty default keeps the flag optional.
  # NOTE(review): `nargs='?'` presumably makes the flag's value itself optional
  # per argparse semantics forwarded by simple_parsing — confirm against
  # simple_parsing's field documentation.
  dataset_version_dir: list[epath.Path] = simple_parsing.field(
      default_factory=list,
      type=lambda dataset_version_dirs_str: [
          epath.Path(path) for path in dataset_version_dirs_str.split(',')
      ],
      nargs='?',
  )
  # Restricts --out_file_format to the value strings of
  # file_adapters.FileFormat; no default, so the argument stays required,
  # matching the previous argparse `required=True` behavior.
  out_file_format: str = simple_parsing.choice(
      *(file_format.value for file_format in file_adapters.FileFormat),
  )
  out_dir: epath.Path | None = None
  overwrite: bool = False
  use_beam: bool = False
  num_workers: int = 8
  only_log_errors: bool = False

  def execute(self) -> None:
    """Converts a dataset from one file format to another."""
    # Delegates all work to the shared conversion helper. Note the polarity
    # flip: the user-facing `--only_log_errors` flag maps to the inverse
    # `fail_on_error` parameter.
    convert_format_utils.convert_dataset(
        out_dir=self.out_dir,
        out_file_format=self.out_file_format,
        dataset_dir=self.dataset_dir,
        root_data_dir=self.root_data_dir,
        dataset_version_dir=self.dataset_version_dir,
        overwrite=self.overwrite,
        use_beam=self.use_beam,
        num_workers=self.num_workers,
        fail_on_error=not self.only_log_errors,
    )
11797

11898

11999
def register_subparser(parsers: argparse._SubParsersAction) -> None:
@@ -122,27 +102,6 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
122102
'convert_format',
123103
help='Converts a dataset from one file format to another format.',
124104
)
125-
add_parser_arguments(parser)
126-
127-
def _parse_dataset_version_dir(
128-
dataset_version_dir: str | None,
129-
) -> Sequence[epath.Path] | None:
130-
if not dataset_version_dir:
131-
return None
132-
return [epath.Path(path) for path in dataset_version_dir.split(',')]
133-
134-
parser.set_defaults(
135-
subparser_fn=lambda args: convert_format_utils.convert_dataset(
136-
out_dir=epath.Path(args.out_dir) if args.out_dir else None,
137-
out_file_format=args.out_file_format,
138-
dataset_dir=args.dataset_dir or None,
139-
root_data_dir=args.root_data_dir or None,
140-
dataset_version_dir=_parse_dataset_version_dir(
141-
args.dataset_version_dir
142-
),
143-
overwrite=args.overwrite,
144-
use_beam=args.use_beam,
145-
num_workers=args.num_workers,
146-
fail_on_error=not args.only_log_errors,
147-
)
148-
)
105+
parser = typing.cast(simple_parsing.ArgumentParser, parser)
106+
parser.add_arguments(Args, dest='args')
107+
parser.set_defaults(subparser_fn=lambda args: args.args.execute())

0 commit comments

Comments (0)