26
26
"""
27
27
28
28
import argparse
29
- from collections .abc import Sequence
29
+ import dataclasses
30
+ import typing
30
31
31
32
from etils import epath
33
+ import simple_parsing
32
34
from tensorflow_datasets .core import file_adapters
33
35
from tensorflow_datasets .scripts .cli import convert_format_utils
34
36
35
37
36
def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
  """Registers all `convert_format` command-line flags on `parser`.

  Args:
    parser: The argument parser (typically the `convert_format` subparser) that
      receives the flags.
  """
  # Each entry pairs a flag name with the keyword arguments forwarded to
  # `parser.add_argument`. The list order is the registration order.
  flag_specs = [
      (
          '--root_data_dir',
          dict(
              type=str,
              help=(
                  'Root data dir that contains all datasets. All datasets and'
                  ' all their configs and versions that are in this folder'
                  ' will be converted.'
              ),
              required=False,
          ),
      ),
      (
          '--dataset_dir',
          dict(
              type=str,
              help=(
                  'Path where the dataset to be converted is located. Converts'
                  ' all configs and versions in this folder.'
              ),
              required=False,
          ),
      ),
      (
          '--dataset_version_dir',
          dict(
              type=str,
              help=(
                  'Path where the dataset to be converted is located. Should'
                  ' include config and version. Can also be a comma-separated'
                  ' list of paths. If multiple paths are specified,'
                  ' `--out_dir` should not be specified, since each dataset'
                  ' will be converted in the same directory as the input'
                  ' dataset.'
              ),
              required=False,
          ),
      ),
      (
          '--out_file_format',
          dict(
              type=str,
              # Only the values of the known file formats are accepted.
              choices=[fmt.value for fmt in file_adapters.FileFormat],
              help='File format to convert the dataset to.',
              required=True,
          ),
      ),
      (
          '--out_dir',
          dict(
              type=str,
              help=(
                  'Path where the converted dataset will be stored. Should'
                  ' include the config and version, e.g.'
                  ' `/data/dataset_name/config/1.2.3`. If not specified, the'
                  ' converted shards will be stored in the same directory as'
                  ' the input dataset.'
              ),
              default='',
              required=False,
          ),
      ),
      (
          '--overwrite',
          dict(
              action='store_true',
              help=(
                  'Whether to overwrite the output directory if it already'
                  ' exists.'
              ),
          ),
      ),
      (
          '--use_beam',
          dict(
              action='store_true',
              help='Use beam to convert the dataset.',
          ),
      ),
      (
          '--num_workers',
          dict(
              type=int,
              default=8,
              help=(
                  'Number of workers to use when not using Beam. If'
                  ' `--use_beam` is set, this flag is ignored. If'
                  ' `--num_workers=1`, the conversion will be done'
                  ' sequentially.'
              ),
          ),
      ),
      (
          '--only_log_errors',
          dict(
              action='store_true',
              default=False,
              help=(
                  'If set, errors during the conversion will be logged as'
                  ' errors and will not crash the conversion. If you are'
                  ' converting a large number of datasets, you might want to'
                  ' set this flag to true.'
              ),
          ),
      ),
  ]
  for flag, options in flag_specs:
    parser.add_argument(flag, **options)
38
@dataclasses.dataclass(frozen=True, kw_only=True)
class Args:
  """CLI arguments for converting datasets from one file format to another.

  Attributes:
    root_data_dir: Root data dir that contains all datasets. All datasets and
      all their configs and versions that are in this folder will be converted.
    dataset_dir: Path where the dataset to be converted is located. Converts all
      configs and versions in this folder.
    dataset_version_dir: Path where the dataset to be converted is located.
      Should include config and version. Can also be a comma-separated list of
      paths. If multiple paths are specified, `--out_dir` should not be
      specified, since each dataset will be converted in the same directory as
      the input dataset.
    out_file_format: File format to convert the dataset to.
    out_dir: Path where the converted dataset will be stored. Datasets will be
      stored with the same folder structure as the input folder. If `None`, the
      converted shards will be stored in the same folder as the input datasets.
    overwrite: Whether to overwrite the output directory if it already exists.
    use_beam: Use beam to convert the dataset.
    num_workers: Number of workers to use when not using Beam. If `--use_beam`
      is set, this flag is ignored. If `--num_workers=1`, the conversion will be
      done sequentially.
    only_log_errors: If set, errors during the conversion will be logged as
      errors and will not crash the conversion. If you are converting a large
      number of datasets, you might want to set this flag to true.
  """
  # NOTE(review): simple_parsing appears to derive the per-flag help text from
  # the docstring above, so its wording is kept as-is -- confirm before editing.

  root_data_dir: epath.Path | None = None
  dataset_dir: epath.Path | None = None
  # A single comma-separated string on the command line is split into one
  # `epath.Path` per entry; when the flag is absent the list is empty.
  dataset_version_dir: list[epath.Path] = simple_parsing.field(
      default_factory=list,
      type=lambda raw_paths: list(map(epath.Path, raw_paths.split(','))),
      nargs='?',
  )
  # No default is given, so a value has to be supplied; it is restricted to the
  # values of `file_adapters.FileFormat`.
  out_file_format: str = simple_parsing.choice(
      *[fmt.value for fmt in file_adapters.FileFormat],
  )
  out_dir: epath.Path | None = None
  overwrite: bool = False
  use_beam: bool = False
  num_workers: int = 8
  only_log_errors: bool = False

  def execute(self) -> None:
    """Runs the conversion described by these arguments."""
    # The CLI flag is phrased as "only log errors"; the conversion helper takes
    # the opposite switch, hence the negation.
    fail_on_error = not self.only_log_errors
    conversion_kwargs = dict(
        out_dir=self.out_dir,
        out_file_format=self.out_file_format,
        dataset_dir=self.dataset_dir,
        root_data_dir=self.root_data_dir,
        dataset_version_dir=self.dataset_version_dir,
        overwrite=self.overwrite,
        use_beam=self.use_beam,
        num_workers=self.num_workers,
        fail_on_error=fail_on_error,
    )
    convert_format_utils.convert_dataset(**conversion_kwargs)
117
97
118
98
119
99
def register_subparser (parsers : argparse ._SubParsersAction ) -> None :
@@ -122,27 +102,6 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
122
102
'convert_format' ,
123
103
help = 'Converts a dataset from one file format to another format.' ,
124
104
)
125
- add_parser_arguments (parser )
126
-
127
- def _parse_dataset_version_dir (
128
- dataset_version_dir : str | None ,
129
- ) -> Sequence [epath .Path ] | None :
130
- if not dataset_version_dir :
131
- return None
132
- return [epath .Path (path ) for path in dataset_version_dir .split (',' )]
133
-
134
- parser .set_defaults (
135
- subparser_fn = lambda args : convert_format_utils .convert_dataset (
136
- out_dir = epath .Path (args .out_dir ) if args .out_dir else None ,
137
- out_file_format = args .out_file_format ,
138
- dataset_dir = args .dataset_dir or None ,
139
- root_data_dir = args .root_data_dir or None ,
140
- dataset_version_dir = _parse_dataset_version_dir (
141
- args .dataset_version_dir
142
- ),
143
- overwrite = args .overwrite ,
144
- use_beam = args .use_beam ,
145
- num_workers = args .num_workers ,
146
- fail_on_error = not args .only_log_errors ,
147
- )
148
- )
105
+ parser = typing .cast (simple_parsing .ArgumentParser , parser )
106
+ parser .add_arguments (Args , dest = 'args' )
107
+ parser .set_defaults (subparser_fn = lambda args : args .args .execute ())
0 commit comments