Skip to content

Commit b7d054a

Browse files
tomvdw and The TensorFlow Datasets Authors
authored and committed
Add option to specify multiple dataset folders to convert_format
PiperOrigin-RevId: 647245196
1 parent 1fd44fc commit b7d054a

File tree

3 files changed

+64
-17
lines changed

3 files changed

+64
-17
lines changed

tensorflow_datasets/scripts/cli/convert_format.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,16 +18,17 @@
1818
Example usage:
1919
```
2020
tfds convert_format \
21-
--dataset_dir=/data/dataset/config/1.2.3 \
21+
--dataset_version_dir=/data/dataset/config/1.2.3 \
2222
--out_file_format=array_record \
2323
--out_dir=/data_array_record/dataset/config/1.2.3 \
2424
--use_beam=True
2525
```
2626
"""
2727

2828
import argparse
29-
import pathlib
29+
from collections.abc import Sequence
3030

31+
from etils import epath
3132
from tensorflow_datasets.core import file_adapters
3233
from tensorflow_datasets.scripts.cli import convert_format_utils
3334

@@ -57,7 +58,10 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
5758
type=str,
5859
help=(
5960
'Path where the dataset to be converted is located. Should include'
60-
' config and version.'
61+
' config and version. Can also be a comma-separated list of paths. If'
62+
' multiple paths are specified, `--out_dir` should not be specified,'
63+
' since each dataset will be converted in the same directory as the'
64+
' input dataset.'
6165
),
6266
required=False,
6367
)
@@ -70,14 +74,14 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
7074
)
7175
parser.add_argument(
7276
'--out_dir',
73-
type=pathlib.Path,
77+
type=str,
7478
help=(
7579
'Path where the converted dataset will be stored. Should include the'
7680
' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
7781
' specified, the converted shards will be stored in the same'
7882
' directory as the input dataset.'
7983
),
80-
default=None,
84+
default='',
8185
required=False,
8286
)
8387
parser.add_argument(
@@ -109,13 +113,23 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
109113
help='Converts a dataset from one file format to another format.',
110114
)
111115
add_parser_arguments(parser)
116+
117+
def _parse_dataset_version_dir(
118+
dataset_version_dir: str | None,
119+
) -> Sequence[epath.Path] | None:
120+
if not dataset_version_dir:
121+
return None
122+
return [epath.Path(path) for path in dataset_version_dir.split(',')]
123+
112124
parser.set_defaults(
113125
subparser_fn=lambda args: convert_format_utils.convert_dataset(
114-
out_dir=args.out_dir,
126+
out_dir=args.out_dir if args.out_dir else None,
115127
out_file_format=args.out_file_format,
116128
dataset_dir=args.dataset_dir or None,
117129
root_data_dir=args.root_data_dir or None,
118-
dataset_version_dir=args.dataset_version_dir or None,
130+
dataset_version_dir=_parse_dataset_version_dir(
131+
args.dataset_version_dir
132+
),
119133
overwrite=args.overwrite,
120134
use_beam=args.use_beam,
121135
num_workers=args.num_workers,

tensorflow_datasets/scripts/cli/convert_format_utils.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
r"""Library to convert a dataset from one file format to another."""
1717

18-
from collections.abc import Iterable, Iterator, Mapping
18+
from collections.abc import Iterable, Iterator, Mapping, Sequence
1919
import dataclasses
2020
import functools
2121
import os
@@ -491,7 +491,9 @@ def convert_dataset(
491491
out_file_format: str | file_adapters.FileFormat,
492492
root_data_dir: epath.PathLike | None = None,
493493
dataset_dir: epath.PathLike | None = None,
494-
dataset_version_dir: epath.PathLike | None = None,
494+
dataset_version_dir: (
495+
epath.PathLike | Sequence[epath.PathLike] | None
496+
) = None,
495497
overwrite: bool = False,
496498
use_beam: bool = False,
497499
num_workers: int = 8,
@@ -511,7 +513,10 @@ def convert_dataset(
511513
their own configs and versions.
512514
dataset_dir: folder that contains a single dataset with all its configs and
513515
versions.
514-
dataset_version_dir: folder that contains a single dataset version.
516+
dataset_version_dir: a single or list of folders that each contains a single
517+
dataset version. If multiple folders are specified, `out_dir` should be
518+
`None`, since each dataset will be converted in the same folder as the
519+
input dataset.
515520
overwrite: whether to overwrite folders in `out_dir` if they already exist.
516521
use_beam: whether to use Beam to convert datasets. Useful for big datasets.
517522
num_workers: number of workers to use when not using Beam. If `use_beam` is
@@ -548,9 +553,23 @@ def convert_dataset(
548553
overwrite=overwrite,
549554
)
550555
elif dataset_version_dir:
551-
if out_dir is None:
552-
out_dir = dataset_version_dir
553-
from_to_dirs = {epath.Path(dataset_version_dir): epath.Path(out_dir)}
556+
if isinstance(dataset_version_dir, str):
557+
dataset_version_dir = [dataset_version_dir]
558+
559+
if len(dataset_version_dir) > 1 and out_dir is not None:
560+
raise ValueError(
561+
'If multiple dataset version dirs are specified, `out_dir` must be'
562+
' `None`, since each dataset will be converted in the same folder as'
563+
' the input dataset.'
564+
)
565+
566+
from_to_dirs = {}
567+
for path in dataset_version_dir:
568+
if out_dir is None:
569+
from_to_dirs[epath.Path(path)] = epath.Path(path)
570+
else:
571+
from_to_dirs[epath.Path(path)] = epath.Path(out_dir)
572+
554573
_convert_dataset_dirs(
555574
from_to_dirs=from_to_dirs,
556575
out_file_format=out_file_format,

tensorflow_datasets/scripts/convert_format.py

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,15 @@
5656
),
5757
default=None,
5858
)
59-
_DATASET_VERSION_DIR = flags.DEFINE_string(
59+
_DATASET_VERSION_DIR = flags.DEFINE_list(
6060
'dataset_version_dir',
6161
required=False,
6262
help=(
6363
'Path where the dataset to be converted is located. Should include'
64-
' config and version.'
64+
' config and version. Can also be a comma-separated list of paths. If'
65+
' multiple paths are specified, `--out_dir` should not be specified,'
66+
' since each dataset will be converted in the same directory as the'
67+
' input dataset.'
6568
),
6669
default=None,
6770
)
@@ -76,10 +79,12 @@
7679

7780
_OUT_DIR = flags.DEFINE_string(
7881
'out_dir',
79-
required=True,
82+
required=False,
8083
help=(
8184
'Path where the converted dataset will be stored. Should include the'
82-
' config and version, e.g. `/data/dataset_name/config/1.2.3`.'
85+
' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
86+
' specified, the converted shards will be stored in the same directory'
87+
' as the input dataset.'
8388
),
8489
default=None,
8590
)
@@ -90,6 +95,13 @@
9095
help='Whether to use beam to convert the dataset.',
9196
)
9297

98+
_NUM_WORKERS = flags.DEFINE_integer(
99+
'num_workers',
100+
default=8,
101+
help='Number of workers to use if `use_beam` is `False`.',
102+
)
103+
104+
93105
_OVERWRITE = flags.DEFINE_bool(
94106
'overwrite',
95107
default=False,
@@ -98,6 +110,7 @@
98110

99111

100112
def main(_):
113+
101114
convert_format_utils.convert_dataset(
102115
root_data_dir=_ROOT_DATA_DIR.value,
103116
dataset_dir=_DATASET_DIR.value,
@@ -106,6 +119,7 @@ def main(_):
106119
out_dir=_OUT_DIR.value,
107120
use_beam=_USE_BEAM.value,
108121
overwrite=_OVERWRITE.value,
122+
num_workers=_NUM_WORKERS.value,
109123
)
110124

111125

0 commit comments

Comments (0)