Skip to content

Commit fff78da

Browse files
author
The TensorFlow Datasets Authors
committed
Include a nondeterministic_order attribute to download_config, and add a flag to TFDS build CLI to set it accordingly. The new flag/attribute defaults to False.
PiperOrigin-RevId: 691787693
1 parent 22f60a5 commit fff78da

File tree

3 files changed

+23
-1
lines changed

3 files changed

+23
-1
lines changed

tensorflow_datasets/core/download/download_manager.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ class DownloadConfig:
108108
used.
109109
ignore_duplicates: whether to ignore duplicated examples with the same key.
110110
If there are multiple examples with the same key, the first one is kept.
111+
nondeterministic_order: If True, it will not assure deterministic ordering
112+
when writing examples to disk in the case of beam datasets. This might
113+
result in quicker dataset preparation.
111114
"""
112115

113116
extract_dir: epath.PathLike | None = None
@@ -126,6 +129,7 @@ class DownloadConfig:
126129
min_shard_size: int = shard_utils.DEFAULT_MIN_SHARD_SIZE
127130
max_shard_size: int = shard_utils.DEFAULT_MAX_SHARD_SIZE
128131
ignore_duplicates: bool = False
132+
nondeterministic_order: bool = False
129133

130134
def get_shard_config(self) -> shard_utils.ShardConfig:
131135
return shard_utils.ShardConfig(

tensorflow_datasets/scripts/cli/build.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,13 @@
3232

3333

3434
def register_subparser(parsers: argparse._SubParsersAction) -> None: # pylint: disable=protected-access
35-
"""Add subparser for `build` command."""
35+
"""Add subparser for `build` command.
36+
37+
New flags should be added to `cli_utils` module.
38+
39+
Args:
40+
parsers: The subparsers object to add the parser to.
41+
"""
3642
build_parser = parsers.add_parser(
3743
'build', help='Commands for downloading and preparing datasets.'
3844
)
@@ -357,6 +363,7 @@ def _download_and_prepare(
357363
skip_if_published=args.skip_if_published,
358364
overwrite=args.overwrite,
359365
beam_pipeline_options=args.beam_pipeline_options,
366+
nondeterministic_order=args.nondeterministic_order,
360367
)
361368

362369

tensorflow_datasets/scripts/cli/cli_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,14 @@ def add_generation_argument_group(parser: argparse.ArgumentParser):
261261
default=1,
262262
help='Number of parallel build processes.',
263263
)
264+
generation_group.add_argument(
265+
'--nondeterministic_order',
266+
action='store_true',
267+
help=(
268+
'If True, it will not assure deterministic ordering when writing'
269+
' examples to disk. This might result in quicker dataset preparation.'
270+
),
271+
)
264272

265273

266274
def add_publish_argument_group(parser: argparse.ArgumentParser):
@@ -300,6 +308,7 @@ def download_and_prepare(
300308
skip_if_published: bool,
301309
overwrite: bool,
302310
beam_pipeline_options: str | None,
311+
nondeterministic_order: bool = False,
303312
) -> None:
304313
"""Generate a single builder."""
305314
dataset = builder.info.full_name
@@ -317,6 +326,8 @@ def download_and_prepare(
317326
download_config = download.DownloadConfig()
318327
if overwrite and not download_config.download_mode.overwrite_dataset:
319328
download_config.download_mode = download.GenerateMode.REUSE_CACHE_IF_EXISTS
329+
if nondeterministic_order:
330+
download_config.nondeterministic_order = True
320331

321332
# Add Apache Beam options to download config
322333
try:

0 commit comments

Comments
 (0)