File tree Expand file tree Collapse file tree 3 files changed +23
-1
lines changed Expand file tree Collapse file tree 3 files changed +23
-1
lines changed Original file line number Diff line number Diff line change @@ -108,6 +108,9 @@ class DownloadConfig:
108
108
used.
109
109
ignore_duplicates: whether to ignore duplicated examples with the same key.
110
110
If there are multiple examples with the same key, the first one is kept.
111
+ nondeterministic_order: If True, it will not assure deterministic ordering
112
+ when writing examples to disk in the case of beam datasets. This might
113
+ result in quicker dataset preparation.
111
114
"""
112
115
113
116
extract_dir : epath .PathLike | None = None
@@ -126,6 +129,7 @@ class DownloadConfig:
126
129
min_shard_size : int = shard_utils .DEFAULT_MIN_SHARD_SIZE
127
130
max_shard_size : int = shard_utils .DEFAULT_MAX_SHARD_SIZE
128
131
ignore_duplicates : bool = False
132
+ nondeterministic_order : bool = False
129
133
130
134
def get_shard_config (self ) -> shard_utils .ShardConfig :
131
135
return shard_utils .ShardConfig (
Original file line number Diff line number Diff line change 32
32
33
33
34
34
def register_subparser (parsers : argparse ._SubParsersAction ) -> None : # pylint: disable=protected-access
35
- """Add subparser for `build` command."""
35
+ """Add subparser for `build` command.
36
+
37
+ New flags should be added to `cli_utils` module.
38
+
39
+ Args:
40
+ parsers: The subparsers object to add the parser to.
41
+ """
36
42
build_parser = parsers .add_parser (
37
43
'build' , help = 'Commands for downloading and preparing datasets.'
38
44
)
@@ -357,6 +363,7 @@ def _download_and_prepare(
357
363
skip_if_published = args .skip_if_published ,
358
364
overwrite = args .overwrite ,
359
365
beam_pipeline_options = args .beam_pipeline_options ,
366
+ nondeterministic_order = args .nondeterministic_order ,
360
367
)
361
368
362
369
Original file line number Diff line number Diff line change @@ -261,6 +261,14 @@ def add_generation_argument_group(parser: argparse.ArgumentParser):
261
261
default = 1 ,
262
262
help = 'Number of parallel build processes.' ,
263
263
)
264
# Opt-in flag: `store_true` makes the value default to False and become True
# only when `--nondeterministic_order` is passed. `store_false` would invert
# this, defaulting to True and turning it off when the flag is given.
generation_group.add_argument(
    '--nondeterministic_order',
    action='store_true',
    help=(
        'If True, it will not assure deterministic ordering when writing'
        ' examples to disk. This might result in quicker dataset preparation.'
    ),
)
264
272
265
273
266
274
def add_publish_argument_group (parser : argparse .ArgumentParser ):
@@ -300,6 +308,7 @@ def download_and_prepare(
300
308
skip_if_published : bool ,
301
309
overwrite : bool ,
302
310
beam_pipeline_options : str | None ,
311
+ nondeterministic_order : bool = False ,
303
312
) -> None :
304
313
"""Generate a single builder."""
305
314
dataset = builder .info .full_name
@@ -317,6 +326,8 @@ def download_and_prepare(
317
326
download_config = download .DownloadConfig ()
318
327
if overwrite and not download_config .download_mode .overwrite_dataset :
319
328
download_config .download_mode = download .GenerateMode .REUSE_CACHE_IF_EXISTS
329
+ if nondeterministic_order :
330
+ download_config .nondeterministic_order = True
320
331
321
332
# Add Apache Beam options to download config
322
333
try :
You can’t perform that action at this time.
0 commit comments