Skip to content

Commit df39eff

Browse files
fineguy, The TensorFlow Datasets Authors
authored and committed
Fix prepare_croissant command.
PiperOrigin-RevId: 626446760
1 parent ecb8e45 commit df39eff

File tree

2 files changed

+29
-30
lines changed

2 files changed

+29
-30
lines changed

tensorflow_datasets/scripts/cli/croissant.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@
1919
```
2020
tfds build_croissant \
2121
--jsonld=/tmp/croissant.json \
22-
--record_sets=record1 --record_sets=record2
23-
--file_format=array_record
24-
--out_dir=/tmp/foo
22+
--out_dir=/tmp/foo \
23+
--file_format=array_record \
24+
--record_sets=record1 --record_sets=record2 \
2525
--mapping='{"document.csv": "~/Downloads/document.csv"}'
2626
```
2727
"""
@@ -43,12 +43,18 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
4343
help='The Croissant config file for the given dataset.',
4444
required=True,
4545
)
46+
parser.add_argument(
47+
'--out_dir',
48+
type=epath.Path,
49+
help='Path where the converted dataset will be stored.',
50+
required=True,
51+
)
4652
parser.add_argument(
4753
'--file_format',
54+
default=file_adapters.FileFormat.ARRAY_RECORD.value,
4855
type=str,
4956
choices=[file_format.value for file_format in file_adapters.FileFormat],
5057
help='File format to convert the dataset to.',
51-
required=True,
5258
)
5359
parser.add_argument(
5460
'--record_sets',
@@ -59,12 +65,6 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
5965
' the record sets'
6066
),
6167
)
62-
parser.add_argument(
63-
'--out_dir',
64-
type=epath.Path,
65-
help='Path where the converted dataset will be stored.',
66-
required=True,
67-
)
6868
parser.add_argument(
6969
'--mapping',
7070
type=str,
@@ -87,30 +87,30 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
8787
parser.set_defaults(
8888
subparser_fn=lambda args: prepare_croissant_builder(
8989
jsonld=args.jsonld,
90-
record_sets=args.record_sets,
91-
out_file_format=args.file_format,
9290
out_dir=args.out_dir,
91+
out_file_format=args.file_format,
92+
record_sets=args.record_sets,
9393
mapping=args.mapping,
9494
)
9595
)
9696

9797

9898
def prepare_croissant_builder(
9999
jsonld: epath.PathLike,
100-
record_sets: Sequence[str],
101-
out_file_format: str,
102100
out_dir: epath.PathLike,
101+
out_file_format: str,
102+
record_sets: Sequence[str],
103103
mapping: str | None,
104104
) -> None:
105105
"""Creates a Croissant Builder and runs the preparation.
106106
107107
Args:
108108
jsonld: The Croissant config file for the given dataset
109+
out_dir: Path where the converted dataset will be stored.
110+
out_file_format: File format to convert the dataset to.
109111
record_sets: The `@id`s of the record sets to generate. Each record set will
110112
correspond to a separate config. If not specified, it will use all the
111113
record sets
112-
out_file_format: File format to convert the dataset to.
113-
out_dir: Path where the converted dataset will be stored.
114114
mapping: Mapping filename->filepath as a Python dict[str, str] to handle
115115
manual downloads. If `document.csv` is the FileObject and you downloaded
116116
it to `~/Downloads/document.csv`, you can specify

tensorflow_datasets/scripts/prepare_croissant.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,9 @@
2020
```
2121
python tensorflow_datasets/scripts/prepare_croissant.py \
2222
--jsonld=/tmp/croissant.json \
23-
--record_sets=record1 --record_sets=record2
24-
--file_format=array_record
25-
--out_dir=/tmp/foo
23+
--out_dir=/tmp/foo \
24+
--out_file_format=array_record \
25+
--record_sets=record1,record2 \
2626
--mapping='{"document.csv": "~/Downloads/document.csv"}'
2727
```
2828
"""
@@ -36,14 +36,19 @@
3636
_JSONLD = flags.DEFINE_string(
3737
name='jsonld', default=None, help='Path to the JSONLD file.', required=True
3838
)
39+
_OUT_DIR = flags.DEFINE_string(
40+
name='out_dir',
41+
default=None,
42+
help='Path where the converted dataset will be stored.',
43+
required=True,
44+
)
3945
_OUT_FILE_FORMAT = flags.DEFINE_enum_class(
4046
name='out_file_format',
41-
default=None,
47+
default=file_adapters.FileFormat.ARRAY_RECORD,
4248
enum_class=file_adapters.FileFormat,
4349
help='File format to convert the dataset to.',
44-
required=True,
4550
)
46-
_RECORD_SETS = flags.DEFINE_multi_string(
51+
_RECORD_SETS = flags.DEFINE_list(
4752
name='record_sets',
4853
default=[],
4954
help=(
@@ -52,12 +57,6 @@
5257
' the record sets.'
5358
),
5459
)
55-
_OUT_DIR = flags.DEFINE_string(
56-
name='out_dir',
57-
default=None,
58-
help='Path where the converted dataset will be stored.',
59-
required=True,
60-
)
6160
_MAPPING = flags.DEFINE_string(
6261
name='mapping',
6362
default=None,
@@ -73,9 +72,9 @@
7372
def main(_):
7473
croissant.prepare_croissant_builder(
7574
jsonld=_JSONLD.value,
76-
record_sets=_RECORD_SETS.value,
77-
out_file_format=_OUT_FILE_FORMAT.value,
7875
out_dir=_OUT_DIR.value,
76+
out_file_format=_OUT_FILE_FORMAT.value.value,
77+
record_sets=_RECORD_SETS.value,
7978
mapping=_MAPPING.value,
8079
)
8180

0 commit comments

Comments
 (0)