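Fix prepare_croissant command.

Fix the usage examples in both docstrings (add the missing line
continuations; the script's flag is `--out_file_format`, and its
`--record_sets` flag is now a comma-separated list), make the output file
format default to `array_record` instead of being required, and pass the
file format's string value from the script to `prepare_croissant_builder`.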

fineguy · The TensorFlow Datasets Authors · commit df39eff624a3 · 2024-04-19T13:00:01.000-07:00
PiperOrigin-RevId: 626446760
diff --git a/tensorflow_datasets/scripts/cli/croissant.py b/tensorflow_datasets/scripts/cli/croissant.py
@@ -19,9 +19,9 @@
 ```
 tfds build_croissant \
   --jsonld=/tmp/croissant.json \
-  --record_sets=record1 --record_sets=record2
-  --file_format=array_record
-  --out_dir=/tmp/foo
+  --out_dir=/tmp/foo \
+  --file_format=array_record \
+  --record_sets=record1 --record_sets=record2 \
-  --mapping='{"document.csv": "~/Downloads/document.csv"}"'
+  --mapping='{"document.csv": "~/Downloads/document.csv"}'
 ```
 """
@@ -43,12 +43,18 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
       help='The Croissant config file for the given dataset.',
       required=True,
   )
+  parser.add_argument(
+      '--out_dir',
+      type=epath.Path,
+      help='Path where the converted dataset will be stored.',
+      required=True,
+  )
   parser.add_argument(
       '--file_format',
+      default=file_adapters.FileFormat.ARRAY_RECORD.value,
       type=str,
       choices=[file_format.value for file_format in file_adapters.FileFormat],
       help='File format to convert the dataset to.',
-      required=True,
   )
   parser.add_argument(
       '--record_sets',
@@ -59,12 +65,6 @@ def add_parser_arguments(parser: argparse.ArgumentParser) -> None:
           ' the record sets'
       ),
   )
-  parser.add_argument(
-      '--out_dir',
-      type=epath.Path,
-      help='Path where the converted dataset will be stored.',
-      required=True,
-  )
   parser.add_argument(
       '--mapping',
       type=str,
@@ -87,30 +87,30 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
   parser.set_defaults(
       subparser_fn=lambda args: prepare_croissant_builder(
           jsonld=args.jsonld,
-          record_sets=args.record_sets,
-          out_file_format=args.file_format,
           out_dir=args.out_dir,
+          out_file_format=args.file_format,
+          record_sets=args.record_sets,
           mapping=args.mapping,
       )
   )
 
 
 def prepare_croissant_builder(
     jsonld: epath.PathLike,
-    record_sets: Sequence[str],
-    out_file_format: str,
     out_dir: epath.PathLike,
+    out_file_format: str,
+    record_sets: Sequence[str],
     mapping: str | None,
 ) -> None:
   """Creates a Croissant Builder and runs the preparation.
 
   Args:
     jsonld: The Croissant config file for the given dataset
+    out_dir: Path where the converted dataset will be stored.
+    out_file_format: File format to convert the dataset to.
     record_sets: The `@id`s of the record sets to generate. Each record set will
       correspond to a separate config. If not specified, it will use all the
       record sets
-    out_file_format: File format to convert the dataset to.
-    out_dir: Path where the converted dataset will be stored.
     mapping: Mapping filename->filepath as a Python dict[str, str] to handle
       manual downloads. If `document.csv` is the FileObject and you downloaded
       it to `~/Downloads/document.csv`, you can specify
diff --git a/tensorflow_datasets/scripts/prepare_croissant.py b/tensorflow_datasets/scripts/prepare_croissant.py
@@ -20,9 +20,9 @@
 ```
 python tensorflow_datasets/scripts/prepare_croissant.py \
   --jsonld=/tmp/croissant.json \
-  --record_sets=record1 --record_sets=record2
-  --file_format=array_record
-  --out_dir=/tmp/foo
+  --out_dir=/tmp/foo \
+  --out_file_format=array_record \
+  --record_sets=record1,record2 \
-  --mapping='{"document.csv": "~/Downloads/document.csv"}"'
+  --mapping='{"document.csv": "~/Downloads/document.csv"}'
 ```
 """
@@ -36,14 +36,19 @@
 _JSONLD = flags.DEFINE_string(
     name='jsonld', default=None, help='Path to the JSONLD file.', required=True
 )
+_OUT_DIR = flags.DEFINE_string(
+    name='out_dir',
+    default=None,
+    help='Path where the converted dataset will be stored.',
+    required=True,
+)
 _OUT_FILE_FORMAT = flags.DEFINE_enum_class(
     name='out_file_format',
-    default=None,
+    default=file_adapters.FileFormat.ARRAY_RECORD,
     enum_class=file_adapters.FileFormat,
     help='File format to convert the dataset to.',
-    required=True,
 )
-_RECORD_SETS = flags.DEFINE_multi_string(
+_RECORD_SETS = flags.DEFINE_list(
     name='record_sets',
     default=[],
     help=(
@@ -52,12 +57,6 @@
         ' the record sets.'
     ),
 )
-_OUT_DIR = flags.DEFINE_string(
-    name='out_dir',
-    default=None,
-    help='Path where the converted dataset will be stored.',
-    required=True,
-)
 _MAPPING = flags.DEFINE_string(
     name='mapping',
     default=None,
@@ -73,9 +72,9 @@
 def main(_):
   croissant.prepare_croissant_builder(
       jsonld=_JSONLD.value,
-      record_sets=_RECORD_SETS.value,
-      out_file_format=_OUT_FILE_FORMAT.value,
       out_dir=_OUT_DIR.value,
+      out_file_format=_OUT_FILE_FORMAT.value.value,
+      record_sets=_RECORD_SETS.value,
       mapping=_MAPPING.value,
   )