Skip to content

Commit 50001e8

Browse files
fineguy authored and The TensorFlow Datasets Authors committed
Automatically parse record set ids when they're not given.
PiperOrigin-RevId: 651096174
1 parent e124446 commit 50001e8

File tree

3 files changed

+8
-6
lines changed

3 files changed

+8
-6
lines changed

tensorflow_datasets/core/proto/dataset_info.proto

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -111,10 +111,6 @@ message DataSourceAccess {
111111
// URL referring to the data being used.
112112
// If provided together with a source, the url should correspond to the source
113113
// or part of the source.
114-
// copybara:strip_begin
115-
// If referring to a DataHub dataset, use the following format:
116-
// http://data/details/mldataset.tfds.mnist
117-
// copybara:strip_end
118114
Url url = 5;
119115
}
120116

tensorflow_datasets/core/utils/croissant_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def get_tfds_dataset_name(dataset: mlc.Dataset) -> str:
4141
return huggingface_utils.convert_hf_name(dataset_name)
4242

4343

44-
def get_record_set_ids(metadata: mlc.Metadata) -> typing.Sequence[str]:
44+
def get_record_set_ids(metadata: mlc.Metadata) -> list[str]:
4545
"""Returns record set ids of the given MLcroissant metadata.
4646
4747
Record sets which have the attribute `cr:Data` are excluded (e.g. splits that

tensorflow_datasets/scripts/cli/croissant.py

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -106,6 +106,12 @@ def dataset(self) -> mlc.Dataset:
106106
def dataset_name(self) -> str:
107107
return croissant_utils.get_dataset_name(self.dataset)
108108

109+
@functools.cached_property
110+
def record_set_ids(self) -> list[str]:
111+
return self.record_sets or croissant_utils.get_record_set_ids(
112+
self.dataset.metadata
113+
)
114+
109115

110116
def register_subparser(parsers: argparse._SubParsersAction):
111117
"""Add subparser for `convert_format` command."""
@@ -133,7 +139,7 @@ def prepare_croissant_builder(args: CmdArgs) -> None:
133139
"""
134140
builder = croissant_builder.CroissantBuilder(
135141
jsonld=args.jsonld,
136-
record_set_ids=args.record_sets or None,
142+
record_set_ids=args.record_set_ids,
137143
file_format=args.file_format,
138144
data_dir=args.data_dir,
139145
mapping=args.mapping_json,

0 commit comments

Comments
 (0)