Add an option to skip cr:Split record sets when getting record_set_ids for a CroissantBuilder.

The TensorFlow Datasets Authors · The TensorFlow Datasets Authors · commit 1fd44fc74721 · 2024-06-27T02:15:47.000-07:00
PiperOrigin-RevId: 647237046
diff --git a/tensorflow_datasets/core/dataset_builders/croissant_builder.py b/tensorflow_datasets/core/dataset_builders/croissant_builder.py
@@ -158,7 +158,8 @@ def __init__(
         a URL.
       record_set_ids: The @ids of the record sets for the dataset. Each record
         set will correspond to a separate config. If not specified, a config
-        will be generated for each record set defined in the Croissant JSON-LD.
+        will be generated for each record set defined in the Croissant JSON-LD,
+        except for the record sets which specify `cr:data`.
       disable_shuffling: Specify whether to shuffle the examples.
       int_dtype: The dtype to use for TFDS integer features. Defaults to
         np.int64.
@@ -186,9 +187,7 @@ def __init__(
     self.RELEASE_NOTES = {}  # pylint: disable=invalid-name
 
     if not record_set_ids:
-      record_set_ids = [
-          record_set.id for record_set in self.metadata.record_sets
-      ]
+      record_set_ids = croissant_utils.get_record_set_ids(self.metadata)
     config_names = [
         huggingface_utils.convert_hf_name(record_set)
         for record_set in record_set_ids
diff --git a/tensorflow_datasets/core/utils/croissant_utils.py b/tensorflow_datasets/core/utils/croissant_utils.py
@@ -39,3 +39,20 @@ def get_tfds_dataset_name(dataset: mlc.Dataset) -> str:
   """Returns TFDS compatible dataset name of the given MLcroissant dataset."""
   dataset_name = get_dataset_name(dataset)
   return huggingface_utils.convert_hf_name(dataset_name)
+
+
+def get_record_set_ids(metadata: mlc.Metadata) -> typing.Sequence[str]:
+  """Returns record set ids of the given MLcroissant metadata.
+
+  Record sets which have the attribute `cr:data` are excluded (e.g. record sets
+  that define split or label mappings).
+
+  Args:
+    metadata: The metadata of the dataset.
+  """
+  record_set_ids = []
+  for record_set in metadata.record_sets:
+    if record_set.data is not None:
+      continue
+    record_set_ids.append(record_set.id)
+  return record_set_ids
diff --git a/tensorflow_datasets/core/utils/croissant_utils_test.py b/tensorflow_datasets/core/utils/croissant_utils_test.py
@@ -34,3 +34,24 @@ def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
   metadata = mlc.Metadata(name=croissant_name, url=croissant_url)
   dataset = mlc.Dataset.from_metadata(metadata)
   assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
+
+
+def test_get_record_set_ids():
+  metadata = mlc.Metadata(
+      name='dummy_dataset',
+      url='https://dummy_url',
+      record_sets=[
+          mlc.RecordSet(
+              id='record_set_1',
+              fields=[],
+          ),
+          mlc.RecordSet(
+              id='record_set_2',
+              data_types=['http://mlcommons.org/croissant/Split'],
+              fields=[mlc.Field(name='name', data_types=mlc.DataType.TEXT)],
+              data=[{'name': 'train'}, {'name': 'test'}],
+          ),
+      ],
+  )
+  record_set_ids = croissant_utils.get_record_set_ids(metadata=metadata)
+  assert record_set_ids == ['record_set_1']