
Commit 4627fce

Author: The TensorFlow Datasets Authors (committed)
Refactor utils to support both multilingual descriptions and names in CroissantBuilder.
PiperOrigin-RevId: 799927689
1 parent b13d12b · commit 4627fce

4 files changed: +175 -49 lines changed

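For context: Croissant metadata allows `name` and `description` to be either plain strings or dictionaries keyed by language code. A minimal sketch of the two shapes the refactored utilities now accept (the field values are illustrative only, mirroring the constructs used in the new tests below):

import mlcroissant as mlc

# Plain-string description: the only shape handled before this change.
plain_field = mlc.Field(
    data_types=mlc.DataType.TEXT,
    description="A plain-text description",
)

# Localized description: a dict mapping language codes to strings. The
# refactored helpers resolve this to a single string, preferring 'en' and
# otherwise falling back to the first available language.
localized_field = mlc.Field(
    data_types=mlc.DataType.TEXT,
    description={"en": "English description", "fr": "Description française"},
)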

tensorflow_datasets/core/dataset_builders/croissant_builder.py

Lines changed: 17 additions & 10 deletions
@@ -107,17 +107,21 @@ def array_datatype_converter(
   elif enp.lazy.is_np_dtype(field.data_type):
     field_dtype = field.data_type
 
+  description = croissant_utils.extract_localized_string(
+      field.description, field_name='description'
+  )
+
   if len(field.array_shape_tuple) == 1:
-    return sequence_feature.Sequence(feature, doc=field.description)
+    return sequence_feature.Sequence(feature, doc=description)
   elif (-1 in field.array_shape_tuple) or (field_dtype is None):
     for _ in range(len(field.array_shape_tuple)):
-      feature = sequence_feature.Sequence(feature, doc=field.description)
+      feature = sequence_feature.Sequence(feature, doc=description)
     return feature
   else:
     return tensor_feature.Tensor(
         shape=field.array_shape_tuple,
         dtype=field_dtype,
-        doc=field.description,
+        doc=description,
     )
 
 
@@ -151,6 +155,9 @@ def datatype_converter(
   }
 
   field_data_type = field.data_type
+  description = croissant_utils.extract_localized_string(
+      field.description, field_name='description'
+  )
 
   if not field_data_type:
     # Fields with sub fields are of type None.
@@ -162,22 +169,22 @@ def datatype_converter(
               )
              for subfield in field.sub_fields
          },
-          doc=field.description,
+          doc=description,
      )
    else:
      feature = None
  elif field_data_type == bytes:
-    feature = text_feature.Text(doc=field.description)
+    feature = text_feature.Text(doc=description)
  elif field_data_type in dtype_mapping:
    feature = dtype_mapping[field_data_type]
  elif enp.lazy.is_np_dtype(field_data_type):
    feature = field_data_type
  # We return a text feature for date-time features (mlc.DataType.DATE,
  # mlc.DataType.DATETIME, and mlc.DataType.TIME).
  elif field_data_type == pd.Timestamp or field_data_type == datetime.time:
-    feature = text_feature.Text(doc=field.description)
+    feature = text_feature.Text(doc=description)
  elif field_data_type == mlc.DataType.IMAGE_OBJECT:
-    feature = image_feature.Image(doc=field.description)
+    feature = image_feature.Image(doc=description)
  elif field_data_type == mlc.DataType.BOUNDING_BOX:
    # TFDS uses REL_YXYX by default, but Hugging Face doesn't enforce a format.
    if bbox_format := field.source.format:
@@ -190,14 +197,14 @@ def datatype_converter(
            f'{[format.value for format in bb_utils.BBoxFormat]}'
        ) from e
    feature = bounding_boxes.BBoxFeature(
-        doc=field.description, bbox_format=bbox_format
+        doc=description, bbox_format=bbox_format
    )
  elif field_data_type == mlc.DataType.AUDIO_OBJECT:
    feature = audio_feature.Audio(
-        doc=field.description, sample_rate=field.source.sampling_rate
+        doc=description, sample_rate=field.source.sampling_rate
    )
  elif field_data_type == mlc.DataType.VIDEO_OBJECT:
-    feature = video_feature.Video(doc=field.description)
+    feature = video_feature.Video(doc=description)
  else:
    raise ValueError(
        f'Unknown data type: {field_data_type} for field {field.id}.'
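The net effect of the hunks above: every feature's `doc` is now derived once via `croissant_utils.extract_localized_string` instead of passing `field.description` through verbatim. A simplified sketch of the fallback order the helper applies when no language is requested (not the actual implementation, which is added in croissant_utils.py below):

def _resolve_description_sketch(description):
  """Simplified fallback order for a possibly localized description."""
  if description is None or isinstance(description, str):
    return description  # Plain strings (and None) pass through unchanged.
  # Localized dict: prefer 'en', otherwise fall back to the first entry.
  return description.get("en", next(iter(description.values())))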

tensorflow_datasets/core/dataset_builders/croissant_builder_test.py

Lines changed: 25 additions & 1 deletion
@@ -262,7 +262,12 @@ def test_datatype_converter_complex(
     subfield_types: Dict[str, Type[Any]] | None,
 ):
   actual_feature = croissant_builder.datatype_converter(mlc_field)
-  assert actual_feature.doc.desc == mlc_field.description
+  expected_description = mlc_field.description
+  if isinstance(expected_description, dict):
+    expected_description = expected_description.get(
+        "en", next(iter(expected_description.values()))
+    )
+  assert actual_feature.doc.desc == expected_description
   assert isinstance(actual_feature, feature_type)
   if subfield_types is not None:
     for feature_name in actual_feature.keys():
@@ -271,6 +276,25 @@ def test_datatype_converter_complex(
       )
 
 
+def test_datatype_converter_multilingual_description():
+  mlc_field = mlc.Field(
+      data_types=mlc.DataType.TEXT,
+      description={"en": "English desc", "fr": "Description française"},
+  )
+  actual_feature = croissant_builder.datatype_converter(mlc_field)
+  assert actual_feature.doc.desc == "English desc"
+
+  mlc_field_no_en = mlc.Field(
+      data_types=mlc.DataType.TEXT,
+      description={
+          "de": "Deutsche Beschreibung",
+          "fr": "Description française",
+      },
+  )
+  actual_feature_no_en = croissant_builder.datatype_converter(mlc_field_no_en)
+  assert actual_feature_no_en.doc.desc == "Deutsche Beschreibung"
+
+
 def test_datatype_converter_none():
   field = mlc.Field(
       name="my_field", id="my_field", description="Field with empty data type."

tensorflow_datasets/core/utils/croissant_utils.py

Lines changed: 67 additions & 20 deletions
@@ -63,6 +63,65 @@ def get_croissant_version(version: str | None) -> str | None:
   return version
 
 
+def extract_localized_string(
+    attribute: str | dict[str, str] | None,
+    language: str | None = None,
+    field_name: str = "text field",
+) -> str | None:
+  """Returns the text in the specified language from a potentially localized object.
+
+  Some attributes in Croissant (e.g., `name` and `description`) can be
+  localized, meaning that they can be either simple strings, or dictionaries
+  mapping language codes to strings (e.g., `{"en": "English Name", "fr": "Nom
+  français"}`). This function extracts the text in the specified language from a
+  potentially localized object.
+
+  Args:
+    attribute: The object containing the text, which can be a simple string, a
+      dictionary mapping language codes to strings, or None.
+    language: The desired language code. If None, a heuristic is used: 'en' is
+      preferred, otherwise the first available language in the dictionary.
+    field_name: The name of the field being processed (e.g., "name",
+      "description"), used for error messages.
+
+  Returns:
+    The text string in the desired language, or None if the input is None.
+
+  Raises:
+    ValueError: If the text_object is an empty dictionary, or if the specified
+      language is not found.
+    TypeError: If attribute is not a str, dict, or None.
+  """
+  if attribute is None:
+    return None
+  if isinstance(attribute, str):
+    return attribute
+
+  if not isinstance(attribute, dict):
+    raise TypeError(
+        f"{field_name} must be a string, dictionary, or None. Got"
+        f" {type(attribute)}"
+    )
+
+  if language is None:
+    # Try a heuristic language, e.g., 'en'.
+    if "en" in attribute:
+      return attribute["en"]
+    # Otherwise, take the first language in the dict.
+    try:
+      first_lang = next(iter(attribute))
+      return attribute[first_lang]
+    except StopIteration as exc:
+      raise ValueError(f"Dataset `{field_name}` dictionary is empty.") from exc
+  elif language in attribute:
+    return attribute[language]
+  else:
+    raise ValueError(
+        f"Language '{language}' not found in {field_name} keys:"
+        f" {list(attribute.keys())}."
+    )
+
+
 def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
   """Returns dataset name of the given MLcroissant dataset.
 
@@ -73,26 +132,14 @@ def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
   """
   if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
     return url.removeprefix(_HUGGINGFACE_URL_PREFIX)
-  name = dataset.metadata.name
-  if isinstance(name, dict):
-    if language is None:
-      # Try a heuristic language, e.g., 'en'.
-      if "en" in name:
-        return name["en"]
-      # Otherwise, take the first language in the dict.
-      try:
-        first_lang = next(iter(name))
-        return name[first_lang]
-      except StopIteration as exc:
-        raise ValueError("Dataset name dictionary is empty.") from exc
-    elif language not in dataset.metadata.name:
-      raise ValueError(
-          f"Language {language} not found in dataset names {name}."
-      )
-    else:
-      return name[language]
-  # At this point, name is not a dict anymore.
-  return typing.cast(str, name)
+  name = extract_localized_string(
+      dataset.metadata.name, language=language, field_name="name"
+  )
+  if name is None:
+    # This case should ideally be prevented by mlcroissant's validation
+    # ensuring metadata.name is not None.
+    raise ValueError("Dataset name is missing.")
+  return name
 
 
 def get_tfds_dataset_name(
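For reference, a few illustrative calls to the new helper with results inferred from the function body above; the parametrized test in croissant_utils_test.py below exercises the same cases, and the import path shown here is assumed to match the test module:

from tensorflow_datasets.core.utils import croissant_utils

# Plain strings and None pass through unchanged.
croissant_utils.extract_localized_string('Simple Text')  # -> 'Simple Text'
croissant_utils.extract_localized_string(None)  # -> None

# Localized dict, no language requested: 'en' is preferred when present...
croissant_utils.extract_localized_string(
    {'en': 'English Text', 'fr': 'Texte Français'}
)  # -> 'English Text'

# ...otherwise the first language in the dict is used.
croissant_utils.extract_localized_string(
    {'de': 'Deutscher Text', 'fr': 'Texte Français'}
)  # -> 'Deutscher Text'

# Requesting a language that is missing raises ValueError.
croissant_utils.extract_localized_string(
    {'en': 'English Text'}, language='de'
)  # raises ValueError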

tensorflow_datasets/core/utils/croissant_utils_test.py

Lines changed: 66 additions & 18 deletions
@@ -38,6 +38,53 @@ def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
   ), f'Expected TFDS name: {tfds_name}'
 
 
+@pytest.mark.parametrize(
+    'attribute,language,expected_text',
+    [
+        ({'en': 'English Text', 'fr': 'Texte Français'}, None, 'English Text'),
+        (
+            {'de': 'Deutscher Text', 'fr': 'Texte Français'},
+            None,
+            'Deutscher Text',
+        ),
+        (
+            {'en': 'English Text', 'fr': 'Texte Français'},
+            'fr',
+            'Texte Français',
+        ),
+        ('Simple Text', None, 'Simple Text'),
+        ('Simple Text', 'en', 'Simple Text'),
+        (None, None, None),
+    ],
+)
+def test_extract_localized_string(attribute, language, expected_text):
+  assert (
+      croissant_utils.extract_localized_string(attribute, language=language)
+      == expected_text
+  )
+
+
+def test_extract_localized_string_raises():
+  # Language not found.
+  with pytest.raises(
+      ValueError,
+      match=r"Language 'de' not found in text field keys:",
+  ):
+    croissant_utils.extract_localized_string(
+        {'en': 'English Text', 'fr': 'Texte Français'}, language='de'
+    )
+
+  # Empty dictionary.
+  with pytest.raises(
+      ValueError, match='Dataset `text field` dictionary is empty'
+  ):
+    croissant_utils.extract_localized_string({}, language=None)
+
+  # Incorrect type.
+  with pytest.raises(TypeError, match='must be a string, dictionary, or None'):
+    croissant_utils.extract_localized_string(123)
+
+
 @pytest.mark.parametrize(
     'croissant_name,language,expected_name',
     [
@@ -61,6 +108,25 @@ def test_get_dataset_name(croissant_name, language, expected_name):
   )
 
 
+def test_get_dataset_name_raises():
+  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
+  # Test language not found in name.
+  metadata_lang_not_found = mlc.Metadata(
+      name={'en': 'English Name', 'fr': 'Nom Français'}, ctx=ctx, url=None
+  )
+  dataset_lang_not_found = mlc.Dataset.from_metadata(metadata_lang_not_found)
+  with pytest.raises(
+      ValueError, match=r"Language 'de' not found in name keys:"
+  ):
+    croissant_utils.get_dataset_name(dataset_lang_not_found, language='de')
+
+  # Test empty dictionary name.
+  metadata_empty_dict = mlc.Metadata(name={}, ctx=ctx, url=None)
+  dataset_empty_dict = mlc.Dataset.from_metadata(metadata_empty_dict)
+  with pytest.raises(ValueError, match='Dataset `name` dictionary is empty.'):
+    croissant_utils.get_dataset_name(dataset_empty_dict, language=None)
+
+
 def test_get_dataset_name_url_precedence():
   ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
   # Test that URL prefix removal works and takes precedence over name.
@@ -94,24 +160,6 @@ def test_get_dataset_name_url_precedence():
   assert croissant_utils.get_dataset_name(dataset_other_url) == 'Not Ignored'
 
 
-def test_get_dataset_multilingual_name_with_language_not_found():
-  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
-  metadata_lang_not_found = mlc.Metadata(
-      name={'en': 'English Name', 'fr': 'Nom Français'}, ctx=ctx, url=None
-  )
-  dataset_lang_not_found = mlc.Dataset.from_metadata(metadata_lang_not_found)
-  with pytest.raises(ValueError, match='Language de not found'):
-    croissant_utils.get_dataset_name(dataset_lang_not_found, language='de')
-
-
-def test_get_dataset_multilingual_name_with_empty_dict():
-  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
-  metadata_empty_dict = mlc.Metadata(name={}, ctx=ctx, url=None)
-  dataset_empty_dict = mlc.Dataset.from_metadata(metadata_empty_dict)
-  with pytest.raises(ValueError, match='Dataset name dictionary is empty'):
-    croissant_utils.get_dataset_name(dataset_empty_dict, language=None)
-
-
 @pytest.mark.parametrize(
     'croissant_version,tfds_version',
     [
