Skip to content

Commit 9f1fdbf

Browse files
author
The TensorFlow Datasets Authors
committed
Support multilingual names in croissant_utils.
PiperOrigin-RevId: 799264622
1 parent fc11f4c commit 9f1fdbf

File tree

1 file changed

+40
-7
lines changed

1 file changed

+40
-7
lines changed

tensorflow_datasets/core/utils/croissant_utils.py

Lines changed: 40 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,16 +63,49 @@ def get_croissant_version(version: str | None) -> str | None:
6363
return version
6464

6565

66-
def get_dataset_name(dataset: mlc.Dataset) -> str:
67-
"""Returns dataset name of the given MLcroissant dataset."""
66+
def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
  """Returns dataset name of the given MLcroissant dataset.

  If the dataset URL starts with `_HUGGINGFACE_URL_PREFIX`, the name is the
  remainder of that URL and `language` is not consulted. Otherwise the name is
  taken from `dataset.metadata.name`, which may be a single value or a dict
  mapping a language to the name in that language.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets with multiple names in different languages, this
      argument specifies the language to use. If None, the 'en' name is
      preferred, falling back to the first entry of the dict. Not used when
      the name is not a dict.

  Returns:
    The dataset name.

  Raises:
    ValueError: If `language` is given but is not among the dataset names, or
      if no language is given and the dict of names is empty.
  """
  if (url := dataset.metadata.url) and url.startswith(_HUGGINGFACE_URL_PREFIX):
    return url.removeprefix(_HUGGINGFACE_URL_PREFIX)

  name = dataset.metadata.name
  if not isinstance(name, dict):
    # Single-name dataset: nothing to select.
    return typing.cast(str, name)

  if language is not None:
    # An explicitly requested language must exist; never fall back silently.
    if language not in name:
      raise ValueError(
          f"Language {language} not found in dataset names {name}."
      )
    return name[language]

  # No language requested: try a heuristic language, e.g., 'en'.
  if "en" in name:
    return name["en"]

  # Otherwise, take the first language in the dict.
  try:
    first_lang = next(iter(name))
  except StopIteration as exc:
    raise ValueError("Dataset name dictionary is empty.") from exc
  return name[first_lang]
96+
97+
98+
def get_tfds_dataset_name(
    dataset: mlc.Dataset, language: str | None = None
) -> str:
  """Returns the name of the MLcroissant dataset as a TFDS compatible name.

  The name is resolved with `get_dataset_name` and then converted with
  `conversion_utils.to_tfds_name`.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets with multiple names in different languages, the
      language of the name to use. Forwarded to `get_dataset_name`.
  """
  return conversion_utils.to_tfds_name(
      get_dataset_name(dataset, language=language)
  )
77110

78111

0 commit comments

Comments
 (0)