@@ -63,16 +63,49 @@ def get_croissant_version(version: str | None) -> str | None:
63
63
return version
64
64
65
65
66
def get_dataset_name(dataset: mlc.Dataset, language: str | None = None) -> str:
  """Resolves the name of the given MLcroissant dataset.

  A metadata URL that starts with `_HUGGINGFACE_URL_PREFIX` takes precedence:
  the URL with that prefix removed is returned and `language` is not consulted.
  Otherwise the metadata name is used. When that name is a dict, a single entry
  is selected from it as described below.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets with multiple names in different languages, the key
      of the name to return. When None, the "en" entry is preferred, falling
      back to the first entry of the dict.

  Returns:
    The dataset name.

  Raises:
    ValueError: If the name dict is empty, or if `language` is given but is not
      a key of the name dict.
  """
  metadata = dataset.metadata
  url = metadata.url
  if url and url.startswith(_HUGGINGFACE_URL_PREFIX):
    return url.removeprefix(_HUGGINGFACE_URL_PREFIX)

  names = metadata.name
  if not isinstance(names, dict):
    # Not a per-language dict: hand the value back as-is, typed as str.
    return typing.cast(str, names)

  # An explicitly requested language must be present.
  if language is not None:
    if language not in names:
      raise ValueError(
          f"Language {language} not found in dataset names {names}."
      )
    return names[language]

  # No language requested: try a heuristic language first.
  if "en" in names:
    return names["en"]

  # Otherwise fall back to the first language in the dict.
  try:
    fallback_language = next(iter(names))
  except StopIteration as exc:
    raise ValueError("Dataset name dictionary is empty.") from exc
  return names[fallback_language]
96
+
97
+
98
def get_tfds_dataset_name(
    dataset: mlc.Dataset, language: str | None = None
) -> str:
  """Returns the TFDS compatible name of the given MLcroissant dataset.

  The name is looked up with `get_dataset_name` and then converted with
  `conversion_utils.to_tfds_name`.

  Args:
    dataset: The MLcroissant dataset.
    language: For datasets with multiple names in different languages, this
      argument specifies the language to use. It is forwarded unchanged to
      `get_dataset_name`.

  Returns:
    The result of `conversion_utils.to_tfds_name` applied to the dataset name.
  """
  return conversion_utils.to_tfds_name(
      get_dataset_name(dataset, language=language)
  )
77
110
78
111
0 commit comments