Skip to content

Commit b13d12b

Browse files
author
The TensorFlow Datasets Authors
committed
Add tests for handling multilingual names in Croissant.
PiperOrigin-RevId: 799442432
1 parent 9f1fdbf commit b13d12b

File tree

1 file changed

+77
-1
lines changed

1 file changed

+77
-1
lines changed

tensorflow_datasets/core/utils/croissant_utils_test.py

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,83 @@
3333
def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
3434
metadata = mlc.Metadata(name=croissant_name, url=croissant_url)
3535
dataset = mlc.Dataset.from_metadata(metadata)
36-
assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
36+
assert (
37+
croissant_utils.get_tfds_dataset_name(dataset) == tfds_name
38+
), f'Expected TFDS name: {tfds_name}'
39+
40+
41+
@pytest.mark.parametrize(
    'croissant_name,language,expected_name',
    [
        # Multilingual name, no language requested: the 'en' entry is expected.
        ({'en': 'English Name', 'fr': 'Nom Français'}, None, 'English Name'),
        # Multilingual name without an 'en' entry and no language requested:
        # the 'de' entry (the first one in the dict) is expected.
        (
            {'de': 'Deutscher Name', 'fr': 'Nom Français'},
            None,
            'Deutscher Name',
        ),
        # Multilingual name with an explicitly requested language.
        ({'en': 'English Name', 'fr': 'Nom Français'}, 'fr', 'Nom Français'),
        # Plain string name is expected to be returned unchanged.
        ('Simple Name', None, 'Simple Name'),
    ],
)
def test_get_dataset_name(croissant_name, language, expected_name):
  """Checks `get_dataset_name` for plain and multilingual Croissant names.

  Each case builds a Croissant 1.1 dataset without a URL and compares the
  name resolved for `language` against `expected_name`.
  """
  context = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(name=croissant_name, ctx=context, url=None)
  )
  resolved_name = croissant_utils.get_dataset_name(dataset, language=language)
  assert resolved_name == expected_name
62+
63+
64+
def test_get_dataset_name_url_precedence():
  """Checks how the dataset URL interacts with the declared name.

  Three scenarios are exercised in order: a HuggingFace URL with a string
  name, a HuggingFace URL with a dict name, and a non-HuggingFace URL.
  """
  context = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')

  # A HuggingFace URL wins over a plain string name; the URL prefix is removed.
  string_name_dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(
          name='Should Be Ignored',
          ctx=context,
          url='https://huggingface.co/datasets/user/dataset_name',
      )
  )
  string_name_result = croissant_utils.get_dataset_name(string_name_dataset)
  assert string_name_result == 'user/dataset_name'

  # The same precedence applies when the name is a multilingual dict.
  dict_name_dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(
          name={'en': 'Should Be Ignored'},
          ctx=context,
          url='https://huggingface.co/datasets/another/other_dataset',
      )
  )
  dict_name_result = croissant_utils.get_dataset_name(dict_name_dataset)
  assert dict_name_result == 'another/other_dataset'

  # A non-HuggingFace URL must not cause the declared name to be ignored.
  other_url_dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(
          name='Not Ignored',
          ctx=context,
          url='https://example.com/dataset',
      )
  )
  other_url_result = croissant_utils.get_dataset_name(other_url_dataset)
  assert other_url_result == 'Not Ignored'
95+
96+
97+
def test_get_dataset_multilingual_name_with_language_not_found():
  """Expects a ValueError when the requested language has no name entry."""
  context = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  # The name dict offers only 'en' and 'fr'; 'de' is requested below.
  multilingual_name = {'en': 'English Name', 'fr': 'Nom Français'}
  dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(name=multilingual_name, ctx=context, url=None)
  )
  with pytest.raises(ValueError, match='Language de not found'):
    croissant_utils.get_dataset_name(dataset, language='de')
105+
106+
107+
def test_get_dataset_multilingual_name_with_empty_dict():
  """Expects a ValueError when the multilingual name dict has no entries."""
  context = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  dataset = mlc.Dataset.from_metadata(
      mlc.Metadata(name={}, ctx=context, url=None)
  )
  with pytest.raises(ValueError, match='Dataset name dictionary is empty'):
    croissant_utils.get_dataset_name(dataset, language=None)
37113

38114

39115
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)