|
33 | 33 | def test_get_tfds_dataset_name(croissant_name, croissant_url, tfds_name):
|
34 | 34 | metadata = mlc.Metadata(name=croissant_name, url=croissant_url)
|
35 | 35 | dataset = mlc.Dataset.from_metadata(metadata)
|
36 |
| - assert croissant_utils.get_tfds_dataset_name(dataset) == tfds_name |
| 36 | + assert ( |
| 37 | + croissant_utils.get_tfds_dataset_name(dataset) == tfds_name |
| 38 | + ), f'Expected TFDS name: {tfds_name}' |
| 39 | + |
| 40 | + |
@pytest.mark.parametrize(
    'croissant_name,language,expected_name',
    [
        # Multilingual name with an 'en' entry and no language requested:
        # the test expects the 'en' value.
        ({'en': 'English Name', 'fr': 'Nom Français'}, None, 'English Name'),
        # Multilingual name without an 'en' entry and no language requested:
        # the test expects the 'de' value.
        (
            {'de': 'Deutscher Name', 'fr': 'Nom Français'},
            None,
            'Deutscher Name',
        ),
        # Explicitly requested language that is present in the dict.
        ({'en': 'English Name', 'fr': 'Nom Français'}, 'fr', 'Nom Français'),
        # Plain string name.
        ('Simple Name', None, 'Simple Name'),
    ],
)
def test_get_dataset_name(croissant_name, language, expected_name):
  """Checks `get_dataset_name` for string and per-language dict names.

  Each case builds a Croissant 1.1 dataset without a URL, so only the `name`
  field and the requested `language` are passed in.
  """
  croissant_ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  croissant_metadata = mlc.Metadata(
      name=croissant_name, ctx=croissant_ctx, url=None
  )
  croissant_dataset = mlc.Dataset.from_metadata(croissant_metadata)
  actual_name = croissant_utils.get_dataset_name(
      croissant_dataset, language=language
  )
  assert actual_name == expected_name
| 62 | + |
| 63 | + |
def test_get_dataset_name_url_precedence():
  """Checks how the dataset URL interacts with the name in `get_dataset_name`.

  Three scenarios are asserted in order: a HuggingFace URL with a string name,
  a HuggingFace URL with a dict name, and a non-HuggingFace URL.
  """
  ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')

  def resolve_name(name, url):
    # Builds a dataset from the given name/URL pair and returns the name that
    # `get_dataset_name` reports for it.
    metadata = mlc.Metadata(name=name, ctx=ctx, url=url)
    dataset = mlc.Dataset.from_metadata(metadata)
    return croissant_utils.get_dataset_name(dataset)

  # Test that URL prefix removal works and takes precedence over name.
  name_from_hf_url = resolve_name(
      'Should Be Ignored',
      'https://huggingface.co/datasets/user/dataset_name',
  )
  assert name_from_hf_url == 'user/dataset_name'

  # Test that URL precedence also works when the name is a dict.
  name_from_hf_url_with_dict = resolve_name(
      {'en': 'Should Be Ignored'},
      'https://huggingface.co/datasets/another/other_dataset',
  )
  assert name_from_hf_url_with_dict == 'another/other_dataset'

  # Test that non-HuggingFace URLs don't cause name to be ignored.
  name_from_other_url = resolve_name(
      'Not Ignored',
      'https://example.com/dataset',
  )
  assert name_from_other_url == 'Not Ignored'
| 95 | + |
| 96 | + |
def test_get_dataset_multilingual_name_with_language_not_found():
  """Expects a `ValueError` when the requested language is not in the name."""
  croissant_ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  multilingual_name = {'en': 'English Name', 'fr': 'Nom Français'}
  metadata = mlc.Metadata(name=multilingual_name, ctx=croissant_ctx, url=None)
  dataset = mlc.Dataset.from_metadata(metadata)
  # 'de' is not one of the keys of the name dict above.
  with pytest.raises(ValueError, match='Language de not found'):
    croissant_utils.get_dataset_name(dataset, language='de')
| 105 | + |
| 106 | + |
def test_get_dataset_multilingual_name_with_empty_dict():
  """Expects a `ValueError` when the dataset name is an empty dict."""
  croissant_ctx = mlc.Context(conforms_to='http://mlcommons.org/croissant/1.1')
  metadata = mlc.Metadata(name={}, ctx=croissant_ctx, url=None)
  dataset = mlc.Dataset.from_metadata(metadata)
  with pytest.raises(ValueError, match='Dataset name dictionary is empty'):
    croissant_utils.get_dataset_name(dataset, language=None)
37 | 113 |
|
38 | 114 |
|
39 | 115 | @pytest.mark.parametrize(
|
|
0 commit comments