|
| 1 | +import copy |
| 2 | +from typing import Dict, Any |
| 3 | + |
| 4 | +import pytest |
| 5 | + |
| 6 | +from dandischema.utils import google_dataset_metadata |
| 7 | + |
| 8 | + |
@pytest.fixture
def sample_dandiset_metadata() -> Dict[str, Any]:
    """Build a minimal Dandiset metadata record for the tests in this module.

    The record holds one Person and one Organization contributor, a single
    SPDX license, and an assets summary. It carries no ``creator``,
    ``keywords``, ``datePublished`` or ``dateCreated`` keys; individual tests
    add those when they need them. A new dictionary is built on every call.
    """
    person = {
        "schemaKey": "Person",
        "name": "Doe, John",
        "roleName": ["dcite:Author", "dcite:ContactPerson"],
        "identifier": "0000-0001-2345-6789",
        "email": "john.doe@example.com",
        "includeInCitation": True,
    }
    organization = {
        "schemaKey": "Organization",
        "name": "Test Organization",
        "roleName": ["dcite:Sponsor"],
        "identifier": "https://ror.org/xxxxxxxxx",
        "includeInCitation": False,
    }

    # Each summary facet is a one-element list of {name, identifier} records.
    assets_summary = {
        "schemaKey": "AssetsSummary",
        "numberOfBytes": 1000000,
        "numberOfFiles": 10,
        "dataStandard": [
            {
                "name": "Neurodata Without Borders (NWB)",
                "identifier": "RRID:SCR_015242",
            },
        ],
        "species": [
            {
                "name": "Homo sapiens",
                "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
            },
        ],
        "approach": [
            {
                "name": "electrophysiology",
                "identifier": "http://uri.interlex.org/base/ilx_0739363",
            },
        ],
        "measurementTechnique": [
            {
                "name": "multi-electrode extracellular electrophysiology",
                "identifier": "http://uri.interlex.org/base/ilx_0739400",
            },
        ],
    }

    return {
        "@context": "https://raw.githubusercontent.com/dandi/schema/master/releases/0.6.4/context.json",
        "schemaKey": "Dandiset",
        "identifier": "DANDI:000707",
        "name": "Test Dandiset",
        "description": "A test dandiset for testing Google Dataset Search compatibility",
        "contributor": [person, organization],
        "license": ["spdx:CC-BY-4.0"],
        "schemaVersion": "0.6.4",
        "assetsSummary": assets_summary,
    }
| 67 | + |
| 68 | + |
def test_google_dataset_metadata_basic_transformation(sample_dandiset_metadata):
    """Check the core fields of a basic ``google_dataset_metadata`` result.

    Asserts that the input is left untouched, and that the result carries the
    expected ``schemaKey``, ``creator``, ``license``, ``version``,
    ``identifier`` and ``keywords`` entries for the sample record.
    """
    # Snapshot the input before the call. Comparing the input with the result
    # only shows that the two differ, not that the input was left alone.
    pristine = copy.deepcopy(sample_dandiset_metadata)

    result = google_dataset_metadata(sample_dandiset_metadata)

    # The input must be unchanged, and the result must differ from it.
    assert sample_dandiset_metadata == pristine
    assert result != pristine

    # Check that schema:Dataset is added to schemaKey
    assert "schema:Dataset" in result["schemaKey"]

    # Check that creator is a non-empty list
    assert "creator" in result
    creators = result["creator"]
    assert isinstance(creators, list)
    assert creators

    # Check first creator
    first_creator = creators[0]
    assert first_creator["schemaKey"] == "schema:Person"
    assert "name" in first_creator

    # Check that license is properly formatted
    assert "license" in result
    assert isinstance(result["license"], list)
    assert "https://spdx.org/licenses/CC-BY-4.0" in result["license"]

    # Check that version is present
    assert "version" in result

    # Check that identifier is properly formatted
    assert "identifier" in result
    assert result["identifier"] == "https://identifiers.org/DANDI:000707"

    # Check that keywords is a non-empty list with the expected entries
    assert "keywords" in result
    keywords = result["keywords"]
    assert isinstance(keywords, list)
    assert keywords
    assert "neuroscience" in keywords
    assert "DANDI" in keywords
| 107 | + |
| 108 | + |
def test_google_dataset_metadata_preserves_original(sample_dandiset_metadata):
    """Test that calling the function leaves its input equal to a prior copy."""
    snapshot = copy.deepcopy(sample_dandiset_metadata)

    # The return value is irrelevant here; only the input is inspected.
    google_dataset_metadata(sample_dandiset_metadata)

    assert snapshot == sample_dandiset_metadata
| 116 | + |
| 117 | + |
def test_google_dataset_metadata_with_existing_creator(sample_dandiset_metadata):
    """Test that an existing ``creator`` entry is carried over unchanged."""
    expected_creator = [
        {
            "schemaKey": "Person",
            "name": "Jane Smith",
            "identifier": "https://orcid.org/0000-0002-3456-7890",
        },
    ]
    # Give the input its own copy, so the expectation cannot change through a
    # shared reference if the function mutates or aliases its input.
    sample_dandiset_metadata["creator"] = copy.deepcopy(expected_creator)

    result = google_dataset_metadata(sample_dandiset_metadata)

    # Check that the existing creator is preserved
    assert result["creator"] == expected_creator
| 133 | + |
| 134 | + |
def test_google_dataset_metadata_with_existing_keywords(sample_dandiset_metadata):
    """Test that supplied keywords survive and the extra ones are appended."""
    sample_dandiset_metadata["keywords"] = ["test", "example"]

    keywords = google_dataset_metadata(sample_dandiset_metadata)["keywords"]

    # Keywords supplied on the input must still be there.
    for supplied in ("test", "example"):
        assert supplied in keywords

    # Keywords expected in addition to the supplied ones.
    for added in ("neuroscience", "DANDI"):
        assert added in keywords
| 149 | + |
| 150 | + |
def test_google_dataset_metadata_with_no_license(sample_dandiset_metadata):
    """Test that a record without ``license`` yields a result without one."""
    # Work on a copy so the fixture object itself keeps its license entry.
    unlicensed = copy.deepcopy(sample_dandiset_metadata)
    unlicensed.pop("license")

    converted = google_dataset_metadata(unlicensed)

    assert "license" not in converted
| 161 | + |
| 162 | + |
def test_google_dataset_metadata_with_no_contributors(sample_dandiset_metadata):
    """Test that a record without ``contributor`` yields no ``creator``."""
    # Work on a copy so the fixture object itself keeps its contributors.
    anonymous = copy.deepcopy(sample_dandiset_metadata)
    anonymous.pop("contributor")

    converted = google_dataset_metadata(anonymous)

    assert "creator" not in converted
| 173 | + |
| 174 | + |
def test_google_dataset_metadata_with_date_published(sample_dandiset_metadata):
    """Test that a supplied ``datePublished`` value appears in the result."""
    published = "2023-01-01T00:00:00Z"
    sample_dandiset_metadata["datePublished"] = published

    converted = google_dataset_metadata(sample_dandiset_metadata)

    assert converted["datePublished"] == published
| 184 | + |
| 185 | + |
def test_google_dataset_metadata_with_date_created_fallback(sample_dandiset_metadata):
    """Test that ``dateCreated`` is used for ``datePublished`` when absent.

    The sample record has no ``datePublished`` key, so only ``dateCreated``
    is set here.
    """
    created = "2022-01-01T00:00:00Z"
    sample_dandiset_metadata["dateCreated"] = created

    converted = google_dataset_metadata(sample_dandiset_metadata)

    # The result's datePublished should take the dateCreated value.
    assert converted["datePublished"] == created
0 commit comments