"""Tests for transforming DANDI metadata for Google Dataset Search."""

import copy
from typing import Any, Dict

import pytest

from dandischema.utils import google_dataset_metadata


@pytest.fixture
def sample_dandiset_metadata() -> Dict[str, Any]:
    """A representative DANDI metadata document shared by every test."""
    return {
        "@context": "https://raw.githubusercontent.com/dandi/schema/master/releases/0.6.4/context.json",  # noqa: E501
        "schemaKey": "Dandiset",
        "identifier": "DANDI:000707",
        "name": "Test Dandiset",
        "description": "A test dandiset for testing Google Dataset Search compatibility",
        "contributor": [
            {
                "schemaKey": "Person",
                "name": "Doe, John",
                "roleName": ["dcite:Author", "dcite:ContactPerson"],
                "identifier": "0000-0001-2345-6789",
                "email": "john.doe@example.com",
                "includeInCitation": True,
            },
            {
                "schemaKey": "Organization",
                "name": "Test Organization",
                "roleName": ["dcite:Sponsor"],
                "identifier": "https://ror.org/xxxxxxxxx",
                "includeInCitation": False,
            },
        ],
        "license": ["spdx:CC-BY-4.0"],
        "schemaVersion": "0.6.4",
        "assetsSummary": {
            "schemaKey": "AssetsSummary",
            "numberOfBytes": 1000000,
            "numberOfFiles": 10,
            "dataStandard": [
                {
                    "name": "Neurodata Without Borders (NWB)",
                    "identifier": "RRID:SCR_015242",
                }
            ],
            "species": [
                {
                    "name": "Homo sapiens",
                    "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
                }
            ],
            "approach": [
                {
                    "name": "electrophysiology",
                    "identifier": "http://uri.interlex.org/base/ilx_0739363",
                }
            ],
            "measurementTechnique": [
                {
                    "name": "multi-electrode extracellular electrophysiology",
                    "identifier": "http://uri.interlex.org/base/ilx_0739400",
                }
            ],
        },
    }


def test_google_dataset_metadata_basic_transformation(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """The transformation produces every field Google Dataset Search needs."""
    transformed = google_dataset_metadata(sample_dandiset_metadata)

    # A new, modified document is returned rather than the input itself.
    assert transformed != sample_dandiset_metadata

    # schema:Dataset is advertised via schemaKey.
    assert "schema:Dataset" in transformed["schemaKey"]

    # schema:creator is a non-empty list; its first entry is a schema:Person
    # with a human-readable name.
    creators = transformed["schema:creator"]
    assert isinstance(creators, list)
    assert creators
    first_creator = creators[0]
    assert first_creator["schemaKey"] == "schema:Person"
    assert "name" in first_creator

    # The SPDX shorthand license is rewritten to a full spdx.org URL.
    licenses = transformed["license"]
    assert isinstance(licenses, list)
    assert "https://spdx.org/licenses/CC-BY-4.0" in licenses

    # A version is populated.
    assert "version" in transformed

    # The DANDI identifier becomes a resolvable identifiers.org URL.
    assert transformed["identifier"] == "https://identifiers.org/DANDI:000707"

    # Keywords are generated and include the DANDI defaults.
    keywords = transformed["keywords"]
    assert isinstance(keywords, list)
    assert keywords
    assert {"neuroscience", "DANDI"} <= set(keywords)


def test_google_dataset_metadata_preserves_original(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """Transforming must not mutate the input document."""
    snapshot = copy.deepcopy(sample_dandiset_metadata)
    google_dataset_metadata(sample_dandiset_metadata)
    assert sample_dandiset_metadata == snapshot


def test_google_dataset_metadata_with_existing_creator(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """A pre-existing ``creator`` field is carried through untouched."""
    existing_creator = [
        {
            "schemaKey": "Person",
            "name": "Jane Smith",
            "identifier": "https://orcid.org/0000-0002-3456-7890",
        }
    ]
    sample_dandiset_metadata["creator"] = existing_creator

    transformed = google_dataset_metadata(sample_dandiset_metadata)

    assert transformed["creator"] == sample_dandiset_metadata["creator"]


def test_google_dataset_metadata_with_existing_keywords(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """User-supplied keywords are kept and extended with generated ones."""
    sample_dandiset_metadata["keywords"] = ["test", "example"]

    transformed = google_dataset_metadata(sample_dandiset_metadata)

    # Original keywords survive, and the generated defaults are appended.
    assert {"test", "example", "neuroscience", "DANDI"} <= set(
        transformed["keywords"]
    )


def test_google_dataset_metadata_with_no_license(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """When the input has no license, the output omits the field too."""
    metadata = copy.deepcopy(sample_dandiset_metadata)
    del metadata["license"]

    transformed = google_dataset_metadata(metadata)

    assert "license" not in transformed


def test_google_dataset_metadata_with_no_contributors(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """Without contributors no schema:creator is synthesized."""
    metadata = copy.deepcopy(sample_dandiset_metadata)
    del metadata["contributor"]

    transformed = google_dataset_metadata(metadata)

    assert "schema:creator" not in transformed


def test_google_dataset_metadata_with_date_published(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """An explicit datePublished value survives unchanged."""
    sample_dandiset_metadata["datePublished"] = "2023-01-01T00:00:00Z"

    transformed = google_dataset_metadata(sample_dandiset_metadata)

    assert transformed["datePublished"] == "2023-01-01T00:00:00Z"


def test_google_dataset_metadata_with_date_created_fallback(
    sample_dandiset_metadata: Dict[str, Any]
) -> None:
    """dateCreated backfills datePublished when the latter is absent."""
    sample_dandiset_metadata["dateCreated"] = "2022-01-01T00:00:00Z"

    transformed = google_dataset_metadata(sample_dandiset_metadata)

    assert transformed["datePublished"] == "2022-01-01T00:00:00Z"
def _schema_org_agent(person: Dict[str, Any]) -> Dict[str, Any]:
    """Build a schema.org creator entry (Person/Organization) from a contributor."""
    agent: Dict[str, Any] = {
        "schemaKey": (
            "schema:Organization"
            if person.get("schemaKey") == "Organization"
            else "schema:Person"
        ),
        "name": person.get("name", ""),
    }
    # Keep the identifier when present (ORCID for Person, ROR for Organization).
    if person.get("identifier"):
        agent["identifier"] = person["identifier"]
    return agent


def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Transform DANDI metadata to be compatible with Google Dataset Search.

    This function takes a DANDI metadata JSON-LD document and transforms it to
    ensure it passes the Google Dataset Search validator by adding or modifying
    required fields.

    Required properties for Google Dataset Search:
    - @type: Dataset
    - name: The name of the dataset
    - description: A description of the dataset
    - creator: The creator(s) of the dataset (with @type and name)
    - license: The license of the dataset
    - version: The version of the dataset
    - identifier: An identifier for the dataset (preferably a DOI)
    - keywords: Keywords describing the dataset

    Parameters
    ----------
    metadata : Dict[str, Any]
        The original DANDI metadata JSON-LD document

    Returns
    -------
    Dict[str, Any]
        The transformed metadata that is compatible with Google Dataset Search
    """
    # Work on a deep copy so the caller's document is never mutated.
    result = copy.deepcopy(metadata)

    # Advertise schema:Dataset via schemaKey, preserving any existing value.
    schema_key = result.get("schemaKey")
    if isinstance(schema_key, str):
        result["schemaKey"] = [schema_key, "schema:Dataset"]
    elif isinstance(schema_key, list):
        if "schema:Dataset" not in schema_key:
            schema_key.append("schema:Dataset")
    else:
        # Missing or unusable schemaKey: replace it outright.
        result["schemaKey"] = ["schema:Dataset"]

    # Synthesize schema:creator from contributors unless one already exists.
    if "schema:creator" not in result and "contributor" in result:
        contributors = result["contributor"]
        # Prefer contributors with an explicit Author role; fall back to all.
        authors = [
            contrib
            for contrib in contributors
            if "dcite:Author" in (contrib.get("roleName") or [])
        ]
        result["schema:creator"] = [
            _schema_org_agent(person) for person in (authors or contributors)
        ]

    # Normalize contributor entries: schema.org-prefixed schemaKey, no roleName
    # (roleName is a DANDI-specific property Google's validator rejects).
    if "contributor" in result:
        normalized = []
        for contributor in result["contributor"]:
            entry = copy.deepcopy(contributor)
            if entry.get("schemaKey") == "Person":
                entry["schemaKey"] = "schema:Person"
            elif entry.get("schemaKey") == "Organization":
                entry["schemaKey"] = "schema:Organization"
            entry.pop("roleName", None)
            normalized.append(entry)
        result["contributor"] = normalized

    # Rewrite SPDX shorthand licenses ("spdx:ID") to full spdx.org URLs.
    if "license" in result:
        result["license"] = [
            f"https://spdx.org/licenses/{lic.removeprefix('spdx:')}"
            if isinstance(lic, str) and lic.startswith("spdx:")
            else lic
            for lic in result["license"]
        ]

    # Ensure a version is present, falling back to the schema version.
    if "schemaVersion" in result and "version" not in result:
        result["version"] = result["schemaVersion"]

    # Turn a "DANDI:NNNNNN" identifier into a resolvable identifiers.org URL.
    identifier = result.get("identifier")
    if isinstance(identifier, str) and identifier.startswith("DANDI:"):
        result["identifier"] = f"https://identifiers.org/{identifier}"

    # Collect keywords from the assets summary (one pass per summary field,
    # rather than the field-by-field duplication of the original draft).
    keywords: List[str] = []
    summary = result.get("assetsSummary") or {}
    for field in ("dataStandard", "species", "approach", "measurementTechnique"):
        for entry in summary.get(field, []):
            if "name" in entry:
                keywords.append(entry["name"])

    # Flatten measurementTechnique objects to plain name strings for schema.org.
    technique_names = [
        technique["name"]
        for technique in summary.get("measurementTechnique", [])
        if "name" in technique
    ]
    if technique_names:
        summary["measurementTechnique"] = technique_names

    # Always include the DANDI defaults, so keywords is never empty.
    keywords.extend(["neuroscience", "DANDI"])

    existing_keywords = result.get("keywords")
    if not existing_keywords:
        result["keywords"] = keywords
    else:
        # Extend user-supplied keywords in place, skipping duplicates.
        for keyword in keywords:
            if keyword not in existing_keywords:
                existing_keywords.append(keyword)

    # Backfill datePublished from dateCreated when it is missing.
    # (The original draft also re-assigned an existing datePublished to
    # itself, which was a no-op and has been removed.)
    if "datePublished" not in result and "dateCreated" in result:
        result["datePublished"] = result["dateCreated"]

    return result