Merge remote-tracking branch 'origin/enh/googledataset' into enh/googledataset

satra · satra · commit 95f12d982d3c · 2025-03-13T00:32:04.000-04:00
* origin/enh/googledataset:
  [pre-commit.ci] auto fixes from pre-commit.com hooks
diff --git a/dandischema/tests/test_google_dataset_metadata.py b/dandischema/tests/test_google_dataset_metadata.py
@@ -1,5 +1,5 @@
 import copy
-from typing import Dict, Any
+from typing import Any, Dict
 
 import pytest
 
@@ -22,15 +22,15 @@ def sample_dandiset_metadata() -> Dict[str, Any]:
                 "roleName": ["dcite:Author", "dcite:ContactPerson"],
                 "identifier": "0000-0001-2345-6789",
                 "email": "john.doe@example.com",
-                "includeInCitation": True
+                "includeInCitation": True,
             },
             {
                 "schemaKey": "Organization",
                 "name": "Test Organization",
                 "roleName": ["dcite:Sponsor"],
                 "identifier": "https://ror.org/xxxxxxxxx",
-                "includeInCitation": False
-            }
+                "includeInCitation": False,
+            },
         ],
         "license": ["spdx:CC-BY-4.0"],
         "schemaVersion": "0.6.4",
@@ -41,38 +41,38 @@ def sample_dandiset_metadata() -> Dict[str, Any]:
             "dataStandard": [
                 {
                     "name": "Neurodata Without Borders (NWB)",
-                    "identifier": "RRID:SCR_015242"
+                    "identifier": "RRID:SCR_015242",
                 }
             ],
             "species": [
                 {
                     "name": "Homo sapiens",
-                    "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606"
+                    "identifier": "http://purl.obolibrary.org/obo/NCBITaxon_9606",
                 }
             ],
             "approach": [
                 {
                     "name": "electrophysiology",
-                    "identifier": "http://uri.interlex.org/base/ilx_0739363"
+                    "identifier": "http://uri.interlex.org/base/ilx_0739363",
                 }
             ],
             "measurementTechnique": [
                 {
                     "name": "multi-electrode extracellular electrophysiology",
-                    "identifier": "http://uri.interlex.org/base/ilx_0739400"
+                    "identifier": "http://uri.interlex.org/base/ilx_0739400",
                 }
-            ]
-        }
+            ],
+        },
     }
 
 
 def test_google_dataset_metadata_basic_transformation(sample_dandiset_metadata):
     """Test that the basic transformation works correctly"""
     result = google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Check that the original metadata is not modified
     assert sample_dandiset_metadata != result
-    
+
     # Check that schema:Dataset is added to schemaKey
     assert "schema:Dataset" in result["schemaKey"]
     
@@ -85,19 +85,19 @@ def test_google_dataset_metadata_basic_transformation(sample_dandiset_metadata):
     creator = result["schema:creator"][0]
     assert creator["schemaKey"] == "schema:Person"
     assert "name" in creator
-    
+
     # Check that license is properly formatted
     assert "license" in result
     assert isinstance(result["license"], list)
     assert "https://spdx.org/licenses/CC-BY-4.0" in result["license"]
-    
+
     # Check that version is present
     assert "version" in result
-    
+
     # Check that identifier is properly formatted
     assert "identifier" in result
     assert result["identifier"] == "https://identifiers.org/DANDI:000707"
-    
+
     # Check that keywords exist
     assert "keywords" in result
     assert isinstance(result["keywords"], list)
@@ -110,7 +110,7 @@ def test_google_dataset_metadata_preserves_original(sample_dandiset_metadata):
     """Test that the original metadata is not modified"""
     original = copy.deepcopy(sample_dandiset_metadata)
     google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Verify the original is unchanged
     assert original == sample_dandiset_metadata
 
@@ -122,12 +122,12 @@ def test_google_dataset_metadata_with_existing_creator(sample_dandiset_metadata)
         {
             "schemaKey": "Person",
             "name": "Jane Smith",
-            "identifier": "https://orcid.org/0000-0002-3456-7890"
+            "identifier": "https://orcid.org/0000-0002-3456-7890",
         }
     ]
-    
+
     result = google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Check that the existing creator is preserved
     assert result["creator"] == sample_dandiset_metadata["creator"]
 
@@ -136,13 +136,13 @@ def test_google_dataset_metadata_with_existing_keywords(sample_dandiset_metadata
     """Test that existing keywords are preserved and extended"""
     # Add keywords field
     sample_dandiset_metadata["keywords"] = ["test", "example"]
-    
+
     result = google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Check that the existing keywords are preserved
     assert "test" in result["keywords"]
     assert "example" in result["keywords"]
-    
+
     # Check that additional keywords are added
     assert "neuroscience" in result["keywords"]
     assert "DANDI" in result["keywords"]
@@ -153,9 +153,9 @@ def test_google_dataset_metadata_with_no_license(sample_dandiset_metadata):
     # Remove license field
     no_license_metadata = copy.deepcopy(sample_dandiset_metadata)
     del no_license_metadata["license"]
-    
+
     result = google_dataset_metadata(no_license_metadata)
-    
+
     # Check that license is not in the result
     assert "license" not in result
 
@@ -165,7 +165,7 @@ def test_google_dataset_metadata_with_no_contributors(sample_dandiset_metadata):
     # Remove contributor field
     no_contributor_metadata = copy.deepcopy(sample_dandiset_metadata)
     del no_contributor_metadata["contributor"]
-    
+
     result = google_dataset_metadata(no_contributor_metadata)
     
     # Check that schema:creator is not in the result
@@ -176,9 +176,9 @@ def test_google_dataset_metadata_with_date_published(sample_dandiset_metadata):
     """Test handling of datePublished field"""
     # Add datePublished field
     sample_dandiset_metadata["datePublished"] = "2023-01-01T00:00:00Z"
-    
+
     result = google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Check that datePublished is preserved
     assert result["datePublished"] == "2023-01-01T00:00:00Z"
 
@@ -187,8 +187,8 @@ def test_google_dataset_metadata_with_date_created_fallback(sample_dandiset_meta
     """Test fallback to dateCreated when datePublished is not present"""
     # Add dateCreated field
     sample_dandiset_metadata["dateCreated"] = "2022-01-01T00:00:00Z"
-    
+
     result = google_dataset_metadata(sample_dandiset_metadata)
-    
+
     # Check that datePublished is set to dateCreated
     assert result["datePublished"] == "2022-01-01T00:00:00Z"
diff --git a/dandischema/utils.py b/dandischema/utils.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
+import copy
 import re
 from typing import Any, Dict, Iterator, List, Union, cast, get_args, get_origin
-import copy
 
 from jsonschema import Draft7Validator, Draft202012Validator
 from jsonschema.protocols import Validator as JsonschemaValidator
@@ -247,10 +247,10 @@ def validate_json(instance: Any, validator: JsonschemaValidator) -> None:
 def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
     """
     Transform DANDI metadata to be compatible with Google Dataset Search.
-    
+
     This function takes a DANDI metadata JSON-LD document and transforms it to ensure
     it passes the Google Dataset Search validator by adding or modifying required fields.
-    
+
     Required properties for Google Dataset Search:
     - @type: Dataset
     - name: The name of the dataset
@@ -260,20 +260,20 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
     - version: The version of the dataset
     - identifier: An identifier for the dataset (preferably a DOI)
     - keywords: Keywords describing the dataset
-    
+
     Parameters
     ----------
     metadata : Dict[str, Any]
         The original DANDI metadata JSON-LD document
-        
+
     Returns
     -------
     Dict[str, Any]
         The transformed metadata that is compatible with Google Dataset Search
     """
     # Make a deep copy to avoid modifying the original
     result = copy.deepcopy(metadata)
-    
+
     # Append schema:Dataset to schemaKey
     if "schemaKey" in result:
         # If schemaKey is a string, convert it to a list
@@ -294,26 +294,31 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
     if "schema:creator" not in result and "contributor" in result:
         # Filter contributors with Author role
         authors = [
-            contrib for contrib in result["contributor"] 
+            contrib
+            for contrib in result["contributor"]
             if contrib.get("roleName") and "dcite:Author" in contrib.get("roleName", [])
         ]
-        
+
         # If no authors found, use all contributors
         creators = authors if authors else result["contributor"]
-        
+
         # Format creators according to schema.org requirements
         result["schema:creator"] = []
         for person in creators:
             # Create a new creator object with updated schemaKey
             creator = {
-                "schemaKey": "schema:Organization" if person.get("schemaKey") == "Organization" else "schema:Person",
-                "name": person.get("name", "")
+                "schemaKey": (
+                    "schema:Organization"
+                    if person.get("schemaKey") == "Organization"
+                    else "schema:Person"
+                ),
+                "name": person.get("name", ""),
             }
-            
+
             # Add identifier if available (ORCID for Person, ROR for Organization)
             if person.get("identifier"):
                 creator["identifier"] = person["identifier"]
-            
+
             result["schema:creator"].append(creator)
     
     # Update contributor schemaKey and remove roleName
@@ -322,22 +327,22 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
         for contributor in result["contributor"]:
             # Make a copy of the contributor
             updated_contributor = copy.deepcopy(contributor)
-            
+
             # Update schemaKey if it exists
             if "schemaKey" in updated_contributor:
                 if updated_contributor["schemaKey"] == "Person":
                     updated_contributor["schemaKey"] = "schema:Person"
                 elif updated_contributor["schemaKey"] == "Organization":
                     updated_contributor["schemaKey"] = "schema:Organization"
-            
+
             # Remove roleName if it exists
             if "roleName" in updated_contributor:
                 del updated_contributor["roleName"]
-            
+
             updated_contributors.append(updated_contributor)
-        
+
         result["contributor"] = updated_contributors
-    
+
     # Ensure license is properly formatted for schema.org
     if "license" in result:
         # Transform DANDI license format to schema.org format
@@ -349,62 +354,62 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
                 schema_licenses.append(f"https://spdx.org/licenses/{license_id}")
             else:
                 schema_licenses.append(license_type)
-        
+
         result["license"] = schema_licenses
-    
+
     # Ensure version is present
     if "schemaVersion" in result and "version" not in result:
         result["version"] = result["schemaVersion"]
-    
+
     # Ensure identifier is properly formatted (preferably as a DOI URL)
     if "identifier" in result and isinstance(result["identifier"], str):
         # If it's a DOI in the format "DANDI:123456", convert to a URL
         if result["identifier"].startswith("DANDI:"):
             dandiset_id = result["identifier"].replace("DANDI:", "")
             result["identifier"] = f"https://identifiers.org/DANDI:{dandiset_id}"
-    
+
     # Generate keywords based on available metadata
     keywords = []
-    
+
     # Add data standard as keywords
     if "assetsSummary" in result and "dataStandard" in result["assetsSummary"]:
         for std in result["assetsSummary"]["dataStandard"]:
             if "name" in std:
                 keywords.append(std["name"])
-    
+
     # Add species as keywords
     if "assetsSummary" in result and "species" in result["assetsSummary"]:
         for species in result["assetsSummary"]["species"]:
             if "name" in species:
                 keywords.append(species["name"])
-    
+
     # Add approach as keywords
     if "assetsSummary" in result and "approach" in result["assetsSummary"]:
         for approach in result["assetsSummary"]["approach"]:
             if "name" in approach:
                 keywords.append(approach["name"])
-    
+
     # Transform measurement technique into a list of strings and add as keywords
     if "assetsSummary" in result and "measurementTechnique" in result["assetsSummary"]:
         # Extract technique names for keywords
         for technique in result["assetsSummary"]["measurementTechnique"]:
             if "name" in technique:
                 keywords.append(technique["name"])
-        
+
         # Transform the measurementTechnique to a list of strings (names only)
         technique_names = []
         for technique in result["assetsSummary"]["measurementTechnique"]:
             if "name" in technique:
                 technique_names.append(technique["name"])
-        
+
         # Replace the original complex objects with just the names
         if technique_names:
             result["assetsSummary"]["measurementTechnique"] = technique_names
-    
+
     # Add "neuroscience" as a default keyword for DANDI
     keywords.append("neuroscience")
     keywords.append("DANDI")
-    
+
     # Add keywords to result if we generated any
     if keywords:
         if "keywords" not in result or not result["keywords"]:
@@ -416,13 +421,13 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
                 if keyword not in existing_keywords:
                     existing_keywords.append(keyword)
             result["keywords"] = existing_keywords
-    
+
     # Add datePublished if available
     if "datePublished" in result:
         # Ensure it's in the proper format
         result["datePublished"] = result["datePublished"]
     elif "dateCreated" in result:
         # Use dateCreated as a fallback
         result["datePublished"] = result["dateCreated"]
-    
+
     return result