11from __future__ import annotations
22
3+ import copy
34import re
45from typing import Any , Dict , Iterator , List , Union , cast , get_args , get_origin
5- import copy
66
77from jsonschema import Draft7Validator , Draft202012Validator
88from jsonschema .protocols import Validator as JsonschemaValidator
@@ -247,10 +247,10 @@ def validate_json(instance: Any, validator: JsonschemaValidator) -> None:
247247def google_dataset_metadata (metadata : Dict [str , Any ]) -> Dict [str , Any ]:
248248 """
249249 Transform DANDI metadata to be compatible with Google Dataset Search.
250-
250+
251251 This function takes a DANDI metadata JSON-LD document and transforms it to ensure
252252 it passes the Google Dataset Search validator by adding or modifying required fields.
253-
253+
254254 Required properties for Google Dataset Search:
255255 - @type: Dataset
256256 - name: The name of the dataset
@@ -260,20 +260,20 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
260260 - version: The version of the dataset
261261 - identifier: An identifier for the dataset (preferably a DOI)
262262 - keywords: Keywords describing the dataset
263-
263+
264264 Parameters
265265 ----------
266266 metadata : Dict[str, Any]
267267 The original DANDI metadata JSON-LD document
268-
268+
269269 Returns
270270 -------
271271 Dict[str, Any]
272272 The transformed metadata that is compatible with Google Dataset Search
273273 """
274274 # Make a deep copy to avoid modifying the original
275275 result = copy .deepcopy (metadata )
276-
276+
277277 # Append schema:Dataset to schemaKey
278278 if "schemaKey" in result :
279279 # If schemaKey is a string, convert it to a list
@@ -294,26 +294,31 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
294294 if "schema:creator" not in result and "contributor" in result :
295295 # Filter contributors with Author role
296296 authors = [
297- contrib for contrib in result ["contributor" ]
297+ contrib
298+ for contrib in result ["contributor" ]
298299 if contrib .get ("roleName" ) and "dcite:Author" in contrib .get ("roleName" , [])
299300 ]
300-
301+
301302 # If no authors found, use all contributors
302303 creators = authors if authors else result ["contributor" ]
303-
304+
304305 # Format creators according to schema.org requirements
305306 result ["schema:creator" ] = []
306307 for person in creators :
307308 # Create a new creator object with updated schemaKey
308309 creator = {
309- "schemaKey" : "schema:Organization" if person .get ("schemaKey" ) == "Organization" else "schema:Person" ,
310- "name" : person .get ("name" , "" )
310+ "schemaKey" : (
311+ "schema:Organization"
312+ if person .get ("schemaKey" ) == "Organization"
313+ else "schema:Person"
314+ ),
315+ "name" : person .get ("name" , "" ),
311316 }
312-
317+
313318 # Add identifier if available (ORCID for Person, ROR for Organization)
314319 if person .get ("identifier" ):
315320 creator ["identifier" ] = person ["identifier" ]
316-
321+
317322 result ["schema:creator" ].append (creator )
318323
319324 # Update contributor schemaKey and remove roleName
@@ -322,22 +327,22 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
322327 for contributor in result ["contributor" ]:
323328 # Make a copy of the contributor
324329 updated_contributor = copy .deepcopy (contributor )
325-
330+
326331 # Update schemaKey if it exists
327332 if "schemaKey" in updated_contributor :
328333 if updated_contributor ["schemaKey" ] == "Person" :
329334 updated_contributor ["schemaKey" ] = "schema:Person"
330335 elif updated_contributor ["schemaKey" ] == "Organization" :
331336 updated_contributor ["schemaKey" ] = "schema:Organization"
332-
337+
333338 # Remove roleName if it exists
334339 if "roleName" in updated_contributor :
335340 del updated_contributor ["roleName" ]
336-
341+
337342 updated_contributors .append (updated_contributor )
338-
343+
339344 result ["contributor" ] = updated_contributors
340-
345+
341346 # Ensure license is properly formatted for schema.org
342347 if "license" in result :
343348 # Transform DANDI license format to schema.org format
@@ -349,62 +354,62 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
349354 schema_licenses .append (f"https://spdx.org/licenses/{ license_id } " )
350355 else :
351356 schema_licenses .append (license_type )
352-
357+
353358 result ["license" ] = schema_licenses
354-
359+
355360 # Ensure version is present
356361 if "schemaVersion" in result and "version" not in result :
357362 result ["version" ] = result ["schemaVersion" ]
358-
363+
359364 # Ensure identifier is properly formatted (preferably as a DOI URL)
360365 if "identifier" in result and isinstance (result ["identifier" ], str ):
361366 # If it's a DOI in the format "DANDI:123456", convert to a URL
362367 if result ["identifier" ].startswith ("DANDI:" ):
363368 dandiset_id = result ["identifier" ].replace ("DANDI:" , "" )
364369 result ["identifier" ] = f"https://identifiers.org/DANDI:{ dandiset_id } "
365-
370+
366371 # Generate keywords based on available metadata
367372 keywords = []
368-
373+
369374 # Add data standard as keywords
370375 if "assetsSummary" in result and "dataStandard" in result ["assetsSummary" ]:
371376 for std in result ["assetsSummary" ]["dataStandard" ]:
372377 if "name" in std :
373378 keywords .append (std ["name" ])
374-
379+
375380 # Add species as keywords
376381 if "assetsSummary" in result and "species" in result ["assetsSummary" ]:
377382 for species in result ["assetsSummary" ]["species" ]:
378383 if "name" in species :
379384 keywords .append (species ["name" ])
380-
385+
381386 # Add approach as keywords
382387 if "assetsSummary" in result and "approach" in result ["assetsSummary" ]:
383388 for approach in result ["assetsSummary" ]["approach" ]:
384389 if "name" in approach :
385390 keywords .append (approach ["name" ])
386-
391+
387392 # Transform measurement technique into a list of strings and add as keywords
388393 if "assetsSummary" in result and "measurementTechnique" in result ["assetsSummary" ]:
389394 # Extract technique names for keywords
390395 for technique in result ["assetsSummary" ]["measurementTechnique" ]:
391396 if "name" in technique :
392397 keywords .append (technique ["name" ])
393-
398+
394399 # Transform the measurementTechnique to a list of strings (names only)
395400 technique_names = []
396401 for technique in result ["assetsSummary" ]["measurementTechnique" ]:
397402 if "name" in technique :
398403 technique_names .append (technique ["name" ])
399-
404+
400405 # Replace the original complex objects with just the names
401406 if technique_names :
402407 result ["assetsSummary" ]["measurementTechnique" ] = technique_names
403-
408+
404409 # Add "neuroscience" as a default keyword for DANDI
405410 keywords .append ("neuroscience" )
406411 keywords .append ("DANDI" )
407-
412+
408413 # Add keywords to result if we generated any
409414 if keywords :
410415 if "keywords" not in result or not result ["keywords" ]:
@@ -416,13 +421,13 @@ def google_dataset_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
416421 if keyword not in existing_keywords :
417422 existing_keywords .append (keyword )
418423 result ["keywords" ] = existing_keywords
419-
424+
420425 # Add datePublished if available
421426 if "datePublished" in result :
422427 # Ensure it's in the proper format
423428 result ["datePublished" ] = result ["datePublished" ]
424429 elif "dateCreated" in result :
425430 # Use dateCreated as a fallback
426431 result ["datePublished" ] = result ["dateCreated" ]
427-
432+
428433 return result
0 commit comments