@@ -134,8 +134,9 @@ def rdm_creators_contributors(person_list, peopleroles):
134134
135135def customize_schema_rdm (json_record ):
136136 # Get vocabularies used in InvenioRDM
137- vocabularies = get_vocabularies ()
138137
138+ vocabularies = get_vocabularies ()
139+ validate_metadata (json_record )
139140 peopleroles = vocabularies ["crr" ]
140141 resourcetypes = vocabularies ["rsrct" ]
141142 descriptiontypes = vocabularies ["dty" ]
@@ -386,6 +387,169 @@ def customize_schema_rdm(json_record):
386387 return final
387388
388389
def _check_optional_entries(json_record, field, required_keys, entry_error, errors):
    """Check an optional field that must be a list of dictionaries.

    Nothing is reported when ``field`` is absent from ``json_record``.
    Otherwise ``"'<field>' should be a list."`` is appended to ``errors`` when
    the value is not a list, and ``entry_error`` is appended once for every
    entry that is not a dictionary or that lacks any of ``required_keys``.
    """
    if field not in json_record:
        return
    entries = json_record[field]
    if not isinstance(entries, list):
        errors.append(f"'{field}' should be a list.")
        return
    for entry in entries:
        if not isinstance(entry, dict) or any(
            key not in entry for key in required_keys
        ):
            errors.append(entry_error)


def validate_metadata(json_record):
    """Validate the required fields and structure of a CaltechDATA JSON record.

    Every problem found is collected so that a single exception reports all
    of them at once, in the order the fields are checked.

    Args:
        json_record: Mapping holding the record metadata. ``types``,
            ``titles``, ``creators`` and at least one of ``publicationYear``
            or ``dates`` are required; ``contributors``, ``identifiers``,
            ``subjects``, ``relatedIdentifiers``, ``rightsList``,
            ``geoLocations`` and ``fundingReferences`` are only checked when
            present.

    Returns:
        None when the record passes every check.

    Raises:
        ValueError: If one or more checks fail. The message lists every
            failure, separated by ``", "``.
    """
    errors = []

    # 'types' must be a dictionary carrying 'resourceTypeGeneral'.
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(json_record["types"], dict):
        errors.append("'types' field should be a dictionary.")
    elif "resourceTypeGeneral" not in json_record["types"]:
        errors.append("'resourceTypeGeneral' field is missing in 'types'.")

    # 'titles' must be a non-empty list of dictionaries with a 'title' key.
    if "titles" not in json_record:
        errors.append("'titles' field is missing.")
    elif not isinstance(json_record["titles"], list) or not json_record["titles"]:
        errors.append("'titles' should be a non-empty list.")
    else:
        for title in json_record["titles"]:
            if not isinstance(title, dict) or "title" not in title:
                errors.append(
                    "Each entry in 'titles' must be a dictionary with a 'title' key."
                )

    # A publication date may come from either 'publicationYear' or 'dates'.
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    _check_optional_entries(
        json_record,
        "dates",
        ("dateType", "date"),
        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.",
        errors,
    )

    # 'creators' must be a non-empty list of dictionaries with a 'name' key.
    if "creators" not in json_record:
        errors.append("'creators' field is missing.")
    elif not isinstance(json_record["creators"], list) or not json_record["creators"]:
        errors.append("'creators' should be a non-empty list.")
    else:
        for creator in json_record["creators"]:
            if not isinstance(creator, dict) or "name" not in creator:
                errors.append(
                    "Each creator in 'creators' must be a dictionary with a 'name' key."
                )

    _check_optional_entries(
        json_record,
        "contributors",
        ("name",),
        "Each contributor must be a dictionary with a 'name' key.",
        errors,
    )

    # 'resourceType' lives inside 'types'. Only look for it when 'types' is a
    # dictionary: a missing or malformed 'types' has already been reported
    # above, and indexing it here would raise KeyError/TypeError and lose the
    # errors collected so far.
    types = json_record.get("types") if "types" in json_record else None
    if isinstance(types, dict):
        if "resourceType" not in types:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(types["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    _check_optional_entries(
        json_record,
        "identifiers",
        ("identifier", "identifierType"),
        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.",
        errors,
    )
    _check_optional_entries(
        json_record,
        "subjects",
        ("subject",),
        "Each subject must be a dictionary with a 'subject' key.",
        errors,
    )
    _check_optional_entries(
        json_record,
        "relatedIdentifiers",
        ("relatedIdentifier",),
        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.",
        errors,
    )
    _check_optional_entries(
        json_record,
        "rightsList",
        ("rights",),
        "Each entry in 'rightsList' must be a dictionary with a 'rights' key.",
        errors,
    )

    # Each geoLocation needs at least one of the three location forms.
    if "geoLocations" in json_record:
        if not isinstance(json_record["geoLocations"], list):
            errors.append("'geoLocations' should be a list.")
        else:
            location_keys = ("geoLocationPoint", "geoLocationBox", "geoLocationPlace")
            for location in json_record["geoLocations"]:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif not any(key in location for key in location_keys):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # Funding references report a wrong type and a missing 'funderName'
    # separately; the key test only makes sense for dictionaries.
    if "fundingReferences" in json_record:
        if not isinstance(json_record["fundingReferences"], list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in json_record["fundingReferences"]:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                elif "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    # Report every collected problem in one exception.
    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")
551+
552+
389553if __name__ == "__main__" :
390554 # Read in from file for demo purposes
391555
0 commit comments