diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py index 68a1da9..e0cb0dd 100644 --- a/caltechdata_api/caltechdata_write.py +++ b/caltechdata_api/caltechdata_write.py @@ -1,7 +1,7 @@ import copy import json -import os, requests - +import os +import requests import s3fs from requests import session from json.decoder import JSONDecodeError @@ -49,8 +49,6 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal infile = open(name, "rb") else: infile = open(f_list[name], "rb") - # size = infile.seek(0, 2) - # infile.seek(0, 0) # reset at beginning result = requests.put(link, headers=f_headers, data=infile) if result.status_code != 200: raise Exception(result.text) @@ -68,7 +66,7 @@ def write_files_rdm(files, file_link, headers, f_headers, s3=None, keepfiles=Fal def add_file_links( metadata, file_links, file_descriptions=[], additional_descriptions="", s3_link=None ): - # Currently configured for S3 links, assuming all are at same endpoint + # Currently configured for S3 links, assuming all are at the same endpoint link_string = "" endpoint = "https://" + file_links[0].split("/")[2] s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint}) diff --git a/caltechdata_api/cli.py b/caltechdata_api/cli.py index c9c3eb9..3222c09 100644 --- a/caltechdata_api/cli.py +++ b/caltechdata_api/cli.py @@ -59,7 +59,7 @@ def decrypt_token(encrypted_token, key): return f.decrypt(encrypted_token).decode() -# Function to get or set token +# Function to get or set token with support for test system def get_or_set_token(production=True): key = load_or_generate_key() @@ -411,6 +411,7 @@ def main(): def create_record(production): token = get_or_set_token(production) + # keep_file = input("Do you want to keep your existing files? 
(yes/no): ").lower() == "yes" print("Using CaltechDATA token:", token) while True: choice = get_user_input( @@ -521,13 +522,10 @@ def print_upload_message(rec_id, production): else "https://data.caltechlibrary.dev/uploads/" ) print( - f""" - You can view and publish this record at - + f"""You can view and publish this record at {base_url}{rec_id} - - If you need to upload large files to S3, you can type `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/` - """ + If you need to upload large files to S3, you can type + `s3cmd put DATA_FILE s3://ini230004-bucket01/{rec_id}/`""" ) @@ -552,7 +550,6 @@ def edit_record(production): print(f"An error occurred during metadata editing: {e}") else: print("No metadata file found.") - choice = get_user_input("Do you want to add files? (y/n): ").lower() if choice == "y": if production: @@ -571,19 +568,32 @@ def edit_record(production): url = API_URL_TEMPLATE.format(record_id=record_id) url_draft = API_URL_TEMPLATE_DRAFT.format(record_id=record_id) - response = requests.get(url) - response_draft = requests.get(url_draft) + headers = { + "accept": "application/json", + } - filepath, file_link = upload_supporting_file(record_id) - print(file_link) + if token: + headers["Authorization"] = "Bearer %s" % token - if response.status_code == 404 and response_draft.status_code == 404: + response = requests.get(url, headers=headers) + response_draft = requests.get(url_draft, headers=headers) + data = response.json() + data_draft = response_draft.json() + # Check if 'entries' exists and its length + if ( + len(data.get("entries", [])) == 0 + and len(data_draft.get("entries", [])) == 0 + ): keepfile = False else: keepfile = ( input("Do you want to keep existing files? 
def _invalid_entry_errors(entries, required_keys, message):
    """Return ``message`` once per entry that is not a dict holding all ``required_keys``."""
    return [
        message
        for entry in entries
        if not isinstance(entry, dict)
        or any(key not in entry for key in required_keys)
    ]


def _optional_list_errors(json_record, field, required_keys, message):
    """Check an optional field that, when present, must be a list of dicts."""
    if field not in json_record:
        return []
    entries = json_record[field]
    if not isinstance(entries, list):
        return [f"'{field}' should be a list."]
    return _invalid_entry_errors(entries, required_keys, message)


def _required_list_errors(json_record, field, required_keys, message):
    """Check a mandatory field that must be a non-empty list of dicts."""
    if field not in json_record:
        return [f"'{field}' field is missing."]
    entries = json_record[field]
    if not isinstance(entries, list) or len(entries) == 0:
        return [f"'{field}' should be a non-empty list."]
    return _invalid_entry_errors(entries, required_keys, message)


def validate_metadata(json_record):
    """
    Validate the presence and structure of required fields in a metadata record.

    Every problem found is collected, so a single call reports all of them
    at once instead of stopping at the first one.

    Required fields: 'types' (a dictionary holding 'resourceTypeGeneral' and
    a string 'resourceType'), 'titles' and 'creators' (non-empty lists of
    dictionaries), and at least one of 'publicationYear' or 'dates'.

    Optional fields, checked only when present: 'dates', 'contributors',
    'identifiers', 'subjects', 'relatedIdentifiers', 'rightsList',
    'geoLocations' and 'fundingReferences'. Each must be a list of
    dictionaries carrying the keys named in the corresponding message.

    Args:
        json_record: Mapping holding the metadata record to check.

    Returns:
        None when the record passes every check.

    Raises:
        ValueError: If any check fails. The message lists every failure,
            separated by ", ".
    """
    errors = []

    # 'types' must be a dictionary that names the general resource type.
    types = json_record.get("types")
    if "types" not in json_record:
        errors.append("'types' field is missing.")
    elif not isinstance(types, dict):
        errors.append("'types' field should be a dictionary.")
    elif "resourceTypeGeneral" not in types:
        errors.append("'resourceTypeGeneral' field is missing in 'types'.")

    errors += _required_list_errors(
        json_record,
        "titles",
        ("title",),
        "Each entry in 'titles' must be a dictionary with a 'title' key.",
    )

    # Either 'publicationYear' or 'dates' satisfies the publication date.
    if "publicationYear" not in json_record and "dates" not in json_record:
        errors.append(
            "A publication date is required ('publicationYear' or 'dates' field is missing)."
        )
    errors += _optional_list_errors(
        json_record,
        "dates",
        ("dateType", "date"),
        "Each entry in 'dates' must be a dictionary with 'dateType' and 'date' keys.",
    )

    errors += _required_list_errors(
        json_record,
        "creators",
        ("name",),
        "Each creator in 'creators' must be a dictionary with a 'name' key.",
    )

    errors += _optional_list_errors(
        json_record,
        "contributors",
        ("name",),
        "Each contributor must be a dictionary with a 'name' key.",
    )

    # Only inspect 'resourceType' when 'types' is usable. A missing or
    # malformed 'types' has already been reported above, and indexing it
    # here would raise KeyError/TypeError instead of the aggregated error.
    if isinstance(types, dict):
        if "resourceType" not in types:
            errors.append("'resourceType' field is missing in 'types'.")
        elif not isinstance(types["resourceType"], str):
            errors.append("'resourceType' should be a string.")

    errors += _optional_list_errors(
        json_record,
        "identifiers",
        ("identifier", "identifierType"),
        "Each identifier must be a dictionary with 'identifier' and 'identifierType' keys.",
    )

    errors += _optional_list_errors(
        json_record,
        "subjects",
        ("subject",),
        "Each subject must be a dictionary with a 'subject' key.",
    )

    errors += _optional_list_errors(
        json_record,
        "relatedIdentifiers",
        ("relatedIdentifier",),
        "Each relatedIdentifier must be a dictionary with a 'relatedIdentifier' key.",
    )

    errors += _optional_list_errors(
        json_record,
        "rightsList",
        ("rights",),
        "Each entry in 'rightsList' must be a dictionary with a 'rights' key.",
    )

    # 'geoLocations' entries need at least one of three alternative keys,
    # so they cannot go through the "all keys required" helper.
    if "geoLocations" in json_record:
        if not isinstance(json_record["geoLocations"], list):
            errors.append("'geoLocations' should be a list.")
        else:
            geo_keys = ("geoLocationPoint", "geoLocationBox", "geoLocationPlace")
            for location in json_record["geoLocations"]:
                if not isinstance(location, dict):
                    errors.append("Each entry in 'geoLocations' must be a dictionary.")
                elif not any(key in location for key in geo_keys):
                    errors.append(
                        "Each geoLocation entry must contain at least one of 'geoLocationPoint', 'geoLocationBox', or 'geoLocationPlace'."
                    )

    # 'fundingReferences' reports the wrong type and the missing key with
    # different messages. The key test runs only on dictionaries, because
    # `in` on an int or None would raise TypeError.
    if "fundingReferences" in json_record:
        if not isinstance(json_record["fundingReferences"], list):
            errors.append("'fundingReferences' should be a list.")
        else:
            for funding in json_record["fundingReferences"]:
                if not isinstance(funding, dict):
                    errors.append("Each funding reference must be a dictionary.")
                elif "funderName" not in funding:
                    errors.append("Each funding reference must contain 'funderName'.")

    if errors:
        raise ValueError(f"Validation errors in metadata: {', '.join(errors)}")