Fix handling of OSN links

tmorrell · tmorrell · commit 9a58da80c965 · 2023-04-06T13:22:49.000-06:00
diff --git a/caltechdata_api/caltechdata_edit.py b/caltechdata_api/caltechdata_edit.py
@@ -72,8 +72,27 @@ def caltechdata_edit(
     if isinstance(files, str) == True:
         files = [files]
 
+    # Check if file links were provided in the metadata
+    descriptions = []
+    for d in metadata["descriptions"]:
+        if d["description"].startswith("Files available via S3"):
+            ex_file_links = []
+            file_text = d["description"]
+            file_list = file_text.split('href="')
+            # Loop over links in description, skip header text
+            for file in file_list[1:]:
+                ex_file_links.append(file.split('"\n')[0])
+        else:
+            descriptions.append(d)
+    # We remove file link descriptions, and re-add below
+    metadata["descriptions"] = descriptions
+
+    # If user has provided file links as a cli option, we add those
     if file_links:
         metadata = add_file_links(metadata, file_links)
+    # Otherwise we add file links found in the metadata file
+    elif ex_file_links:
+        metadata = add_file_links(metadata, ex_file_links)
 
     if production == True:
         url = "https://data.caltech.edu"
@@ -101,7 +120,7 @@ def caltechdata_edit(
             headers=headers,
         )
         if existing.status_code != 200:
-            raise Exception(existing.text)
+            raise Exception(f"Record {idv} does not exist, cannot edit")
 
     status = existing.json()["status"]
 
diff --git a/caltechdata_api/caltechdata_write.py b/caltechdata_api/caltechdata_write.py
@@ -63,11 +63,8 @@ def add_file_links(metadata, file_links):
     for link in file_links:
         file = link.split("/")[-1]
         path = link.split(endpoint)[1]
-        try:
-            size = s3.info(path)["Size"]
-            size = humanbytes(size)
-        except:
-            size = 0
+        size = s3.info(path)["size"]
+        size = humanbytes(size)
         if link_string == "":
             cleaned = link.strip(file)
             link_string = f"Files available via S3 at {cleaned}&lt;/p&gt;</p>"
diff --git a/caltechdata_api/customize_schema.py b/caltechdata_api/customize_schema.py
@@ -81,8 +81,8 @@ def rdm_creators_contributors(person_list, peopleroles):
             cre["type"] = "personal"
         change_label(cre, "givenName", "given_name")
         change_label(cre, "familyName", "family_name")
-        if 'name' not in cre:
-            cre['name'] = cre['family_name']+','+cre['given_name']
+        if "name" not in cre:
+            cre["name"] = cre["family_name"] + "," + cre["given_name"]
         change_label(cre, "nameIdentifiers", "identifiers")
         if "identifiers" in cre:
             new_id = []
diff --git a/edit_osn.py b/edit_osn.py
@@ -0,0 +1,61 @@
+import argparse, os, json
+import s3fs
+from datacite import schema43
+from caltechdata_api import caltechdata_edit, get_metadata
+
+
+parser = argparse.ArgumentParser(
+    description="Edits a CaltechDATA record by adding OSN-stored pilot files"
+)
+parser.add_argument("folder", nargs=1, help="Folder")  # nargs=1: parsed value is a one-element list
+parser.add_argument("-id", nargs=1, help="")  # optional flag; args.id is None when omitted
+
+args = parser.parse_args()
+
+# Read the access token from the RDMTOK environment variable (KeyError if unset)
+token = os.environ["RDMTOK"]
+
+endpoint = "https://renc.osn.xsede.org/"  # S3 endpoint URL; also prefixed to every collected file link
+
+# Anonymous (anon=True) S3 filesystem handle pointed at the endpoint above
+s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})
+
+folder = args.folder[0]
+
+path = "ini210004tommorrell/" + folder + "/"  # prefix to scan (presumably bucket/folder); note the trailing "/"
+
+idv = args.id[0]  # NOTE(review): raises TypeError if -id was not supplied (args.id is None)
+metadata = get_metadata(idv, schema="43")  # metadata for idv, requested with schema="43"
+
+# List entries under path. NOTE(review): path ends in "/", so the pattern contains "//" - confirm intended
+files = s3.glob(path + "/*")
+
+file_links = []
+for link in files:
+    fname = link.split("/")[-1]
+    if "." not in fname:
+        # No "." in the last path component: treat the entry as a directory and list its contents
+        folder_files = s3.glob(link + "/*")
+        for file in folder_files:
+            name = file.split("/")[-1]
+            if "." not in name:  # second directory level, same no-dot heuristic
+                level_2_files = s3.glob(file + "/*")
+                for f in level_2_files:
+                    name = f.split("/")[-1]
+                    if "." not in name:  # third directory level
+                        level_3_files = s3.glob(f + "/*")
+                        for l3 in level_3_files:  # deepest level handled: every entry is linked, no directory check
+                            file_links.append(endpoint + l3)
+                    else:
+                        file_links.append(endpoint + f)
+            else:
+                file_links.append(endpoint + file)
+    else:
+        file_links.append(endpoint + link)
+
+production = True  # passed positionally to caltechdata_edit below
+
+response = caltechdata_edit(
+    idv, metadata, token, [], production, "43", publish=True, file_links=file_links
+)
+print(response)
diff --git a/outdated/edit_pilot_phase1.py b/outdated/edit_pilot_phase1.py
diff --git a/rdm.json b/rdm.json
@@ -1,13 +1,8 @@
 {
     "pids": {
-        "doi": {
-            "identifier": "10.5281/inveniordm.1234",
-            "provider": "datacite",
-            "client": "inveniordm"
-        }
     },
     "metadata": {
-        "resource_type": {"id": "image-photo"},
+        "resource_type": {"id": "dataset"},
         "creators": [
             {
                 "person_or_org": {