Fix imports and add in-progress name-fixing script

tmorrell · tmorrell · commit 5c80ad603316 · 2022-09-23T16:27:38.000-07:00
diff --git a/caltechdata_api/__init__.py b/caltechdata_api/__init__.py
@@ -1,6 +1,5 @@
-from .caltechdata_write import caltechdata_write, send_s3, write_files_rdm
-from .caltechdata_edit import caltechdata_add, caltechdata_edit, caltechdata_unembargo
+from .caltechdata_write import caltechdata_write, write_files_rdm
+from .caltechdata_edit import caltechdata_edit, caltechdata_unembargo
 from .customize_schema import customize_schema
-from .decustomize_schema import decustomize_schema
 from .get_metadata import get_metadata
 from .download_file import download_file, download_url
diff --git a/caltechdata_api/caltechdata_edit.py b/caltechdata_api/caltechdata_edit.py
@@ -3,7 +3,7 @@
 import requests
 from requests import session
 
-from caltechdata_api import customize_schema, send_s3, write_files_rdm
+from caltechdata_api import customize_schema, write_files_rdm
 
 
 def caltechdata_unembargo(token, ids, production=False):
@@ -37,8 +37,7 @@ def caltechdata_edit(
         ids = [ids]
 
     data = customize_schema.customize_schema(
-        copy.deepcopy(metadata), schema=schema, pilot=True
-    )
+        copy.deepcopy(metadata), schema=schema)
     if production == True:
         url = "https://data.caltech.edu/"
         verify = True
diff --git a/fix_names.py b/fix_names.py
@@ -1,14 +1,25 @@
 import requests
 import math
 from progressbar import progressbar
+from caltechdata_api import caltechdata_edit
+
+def fix_name(metadata):
+    """Fill in a missing givenName on Personal entries; return (metadata, fixed)."""
+    fixed = False
+    for name in metadata:
+        # Only "Family, Given" strings can be split; other entries are left untouched.
+        parts = name.get('name', '').split(',')
+        if len(parts) > 1 and name.get('nameType') == 'Personal' and 'givenName' not in name:
+            name['givenName'], fixed = parts[1].strip(), True
+    return metadata,fixed
 
 url = "https://data.caltech.edu/api/records"
 
 headers = {
             "accept": "application/vnd.datacite.datacite+json",
         }
 
-response = requests.get(f"{url}")
+response = requests.get(f"{url}?search_type=scan&scroll=5m")
 
 total = response.json()["hits"]["total"]
 pages = math.ceil(int(total) / 1000)
@@ -18,10 +29,23 @@
     chunkurl = (
             f"{url}?&sort=newest&size=1000&page={c}"
     )
-    response = requests.get(chunkurl).json()
+    response = requests.get(chunkurl)
+    response = response.json()
     
     hits += response["hits"]["hits"]
 
 for h in progressbar(hits):
-    rid = str(h["id"])
-    print(rid)
+    idv = str(h["id"])
+    response = requests.get(f'{url}/{idv}', headers=headers)
+    if response.status_code != 200:
+        print(response.text)
+        exit()  # abort the whole run on the first record that cannot be fetched
+    metadata = response.json()
+    metadata['creators'], fixed = fix_name(metadata['creators'])
+    if 'contributors' in metadata:
+        metadata['contributors'], contributors_fixed = fix_name(metadata['contributors'])
+        fixed = fixed or contributors_fixed  # fix_name returns (list, flag); keep both
+    if fixed:
+        print(idv)
+        caltechdata_edit(idv,metadata,production=True,publish=True)
+        # NOTE(review): stops after the first edited record; presumably a WIP safeguard.
+        exit()