Skip to content

Commit 9a58da8

Browse files
committed
Fix handling of OSN links
1 parent 7ea0a1e commit 9a58da8

File tree

6 files changed

+86
-105
lines changed

6 files changed

+86
-105
lines changed

caltechdata_api/caltechdata_edit.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,27 @@ def caltechdata_edit(
7272
if isinstance(files, str) == True:
7373
files = [files]
7474

75+
# Check if file links were provided in the metadata
76+
descriptions = []
77+
for d in metadata["descriptions"]:
78+
if d["description"].startswith("Files available via S3"):
79+
ex_file_links = []
80+
file_text = d["description"]
81+
file_list = file_text.split('href="')
82+
# Loop over links in description, skip header text
83+
for file in file_list[1:]:
84+
ex_file_links.append(file.split('"\n')[0])
85+
else:
86+
descriptions.append(d)
87+
# We remove file link descriptions, and re-add below
88+
metadata["descriptions"] = descriptions
89+
90+
# If user has provided file links as a cli option, we add those
7591
if file_links:
7692
metadata = add_file_links(metadata, file_links)
93+
# Otherwise we add file links found in the metadata file
94+
elif ex_file_links:
95+
metadata = add_file_links(metadata, ex_file_links)
7796

7897
if production == True:
7998
url = "https://data.caltech.edu"
@@ -101,7 +120,7 @@ def caltechdata_edit(
101120
headers=headers,
102121
)
103122
if existing.status_code != 200:
104-
raise Exception(existing.text)
123+
raise Exception(f"Record {idv} does not exist, cannot edit")
105124

106125
status = existing.json()["status"]
107126

caltechdata_api/caltechdata_write.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,8 @@ def add_file_links(metadata, file_links):
6363
for link in file_links:
6464
file = link.split("/")[-1]
6565
path = link.split(endpoint)[1]
66-
try:
67-
size = s3.info(path)["Size"]
68-
size = humanbytes(size)
69-
except:
70-
size = 0
66+
size = s3.info(path)["size"]
67+
size = humanbytes(size)
7168
if link_string == "":
7269
cleaned = link.strip(file)
7370
link_string = f"Files available via S3 at {cleaned}&lt;/p&gt;</p>"

caltechdata_api/customize_schema.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ def rdm_creators_contributors(person_list, peopleroles):
8181
cre["type"] = "personal"
8282
change_label(cre, "givenName", "given_name")
8383
change_label(cre, "familyName", "family_name")
84-
if 'name' not in cre:
85-
cre['name'] = cre['family_name']+','+cre['given_name']
84+
if "name" not in cre:
85+
cre["name"] = cre["family_name"] + "," + cre["given_name"]
8686
change_label(cre, "nameIdentifiers", "identifiers")
8787
if "identifiers" in cre:
8888
new_id = []

edit_osn.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""Edit a CaltechDATA record by attaching links to OSN-stored pilot files.

Usage: python edit_osn.py FOLDER -id RECORD_ID

The script lists the contents of FOLDER inside the OSN bucket, builds a
public URL for every entry it finds (descending at most three directory
levels), and passes those URLs to ``caltechdata_edit`` as ``file_links``.
The access token is read from the ``RDMTOK`` environment variable.
"""
import argparse
import os
import json
import s3fs
from datacite import schema43
from caltechdata_api import caltechdata_edit, get_metadata


def _collect_file_links(fs, prefix, endpoint, depth=3):
    """Return URLs for the entries below *prefix*, in listing order.

    An entry whose final path component contains no "." is treated as a
    directory and is descended into, up to *depth* levels below *prefix*.
    Once the depth budget is used up, every entry is added as a link
    regardless of its name.
    """
    links = []
    for entry in fs.glob(prefix + "/*"):
        name = entry.split("/")[-1]
        if "." not in name and depth > 0:
            # Looks like a directory: gather what is inside it instead
            links.extend(_collect_file_links(fs, entry, endpoint, depth - 1))
        else:
            links.append(endpoint + entry)
    return links


parser = argparse.ArgumentParser(
    description="Edits a CaltechDATA record by adding OSN-stored pilot files"
)
parser.add_argument("folder", nargs=1, help="Folder")
# The record id is dereferenced unconditionally below, so it must be supplied
parser.add_argument(
    "-id", nargs=1, required=True, help="Identifier of the record to edit"
)

args = parser.parse_args()

# Get access token as environment variable
token = os.environ["RDMTOK"]

endpoint = "https://renc.osn.xsede.org/"

# Get metadata and files from bucket
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint})

folder = args.folder[0]

# No trailing slash here: the helper appends "/*" itself, which avoids
# producing a "folder//*" glob pattern
path = "ini210004tommorrell/" + folder.strip("/")

idv = args.id[0]
metadata = get_metadata(idv, schema="43")

# Find the files
file_links = _collect_file_links(s3, path, endpoint)

production = True

response = caltechdata_edit(
    idv, metadata, token, [], production, "43", publish=True, file_links=file_links
)
print(response)

outdated/edit_pilot_phase1.py

Lines changed: 0 additions & 91 deletions
This file was deleted.

rdm.json

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,8 @@
11
{
22
"pids": {
3-
"doi": {
4-
"identifier": "10.5281/inveniordm.1234",
5-
"provider": "datacite",
6-
"client": "inveniordm"
7-
}
83
},
94
"metadata": {
10-
"resource_type": {"id": "image-photo"},
5+
"resource_type": {"id": "dataset"},
116
"creators": [
127
{
138
"person_or_org": {

0 commit comments

Comments
 (0)