Skip to content

Commit 03b176f

Browse files
authored
Update ROR and ORCID harvesting (#13)
* Update ROR and ORCID harvesting
1 parent ee16fe1 commit 03b176f

File tree

8 files changed, with 252 additions and 30 deletions.

.github/workflows/crossref_ror.yaml

Lines changed: 98 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,50 @@ on:
1111
default: 'Manual trigger'
1212

1313
jobs:
14-
Harvest:
14+
get-crossref:
1515
runs-on: ubuntu-latest
16+
outputs:
17+
dois: ${{ steps.harvest.outputs.dois }}
18+
steps:
19+
- name: Checkout
20+
uses: actions/checkout@v4
21+
- name: Python Deps
22+
shell: bash
23+
run: pip install -r requirements.txt
24+
- name: Harvest DOIs
25+
shell: bash
26+
id: harvest
27+
run: python harvest.py crossref -print >> $GITHUB_OUTPUT
28+
- name: Commit Files
29+
uses: EndBug/add-and-commit@v9
30+
with:
31+
message: 'Update run date'
32+
add: "['last_run.txt']"
33+
prepare-dois:
34+
name: Prepare Matrix Output of DOIs
35+
needs: [get-crossref]
36+
runs-on: ubuntu-latest
37+
outputs:
38+
dois: ${{ steps.step1.outputs.matrix }}
39+
env:
40+
DOI: ${{ needs.get-crossref.outputs.dois }}
41+
steps:
42+
- name: Checkout
43+
uses: actions/checkout@v4
44+
- name: Process DOIs
45+
shell: bash
46+
run: python split_doi.py >> $GITHUB_OUTPUT
47+
id: step1
48+
harvest:
49+
runs-on: ubuntu-latest
50+
needs: [prepare-dois]
51+
continue-on-error: true
52+
strategy:
53+
fail-fast: false
54+
matrix:
55+
doi: ${{ fromJSON(needs.prepare-dois.outputs.dois) }}
56+
outputs:
57+
dois: ${{ steps.step1.outputs.matrix }}
1658
steps:
1759
- name: Checkout
1860
uses: actions/checkout@v4
@@ -30,15 +72,64 @@ jobs:
3072
run: cp $HOME/bin/doi2rdm $HOME/.local/bin/.
3173
- name: Harvest DOIs
3274
shell: bash
33-
id: harvest
3475
env:
3576
RDMTOK: ${{ secrets.RDMTOK }}
36-
run: python harvest.py crossref
77+
id: harvest
78+
run: python harvest.py doi -doi "${{matrix.doi}}" -actor ${{github.actor}} >> $GITHUB_OUTPUT
79+
- uses: cloudposse/github-action-matrix-outputs-write@main
80+
id: out
81+
with:
82+
matrix-step-name: ${{ github.job }}
83+
matrix-key: ${{ matrix.doi }}
84+
outputs: |-
85+
doi: ${{ steps.harvest.outputs.doi }}
86+
error: ${{steps.harvest.outputs.error }}
87+
- name: System error on DOI
88+
if: contains(steps.harvest.outputs.error, 'system')
89+
run: |
90+
echo ${{steps.harvest.outputs.error}}
91+
false
92+
- name: System intentionally skipped DOI
93+
if: contains(steps.harvest.outputs.error, 'skipping')
94+
run: |
95+
echo ${{steps.harvest.outputs.error}}
96+
false
97+
write-output:
98+
name: Write Output
99+
runs-on: ubuntu-latest
100+
needs: [harvest]
101+
outputs:
102+
result: "${{ steps.read.outputs.result }}"
103+
steps:
104+
- name: Checkout
105+
uses: actions/checkout@v4
106+
- name: read
107+
uses: cloudposse/github-action-matrix-outputs-read@main
108+
id: read
109+
with:
110+
matrix-step-name: harvest
37111
- name: write DOI
38112
run: |
39-
python save_dois.py '${{steps.harvest.outputs.doi}}'
40-
- name: Commit Files
113+
python save_dois.py '${{steps.read.outputs.result}}'
114+
- name: Commit File
41115
uses: EndBug/add-and-commit@v9
42116
with:
43-
message: 'Update run date'
44-
add: "['last_run.txt', 'harvested_dois.txt']"
117+
message: 'Update harvested_dois.txt'
118+
add: "['harvested_dois.txt']"
119+
report-status:
120+
name: Report Status
121+
runs-on: ubuntu-latest
122+
needs: [write-output]
123+
outputs:
124+
result: "${{ steps.read.outputs.result }}"
125+
steps:
126+
- name: Checkout
127+
uses: actions/checkout@v4
128+
- name: read
129+
uses: cloudposse/github-action-matrix-outputs-read@main
130+
id: read
131+
with:
132+
matrix-step-name: harvest
133+
- name: System error on DOI
134+
run: |
135+
python check_status.py '${{steps.read.outputs.result}}'

.github/workflows/orcid.yaml

Lines changed: 95 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,12 +7,50 @@ on:
77
description: 'ORCID identifier to harvest from'
88
required: true
99

10+
1011
jobs:
11-
Harvest:
12-
runs-on: ubuntu-22.04
12+
get-orcid:
13+
runs-on: ubuntu-latest
14+
outputs:
15+
dois: ${{ steps.harvest.outputs.dois }}
1316
steps:
1417
- name: Checkout
15-
uses: actions/checkout@v3
18+
uses: actions/checkout@v4
19+
- name: Python Deps
20+
shell: bash
21+
run: pip install -r requirements.txt
22+
- name: Harvest DOIs
23+
shell: bash
24+
id: harvest
25+
run: python harvest.py orcid -orcid ${{ github.event.inputs.orcid }} -print >> $GITHUB_OUTPUT
26+
prepare-dois:
27+
name: Prepare Matrix Output of DOIs
28+
needs: [get-orcid]
29+
runs-on: ubuntu-latest
30+
outputs:
31+
dois: ${{ steps.step1.outputs.matrix }}
32+
env:
33+
DOI: ${{ needs.get-orcid.outputs.dois }}
34+
steps:
35+
- name: Checkout
36+
uses: actions/checkout@v4
37+
- name: Process DOIs
38+
shell: bash
39+
run: python split_doi.py >> $GITHUB_OUTPUT
40+
id: step1
41+
harvest:
42+
runs-on: ubuntu-latest
43+
needs: [prepare-dois]
44+
continue-on-error: true
45+
strategy:
46+
fail-fast: false
47+
matrix:
48+
doi: ${{ fromJSON(needs.prepare-dois.outputs.dois) }}
49+
outputs:
50+
dois: ${{ steps.step1.outputs.matrix }}
51+
steps:
52+
- name: Checkout
53+
uses: actions/checkout@v4
1654
- name: Python Deps
1755
shell: bash
1856
run: pip install -r requirements.txt
@@ -29,9 +67,62 @@ jobs:
2967
shell: bash
3068
env:
3169
RDMTOK: ${{ secrets.RDMTOK }}
32-
run: python harvest.py orcid -orcid ${{github.event.inputs.orcid}} -actor ${{github.actor}}
70+
id: harvest
71+
run: python harvest.py doi -doi "${{matrix.doi}}" -actor ${{github.actor}} >> $GITHUB_OUTPUT
72+
- uses: cloudposse/github-action-matrix-outputs-write@main
73+
id: out
74+
with:
75+
matrix-step-name: ${{ github.job }}
76+
matrix-key: ${{ matrix.doi }}
77+
outputs: |-
78+
doi: ${{ steps.harvest.outputs.doi }}
79+
error: ${{steps.harvest.outputs.error }}
80+
- name: System error on DOI
81+
if: contains(steps.harvest.outputs.error, 'system')
82+
run: |
83+
echo ${{steps.harvest.outputs.error}}
84+
false
85+
- name: System intentionally skipped DOI
86+
if: contains(steps.harvest.outputs.error, 'skipping')
87+
run: |
88+
echo ${{steps.harvest.outputs.error}}
89+
false
90+
write-output:
91+
name: Write Output
92+
runs-on: ubuntu-latest
93+
needs: [harvest]
94+
outputs:
95+
result: "${{ steps.read.outputs.result }}"
96+
steps:
97+
- name: Checkout
98+
uses: actions/checkout@v4
99+
- name: read
100+
uses: cloudposse/github-action-matrix-outputs-read@main
101+
id: read
102+
with:
103+
matrix-step-name: harvest
104+
- name: write DOI
105+
run: |
106+
python save_dois.py '${{steps.read.outputs.result}}'
33107
- name: Commit File
34108
uses: EndBug/add-and-commit@v9
35109
with:
36110
message: 'Update harvested_dois.txt'
37111
add: "['harvested_dois.txt']"
112+
report-status:
113+
name: Report Status
114+
runs-on: ubuntu-latest
115+
needs: [write-output]
116+
outputs:
117+
result: "${{ steps.read.outputs.result }}"
118+
steps:
119+
- name: Checkout
120+
uses: actions/checkout@v4
121+
- name: read
122+
uses: cloudposse/github-action-matrix-outputs-read@main
123+
id: read
124+
with:
125+
matrix-step-name: harvest
126+
- name: System error on DOI
127+
run: |
128+
python check_status.py '${{steps.read.outputs.result}}'

check_status.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,11 @@
33
print(sys.argv[1])
44
result = json.loads(sys.argv[1])
55

6-
errors = result['error']
6+
errors = result["error"]
77

88
for error in errors.keys():
99
e = errors[error]
1010
if e:
11-
if 'system' in e:
11+
if "system" in e:
1212
print(e)
1313
exit(1)

harvest.py

Lines changed: 46 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -37,17 +37,36 @@ def cleanup_metadata(metadata):
3737
for row in reader:
3838
licenses[row["props__url"]] = row["id"]
3939
rights = []
40-
if "rights" in "metadata":
40+
files = None
41+
if "rights" in metadata["metadata"]:
4142
for f in metadata["metadata"]["rights"]:
4243
link = f["link"]
4344
if link in licenses:
4445
f["id"] = licenses[link]
4546
else:
4647
f["title"]["en"] = "Unknown"
47-
if f["description"]["en"] == "vor":
48-
rights.append(f)
48+
# Not supporting file download till v12
49+
# if f["description"]["en"] == "vor":
50+
# rights.append(f)
51+
# if f["id"] == 'cc-by-4.0':
52+
# doi = metadata["pids"]["doi"]["identifier"]
53+
# response = requests.get('https://api.crossref.org/works/' + doi)
54+
# if response.status_code == 200:
55+
# data = response.json()
56+
# try:
57+
# links = data["message"]["link"]
58+
# for link in links:
59+
# if link["content-type"] == "application/pdf":
60+
# link = link["URL"]
61+
# requests.get(link)
62+
# fname = f"{doi.replace('/','_')}.pdf"
63+
# with open(fname, "wb") as f:
64+
# f.write(response.content)
65+
# files = fname
66+
# except:
67+
# pass
4968
metadata["metadata"]["rights"] = rights
50-
return metadata
69+
return metadata, files
5170

5271

5372
def get_orcid_works(orcid):
@@ -152,7 +171,6 @@ def read_outputs():
152171

153172

154173
if __name__ == "__main__":
155-
156174
parser = argparse.ArgumentParser(
157175
description="Harvest DOIs from Crossref or ORCID and add to CaltechAUTHORS"
158176
)
@@ -161,6 +179,9 @@ def read_outputs():
161179
parser.add_argument("-doi", help="DOI to harvest")
162180
parser.add_argument("-actor", help="Name of actor to use for review message")
163181
parser.add_argument("-report", help="Generate a report only", action="store_true")
182+
parser.add_argument(
183+
"-print", help="Print out DOIs (no harvesting)", action="store_true"
184+
)
164185
args = parser.parse_args()
165186

166187
harvest_type = args.harvest_type
@@ -176,15 +197,25 @@ def read_outputs():
176197
if harvest_type == "crossref":
177198
dois = get_crossref_ror()
178199
review_message = (
179-
"Added by Tom during testing, should be a valid article from WOS harvest"
180-
#"Automatically added from Crossref based on Caltech ROR affiliation"
200+
"Automatically added from Crossref based on Caltech ROR affiliation"
181201
)
182-
dois = ['10.1051/0004-6361/202346526','10.1016/j.palaeo.2023.111756']
202+
if args.print:
203+
ostring = "dois="
204+
for doi in dois:
205+
ostring += f" {doi}"
206+
print(ostring)
207+
dois = []
183208
elif harvest_type == "orcid":
184209
dois = get_orcid_works(args.orcid)
185210
review_message = (
186211
f"Automatically added from ORCID from record {args.orcid} by {args.actor}"
187212
)
213+
if args.print:
214+
ostring = "dois= "
215+
for doi in dois:
216+
ostring += f" {doi}"
217+
print(ostring)
218+
dois = []
188219
elif harvest_type == "doi":
189220
dois = args.doi.split(" ")
190221
review_message = f"Automatically added by {args.actor} as part of import from DOI list: {args.doi}"
@@ -224,20 +255,22 @@ def read_outputs():
224255
transformed = subprocess.check_output(["doi2rdm", doi])
225256
data = transformed.decode("utf-8")
226257
data = json.loads(data)
227-
data = cleanup_metadata(data)
258+
data, files = cleanup_metadata(data)
228259
response = caltechdata_write(
229260
data,
230261
token,
231262
production=True,
232263
authors=True,
233264
community=community,
234265
review_message=review_message,
266+
files=files,
235267
)
236-
print("doi=",doi)
237-
#with open("harvested_dois.txt", "a") as f:
268+
print("doi=", doi)
269+
# with open("harvested_dois.txt", "a") as f:
238270
# f.write(doi + "\n")
239-
except:
240-
print("error= system error with doi2rdm")
271+
except Exception as e:
272+
cleaned = e.replace("'","/")
273+
print(f"error= system error with doi2rdm {cleaned}")
241274
else:
242275
print(f"error=DOI {doi} has already been harvested, skipping")
243276
else:

harvested_dois.txt

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -707,6 +707,7 @@ https://doi.org/10.1016/j.psycr.2023.100173
707707
10.1073/pnas.2315787121
708708
10.1063/5.0147340
709709
10.3847/1538-4357/ad1243
710+
<<<<<<< HEAD
710711
10.3847/1538-4357/ad12c6
711712
10.1103/PRXQuantum.5.010319
712713
10.1103/PRXQuantum.5.010317
@@ -815,3 +816,10 @@ https://doi.org/10.1016/j.psycr.2023.100173
815816
10.1103/PhysRevB.109.L060305
816817
10.1103/PhysRevD.109.035025
817818
10.3847/1538-4357/acd849
819+
10.3847/1538-4357/acf3e6
820+
10.3847/1538-4357/ad12c6
821+
10.1103/PRXQuantum.5.010319
822+
10.1103/PRXQuantum.5.010317
823+
10.1103/PhysRevB.109.085108
824+
10.1103/PhysRevFluids.8.064612
825+
10.2514/6.2023-3985

last_run.txt

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1 +1 @@
1-
2024-02-01
1+
2024-02-22

0 commit comments

Comments
 (0)