Update ROR and ORCID harvesting (#13)

tmorrell · web-flow · commit 03b176f3aedb · 2024-02-29T13:06:49.000-08:00
* Update ROR and ORCID harvesting
diff --git a/.github/workflows/crossref_ror.yaml b/.github/workflows/crossref_ror.yaml
@@ -11,8 +11,50 @@ on:
         default: 'Manual trigger'
 
 jobs:
-  Harvest:
+  get-crossref:
     runs-on: ubuntu-latest
+    outputs:
+      dois: ${{ steps.harvest.outputs.dois }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Python Deps
+        shell: bash
+        run: pip install -r requirements.txt
+      - name: Harvest DOIs
+        shell: bash
+        id: harvest
+        run: python harvest.py crossref -print >> $GITHUB_OUTPUT
+      - name: Commit Files
+        uses: EndBug/add-and-commit@v9
+        with:
+          message: 'Update run date'
+          add: "['last_run.txt']"
+  prepare-dois:
+    name: Prepare Matrix Output of DOIs
+    needs: [get-crossref]
+    runs-on: ubuntu-latest
+    outputs: 
+      dois: ${{ steps.step1.outputs.matrix }}
+    env:
+      DOI: ${{ needs.get-crossref.outputs.dois }}
+    steps: 
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Process DOIs
+        shell: bash
+        run: python split_doi.py >> $GITHUB_OUTPUT
+        id: step1
+  harvest:
+    runs-on: ubuntu-latest
+    needs: [prepare-dois]
+    continue-on-error: true  # a failing DOI job must not fail the whole workflow run
+    strategy:
+      fail-fast: false  # keep the other DOI jobs running when one of them fails
+      matrix:
+        doi: ${{ fromJSON(needs.prepare-dois.outputs.dois) }}  # assumes split_doi.py emits a JSON array - confirm
+    outputs:
+      dois: ${{ steps.step1.outputs.matrix }}  # NOTE(review): no step with id `step1` is visible in this job - confirm
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -30,15 +72,64 @@ jobs:
         run: cp $HOME/bin/doi2rdm $HOME/.local/bin/.
       - name: Harvest DOIs
         shell: bash
-        id: harvest
         env:
             RDMTOK: ${{ secrets.RDMTOK }}
-        run: python harvest.py crossref
+        id: harvest
+        run: python harvest.py doi -doi "${{matrix.doi}}" -actor ${{github.actor}} >> $GITHUB_OUTPUT
+      - uses: cloudposse/github-action-matrix-outputs-write@main
+        id: out
+        with:
+          matrix-step-name: ${{ github.job }}
+          matrix-key: ${{ matrix.doi }}
+          outputs: |-
+            doi: ${{ steps.harvest.outputs.doi }}
+            error: ${{steps.harvest.outputs.error }}
+      - name: System error on DOI
+        if: contains(steps.harvest.outputs.error, 'system')
+        env:  # hand the message to the shell as data, never as script text
+          HARVEST_ERROR: ${{ steps.harvest.outputs.error }}
+        run: printf '%s\n' "$HARVEST_ERROR"; false  # `false` marks this matrix job as failed
+      - name: System intentionally skipped DOI
+        if: contains(steps.harvest.outputs.error, 'skipping')
+        env:  # same indirection: the text may hold quotes or shell metacharacters
+          HARVEST_ERROR: ${{ steps.harvest.outputs.error }}
+        run: printf '%s\n' "$HARVEST_ERROR"; false  # `false` marks this matrix job as failed
+  write-output:
+    name: Write Output
+    runs-on: ubuntu-latest
+    needs: [harvest]
+    outputs:
+      result: "${{ steps.read.outputs.result }}"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: read
+        uses: cloudposse/github-action-matrix-outputs-read@main
+        id: read
+        with:
+          matrix-step-name: harvest
       - name: write DOI
         run: |
-          python save_dois.py '${{steps.harvest.outputs.doi}}'
-      - name: Commit Files
+          python save_dois.py '${{steps.read.outputs.result}}'
+      - name: Commit File
         uses: EndBug/add-and-commit@v9
         with:
-          message: 'Update run date'
-          add: "['last_run.txt', 'harvested_dois.txt']"
+          message: 'Update harvested_dois.txt'
+          add: "['harvested_dois.txt']"
+  report-status:
+    name: Report Status
+    runs-on: ubuntu-latest
+    needs: [write-output]
+    outputs:
+      result: "${{ steps.read.outputs.result }}"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: read
+        uses: cloudposse/github-action-matrix-outputs-read@main
+        id: read
+        with:
+          matrix-step-name: harvest
+      - name: System error on DOI
+        run: |
+          python check_status.py '${{steps.read.outputs.result}}'
diff --git a/.github/workflows/orcid.yaml b/.github/workflows/orcid.yaml
@@ -7,12 +7,50 @@ on:
         description: 'ORCID identifier to harvest from'
         required: true
 
+
 jobs:
-  Harvest:
-    runs-on: ubuntu-22.04
+  get-orcid:
+    runs-on: ubuntu-latest
+    outputs:
+      dois: ${{ steps.harvest.outputs.dois }}
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+      - name: Python Deps
+        shell: bash
+        run: pip install -r requirements.txt
+      - name: Harvest DOIs
+        shell: bash
+        id: harvest
+        run: python harvest.py orcid -orcid ${{ github.event.inputs.orcid }} -print >> $GITHUB_OUTPUT
+  prepare-dois:
+    name: Prepare Matrix Output of DOIs
+    needs: [get-orcid]
+    runs-on: ubuntu-latest
+    outputs: 
+      dois: ${{ steps.step1.outputs.matrix }}
+    env:
+      DOI: ${{ needs.get-orcid.outputs.dois }}
+    steps: 
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Process DOIs
+        shell: bash
+        run: python split_doi.py >> $GITHUB_OUTPUT
+        id: step1
+  harvest:
+    runs-on: ubuntu-latest
+    needs: [prepare-dois]
+    continue-on-error: true  # a failing DOI job must not fail the whole workflow run
+    strategy:
+      fail-fast: false  # keep the other DOI jobs running when one of them fails
+      matrix:
+        doi: ${{ fromJSON(needs.prepare-dois.outputs.dois) }}  # assumes split_doi.py emits a JSON array - confirm
+    outputs:
+      dois: ${{ steps.step1.outputs.matrix }}  # NOTE(review): no step with id `step1` is visible in this job - confirm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
       - name: Python Deps
         shell: bash
         run: pip install -r requirements.txt
@@ -29,9 +67,62 @@ jobs:
         shell: bash
         env:
             RDMTOK: ${{ secrets.RDMTOK }}
-        run: python harvest.py orcid -orcid ${{github.event.inputs.orcid}} -actor ${{github.actor}} 
+        id: harvest
+        run: python harvest.py doi -doi "${{matrix.doi}}" -actor ${{github.actor}} >> $GITHUB_OUTPUT
+      - uses: cloudposse/github-action-matrix-outputs-write@main
+        id: out
+        with:
+          matrix-step-name: ${{ github.job }}
+          matrix-key: ${{ matrix.doi }}
+          outputs: |-
+            doi: ${{ steps.harvest.outputs.doi }}
+            error: ${{steps.harvest.outputs.error }}
+      - name: System error on DOI
+        if: contains(steps.harvest.outputs.error, 'system')
+        env:  # hand the message to the shell as data, never as script text
+          HARVEST_ERROR: ${{ steps.harvest.outputs.error }}
+        run: printf '%s\n' "$HARVEST_ERROR"; false  # `false` marks this matrix job as failed
+      - name: System intentionally skipped DOI
+        if: contains(steps.harvest.outputs.error, 'skipping')
+        env:  # same indirection: the text may hold quotes or shell metacharacters
+          HARVEST_ERROR: ${{ steps.harvest.outputs.error }}
+        run: printf '%s\n' "$HARVEST_ERROR"; false  # `false` marks this matrix job as failed
+  write-output:
+    name: Write Output
+    runs-on: ubuntu-latest
+    needs: [harvest]
+    outputs:
+      result: "${{ steps.read.outputs.result }}"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: read
+        uses: cloudposse/github-action-matrix-outputs-read@main
+        id: read
+        with:
+          matrix-step-name: harvest
+      - name: write DOI
+        run: |
+          python save_dois.py '${{steps.read.outputs.result}}'
       - name: Commit File
         uses: EndBug/add-and-commit@v9
         with:
           message: 'Update harvested_dois.txt'
           add: "['harvested_dois.txt']"
+  report-status:
+    name: Report Status
+    runs-on: ubuntu-latest
+    needs: [write-output]
+    outputs:
+      result: "${{ steps.read.outputs.result }}"
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: read
+        uses: cloudposse/github-action-matrix-outputs-read@main
+        id: read
+        with:
+          matrix-step-name: harvest
+      - name: System error on DOI
+        run: |
+          python check_status.py '${{steps.read.outputs.result}}'
diff --git a/check_status.py b/check_status.py
@@ -3,11 +3,11 @@
 print(sys.argv[1])
 result = json.loads(sys.argv[1])
 
-errors = result['error']
+errors = result["error"]
 
 for error in errors.keys():
     e = errors[error]
     if e:
-        if 'system' in e:
+        if "system" in e:
             print(e)
             exit(1)
diff --git a/harvest.py b/harvest.py
@@ -37,17 +37,36 @@ def cleanup_metadata(metadata):
         for row in reader:
             licenses[row["props__url"]] = row["id"]
     rights = []
-    if "rights" in "metadata":
+    files = None
+    if "rights" in metadata["metadata"]:
         for f in metadata["metadata"]["rights"]:
             link = f["link"]
             if link in licenses:
                 f["id"] = licenses[link]
             else:
                 f["title"]["en"] = "Unknown"
-            if f["description"]["en"] == "vor":
-                rights.append(f)
+            # Not supporting file download till v12
+            # if f["description"]["en"] == "vor":
+            #    rights.append(f)
+            #    if f["id"] == 'cc-by-4.0':
+            #        doi = metadata["pids"]["doi"]["identifier"]
+            #        response = requests.get('https://api.crossref.org/works/' + doi)
+            #        if response.status_code == 200:
+            #            data = response.json()
+            #            try:
+            #                links = data["message"]["link"]
+            #                for link in links:
+            #                    if link["content-type"] == "application/pdf":
+            #                        link = link["URL"]
+            #                        requests.get(link)
+            #                        fname = f"{doi.replace('/','_')}.pdf"
+            #                        with open(fname, "wb") as f:
+            #                            f.write(response.content)
+            #                        files = fname
+            #            except:
+            #                pass
     metadata["metadata"]["rights"] = rights
-    return metadata
+    return metadata, files
 
 
 def get_orcid_works(orcid):
@@ -152,7 +171,6 @@ def read_outputs():
 
 
 if __name__ == "__main__":
-
     parser = argparse.ArgumentParser(
         description="Harvest DOIs from Crossref or ORCID and add to CaltechAUTHORS"
     )
@@ -161,6 +179,9 @@ def read_outputs():
     parser.add_argument("-doi", help="DOI to harvest")
     parser.add_argument("-actor", help="Name of actor to use for review message")
     parser.add_argument("-report", help="Generate a report only", action="store_true")
+    parser.add_argument(
+        "-print", help="Print out DOIs (no harvesting)", action="store_true"
+    )
     args = parser.parse_args()
 
     harvest_type = args.harvest_type
@@ -176,15 +197,25 @@ def read_outputs():
     if harvest_type == "crossref":
         dois = get_crossref_ror()
         review_message = (
-                "Added by Tom during testing, should be a valid article from WOS harvest"
-                #"Automatically added from Crossref based on Caltech ROR affiliation"
+            "Automatically added from Crossref based on Caltech ROR affiliation"
         )
-        dois = ['10.1051/0004-6361/202346526','10.1016/j.palaeo.2023.111756']
+        if args.print:
+            # One `dois= <doi> <doi> ...` line; the workflow appends stdout to $GITHUB_OUTPUT
+            ostring = "dois=" + "".join(f" {doi}" for doi in dois)
+            print(ostring)
+            # -print means "no harvesting": empty the list so nothing is left to process
+            dois = []
     elif harvest_type == "orcid":
         dois = get_orcid_works(args.orcid)
         review_message = (
             f"Automatically added from ORCID from record {args.orcid} by {args.actor}"
         )
+        if args.print:
+            # Same `dois= <doi> <doi> ...` shape as the crossref branch: one space before each DOI
+            ostring = "dois=" + "".join(f" {doi}" for doi in dois)
+            print(ostring)
+            # -print means "no harvesting": empty the list so nothing is left to process
+            dois = []
     elif harvest_type == "doi":
         dois = args.doi.split(" ")
         review_message = f"Automatically added by {args.actor} as part of import from DOI list: {args.doi}"
@@ -224,20 +255,22 @@ def read_outputs():
                     transformed = subprocess.check_output(["doi2rdm", doi])
                     data = transformed.decode("utf-8")
                     data = json.loads(data)
-                    data = cleanup_metadata(data)
+                    data, files = cleanup_metadata(data)
                     response = caltechdata_write(
                         data,
                         token,
                         production=True,
                         authors=True,
                         community=community,
                         review_message=review_message,
+                        files=files,
                     )
-                    print("doi=",doi)
-                    #with open("harvested_dois.txt", "a") as f:
+                    print("doi=", doi)
+                    # with open("harvested_dois.txt", "a") as f:
                     #    f.write(doi + "\n")
-                except:
-                    print("error= system error with doi2rdm")
+                except Exception as e:
+                    cleaned = str(e).replace("'", "/")  # exception objects have no .replace(); stringify first
+                    print(f"error= system error with doi2rdm {cleaned}")
             else:
                 print(f"error=DOI {doi} has already been harvested, skipping")
         else:
diff --git a/harvested_dois.txt b/harvested_dois.txt
@@ -707,6 +707,7 @@ https://doi.org/10.1016/j.psycr.2023.100173
 10.1073/pnas.2315787121
 10.1063/5.0147340
 10.3847/1538-4357/ad1243
+<<<<<<< HEAD
 10.3847/1538-4357/ad12c6
 10.1103/PRXQuantum.5.010319
 10.1103/PRXQuantum.5.010317
@@ -815,3 +816,10 @@ https://doi.org/10.1016/j.psycr.2023.100173
 10.1103/PhysRevB.109.L060305
 10.1103/PhysRevD.109.035025
 10.3847/1538-4357/acd849
+10.3847/1538-4357/acf3e6
+10.3847/1538-4357/ad12c6
+10.1103/PRXQuantum.5.010319
+10.1103/PRXQuantum.5.010317
+10.1103/PhysRevB.109.085108
+10.1103/PhysRevFluids.8.064612
+10.2514/6.2023-3985
diff --git a/last_run.txt b/last_run.txt
@@ -1 +1 @@
-2024-02-01
+2024-02-22
diff --git a/save_dois.py b/save_dois.py
diff --git a/split_doi.py b/split_doi.py