Skip to content

Commit 0f51876

Browse files
authored
Merge pull request #350 from semantic-systems/develop
Push new updates to main
2 parents 81de3a9 + 41953e9 commit 0f51876

18 files changed

+1727
-1061
lines changed

.github/workflows/main.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ jobs:
3030
echo "CLIENT_ID_ORCID=${{ secrets.CLIENT_ID_ORCID }}" >> ./.env
3131
echo "CLIENT_SECRET_ORCID=${{ secrets.CLIENT_SECRET_ORCID }}" >> ./.env
3232
echo "OPENAI_API_KEY=${{ secrets.OPENAI_API_KEY }}" >> ./.env
33+
echo "DASHBOARD_USERNAME=${{ secrets.DASHBOARD_USERNAME }}" >> ./.env
34+
echo "DASHBOARD_PASSWORD=${{ secrets.DASHBOARD_PASSWORD }}" >> ./.env
3335
echo "LLAMA3_USERNAME=${{ secrets.LLAMA3_USERNAME }}" >> ./.env
3436
echo "LLAMA3_PASSWORD=${{ secrets.LLAMA3_PASSWORD }}" >> ./.env
3537
echo "ELASTIC_SERVER=${{ secrets.ELASTIC_SERVER }}" >> ./.env

config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ class Config:
122122
},
123123
"module": "huggingface_models",
124124
"search-endpoint": f"https://huggingface.co/api/models?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
125+
"get-resource-endpoint": f"https://huggingface.co/api/models/",
125126
},
126127
"Huggingface - Datasets": {
127128
"logo": {
@@ -133,6 +134,7 @@ class Config:
133134
},
134135
"module": "huggingface_datasets",
135136
"search-endpoint": f"https://huggingface.co/api/datasets?limit={NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT}&search=",
137+
"get-resource-endpoint": f"https://huggingface.co/api/datasets/",
136138
},
137139
"OPENAIRE - Products": {
138140
"logo": {
@@ -181,6 +183,18 @@ class Config:
181183
"get-publication-endpoint": "https://api.crossref.org/works/",
182184
"get-publication-references-endpoint": "https://api.crossref.org/works/",
183185
},
186+
"DataCite": {
187+
"logo": {
188+
"name": "DataCite",
189+
"link": "https://datacite.org/",
190+
"src": "DataCite-Logo.png",
191+
"width": "w-100",
192+
"height": "h-100",
193+
},
194+
"module": "datacite",
195+
"search-endpoint": "https://api.datacite.org/dois?query=titles.title:",
196+
"get-publication-endpoint": "https://api.datacite.org/dois/"
197+
},
184198
"SEMANTIC SCHOLAR - Publications": {
185199
"logo": {
186200
"name": "SEMANTIC SCHOLAR",

sources/__init__.py

Whitespace-only changes.

sources/base.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
# sources/base.py
from abc import ABC, abstractmethod
from typing import Iterable, Dict, Any


class BaseSource(ABC):
    """Abstract interface for a searchable data source.

    Concrete sources implement a three-stage pipeline — ``fetch`` the raw
    JSON, ``extract_hits`` from it, ``map_hit`` each hit to a result
    object — and expose the whole flow through ``search``.

    NOTE(review): concrete implementations in this change set (CORDIS,
    CORE) declare ``fetch(self, search_term, failed_sources)`` and
    ``map_hit(self, hit)``, which drift from the abstract signatures
    below — confirm which shape is intended.
    """

    @abstractmethod
    def fetch(self, search_term: str) -> Dict[str, Any]:
        """Fetch raw JSON from the source using the given search term."""

    @abstractmethod
    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """Return an iterable of hit dicts taken from the raw JSON response."""

    @abstractmethod
    def map_hit(self, source_name: str, hit: Dict[str, Any]):
        """Map a single hit dict to an object from objects.py (e.g. Article)."""

    @abstractmethod
    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """Fetch, extract, and map hits, inserting them in-place into ``results``."""

sources/cordis.py

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,46 @@
11
from objects import thing, Project, Author
22
from sources import data_retriever
3+
from typing import Iterable, Dict, Any, List
34
import utils
45
from main import app
56

6-
@utils.handle_exceptions
7-
def search(source: str, search_term: str, results, failed_sources):
8-
search_term = f"({search_term})"
9-
search_result = data_retriever.retrieve_data(source=source,
10-
base_url=app.config['DATA_SOURCES'][source].get('search-endpoint', ''),
11-
search_term=search_term,
12-
failed_sources=failed_sources)
13-
total_records_found = search_result.get('result', {}).get('header', {}).get('totalHits', 0)
14-
total_records_pulled = search_result.get('result', {}).get('header', {}).get('numHits', 0)
15-
utils.log_event(type="info", message=f"{source} - {total_records_found} records matched; pulled top {total_records_pulled}")
16-
17-
hits = search_result.get('hits', {}).get('hit', [])
18-
for hit in hits:
7+
from sources.base import BaseSource
8+
9+
class CORDIS(BaseSource):
10+
11+
SOURCE = 'CORDIS'
12+
13+
@utils.handle_exceptions
def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
    """Query the CORDIS search endpoint and return the raw JSON payload.

    The term is wrapped in parentheses before being sent; failures inside
    the retriever record this source in ``failed_sources``.
    """
    wrapped_term = f"({search_term})"
    endpoint = app.config['DATA_SOURCES'][self.SOURCE].get('search-endpoint', '')

    search_result = data_retriever.retrieve_data(
        source=self.SOURCE,
        base_url=endpoint,
        search_term=wrapped_term,
        failed_sources=failed_sources,
    )

    # log how many records matched vs. how many the endpoint returned
    header = search_result.get('result', {}).get('header', {})
    matched = header.get('totalHits', 0)
    pulled = header.get('numHits', 0)
    utils.log_event(type="info", message=f"{self.SOURCE} - {matched} records matched; pulled top {pulled}")

    return search_result
28+
29+
30+
@utils.handle_exceptions
def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    """Return the hit dicts from a raw CORDIS response.

    Missing intermediate keys yield an empty list instead of raising.
    """
    hit_container = raw.get('hits', {})
    return hit_container.get('hit', [])
36+
37+
38+
@utils.handle_exceptions
39+
def map_hit(self, hit: Dict[str, Any]):
40+
"""
41+
Map a single hit dict from the source to a object from objects.py (e.g., Article, CreativeWork).
42+
"""
43+
1944
if isinstance(hit, dict):
2045
projectNode = hit.get('project', {})
2146
type = projectNode.get('contenttype', '')
@@ -51,11 +76,33 @@ def search(source: str, search_term: str, results, failed_sources):
5176
project.inLanguage.append(languages)
5277

5378
_source = thing()
54-
_source.name = 'CORDIS'
79+
_source.name = self.SOURCE
5580
_source.identifier = projectNode.get('id', '')
5681
_source.url = project.url
5782
project.source.append(_source)
5883

84+
return project
85+
86+
return None
87+
88+
89+
@utils.handle_exceptions
def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
    """Run the fetch -> extract -> map pipeline for CORDIS.

    Every hit that maps to a truthy project object is appended in-place
    to ``results['projects']``.
    """
    raw = self.fetch(search_term, failed_sources)

    for hit in self.extract_hits(raw):
        project = self.map_hit(hit)
        if project:
            results['projects'].append(project)
61-
102+
103+
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Module-level entry point: delegate a CORDIS search to the class.

    CORDIS hits are mapped to Project objects and collected under
    ``results['projects']``.
    """
    CORDIS().search(source, search_term, results, failed_sources)

sources/core.py

Lines changed: 115 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,90 +1,137 @@
11
from objects import thing, Article, Author, Organization
22
from sources import data_retriever
33
from config import Config
4+
from typing import Iterable, Dict, Any, List
45
import utils
56
import requests
67
from main import app
78

8-
@utils.handle_exceptions
9-
def search(source: str, search_term: str, results, failed_sources):
9+
from sources.base import BaseSource
1010

11-
# we cannot use data_retriever.retrieve_data here because we need to send the request with an API key in the header
12-
# learn more: https://api.core.ac.uk/docs/v3#tag/Search
13-
limit = Config.NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT
14-
api_url = f'https://api.core.ac.uk/v3/search/works/?limit={limit}&q={search_term}&_exists_:doi'
15-
headers = {"Authorization":"Bearer " + Config.CORE_API_KEY}
1611

17-
# send the request
18-
r = requests.get(api_url, headers=headers)
19-
r.raise_for_status()
20-
search_results = r.json()
12+
class CORE(BaseSource):
2113

22-
hits = search_results['results']
23-
total_hits = search_results['totalHits']
24-
total_results = len(hits)
14+
SOURCE = 'CORE'
2515

26-
utils.log_event(type="info", message=f"{source} - {total_hits} records matched; pulled top {total_results}")
27-
print(f"{source} - {total_hits} records matched; pulled top {total_results}")
16+
@utils.handle_exceptions
def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
    """Fetch raw JSON from the CORE v3 works search endpoint.

    We cannot use data_retriever.retrieve_data here because the request
    must carry an API key in the Authorization header.
    Learn more: https://api.core.ac.uk/docs/v3#tag/Search

    Returns the parsed JSON dict on HTTP 200; otherwise records this
    source in ``failed_sources`` and returns None.
    """
    limit = Config.NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT
    # NOTE(review): search_term is interpolated without URL-encoding; terms
    # containing '&', '#' or spaces may corrupt the query string — confirm
    # callers pre-sanitize, or encode here.
    # NOTE(review): '&_exists_:doi' is sent as a bare query parameter; the
    # CORE docs express exists-filters inside 'q' (e.g. 'q=... AND
    # _exists_:doi') — verify the filter actually takes effect.
    api_url = f'https://api.core.ac.uk/v3/search/works/?limit={limit}&q={search_term}&_exists_:doi'
    headers = {"Authorization": "Bearer " + Config.CORE_API_KEY}

    # send the request; without a timeout a stalled connection would hang
    # the worker indefinitely, so cap it (failure is caught by the
    # handle_exceptions decorator like any other request error)
    response = requests.get(api_url, headers=headers, timeout=30)

    if response.status_code == 200:
        return response.json()

    failed_sources.append(self.SOURCE)
    return None
4936

50-
publication.encoding_contentUrl = hit.get("downloadUrl", "")
5137

52-
# publications may not always have a DOI!
53-
# if we don't find one, we do NOT create a result object for the hit
54-
if not hit.get("doi", None):
55-
print("No DOI found for publication:", publication.name)
56-
return None
38+
@utils.handle_exceptions
def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
    """Pull the hit list out of a raw CORE response and log the counts.

    Expects the CORE payload keys 'results' and 'totalHits' to be present.
    """
    hits = raw['results']
    total_hits = raw['totalHits']
    pulled = len(hits)

    utils.log_event(type="info", message=f"{self.SOURCE} - {total_hits} records matched; pulled top {pulled}")

    return hits
51+
52+
53+
@utils.handle_exceptions
def map_hit(self, hit: Dict[str, Any]):
    """Map one CORE hit dict to an Article, or return None when it has no DOI.

    Fills in title, display URL, download URL, DOI, date, language,
    abstract, citation count, publisher, authors, and the source record.
    """
    publication = Article()
    publication.additionalType = hit.get("documentType", "")
    publication.name = hit.get("title", "")

    # go through the links and find the one with type: display
    links = hit.get("links", {})
    for link in links:
        if link.get("type", "") == "display":
            publication.url = link.get("url", "")
            break

    publication.encoding_contentUrl = hit.get("downloadUrl", "")

    # publications may not always have a DOI!
    # if we don't find one, we do NOT create a result object for the hit
    if not hit.get("doi", None):
        # use the project logger instead of a bare debug print, consistent
        # with the logging used elsewhere in this module
        utils.log_event(type="info", message=f"{self.SOURCE} - no DOI found for publication: {publication.name}")
        return None

    publication.identifier = hit.get("doi", "")
    publication.datePublished = hit.get("publishedDate", "")
    publication.inLanguage.append(hit.get("language", {}).get("code", ""))

    # abstracts may be missing or explicitly null; normalize to ""
    abstract = hit.get("abstract") or ""

    publication.description = utils.remove_html_tags(abstract)
    publication.abstract = publication.description

    publication.citationCount = hit.get("citationCount", "")

    if hit.get("publisher", ""):
        _publisher = Organization()
        _publisher.name = hit.get("publisher", "")
        publication.publisher = _publisher

    authors = hit.get("authors", [])
    for author in authors:
        _author = Author()
        _author.additionalType = 'Person'
        _author.name = author.get("name", "")
        publication.author.append(_author)

    # attach provenance so the UI can show where this record came from
    _source = thing()
    _source.name = self.SOURCE
    _source.identifier = publication.identifier
    _source.url = publication.url
    publication.source.append(_source)

    return publication
111+
66112

67-
publication.description = utils.remove_html_tags(abstract)
68-
publication.abstract = publication.description
113+
@utils.handle_exceptions
def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
    """Fetch from CORE, map hits to Articles, and append them to results.

    Articles are inserted in-place into ``results['publications']``.
    """
    raw = self.fetch(search_term, failed_sources)

    # fetch returns None when the HTTP call failed (the source is already
    # recorded in failed_sources) — nothing to map in that case.
    # 'is None', not '== None': identity test for the None sentinel (PEP 8)
    if raw is None:
        return

    hits = self.extract_hits(raw)

    for hit in hits:
        digitalObj = self.map_hit(hit)

        # we only create a result object if we found a DOI, otherwise None
        if digitalObj:
            results['publications'].append(digitalObj)
131+
132+
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """Module-level entry point to search CORE publications."""
    CORE().search(source, search_term, results, failed_sources)

0 commit comments

Comments
 (0)