11from objects import thing , Article , Author , Organization
22from sources import data_retriever
33from config import Config
4+ from typing import Iterable , Dict , Any , List
45import utils
56import requests
67from main import app
78
8- @utils .handle_exceptions
9- def search (source : str , search_term : str , results , failed_sources ):
9+ from sources .base import BaseSource
1010
11- # we cannot use data_retriever.retrieve_data here because we need to send the request with an API key in the header
12- # learn more: https://api.core.ac.uk/docs/v3#tag/Search
13- limit = Config .NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT
14- api_url = f'https://api.core.ac.uk/v3/search/works/?limit={ limit } &q={ search_term } &_exists_:doi'
15- headers = {"Authorization" :"Bearer " + Config .CORE_API_KEY }
1611
17- # send the request
18- r = requests .get (api_url , headers = headers )
19- r .raise_for_status ()
20- search_results = r .json ()
class CORE(BaseSource):
    """
    Search source for publications indexed by CORE (https://api.core.ac.uk).

    The work is split into three steps that search() chains together:
    fetch() retrieves the raw JSON, extract_hits() pulls the list of hits out of it,
    and map_hit() converts a single hit into an Article.
    """

    SOURCE = 'CORE'

    # Upper bound in seconds for the HTTP call; without it requests waits indefinitely.
    # NOTE(review): 30s is a conservative guess - align with the project's own timeout setting if one exists.
    REQUEST_TIMEOUT = 30

    @utils.handle_exceptions
    def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
        """
        Fetch raw json from the source using the given search term.

        Returns the decoded JSON body when the API answers with HTTP 200.
        For any other status code, SOURCE is appended to failed_sources and None is returned.
        """
        # local import so the class does not depend on an edit of the module's import block
        from urllib.parse import quote

        # we cannot use data_retriever.retrieve_data here because we need to send the request with an API key in the header
        # learn more: https://api.core.ac.uk/docs/v3#tag/Search
        limit = Config.NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT

        # percent-encode the term so that characters such as '&' or '#' cannot break or extend the query string
        # assumes search_term is the raw (not yet URL-encoded) user query - TODO confirm against the caller
        encoded_term = quote(search_term, safe='')
        api_url = f'https://api.core.ac.uk/v3/search/works/?limit={limit}&q={encoded_term}&_exists_:doi'
        headers = {"Authorization": "Bearer " + Config.CORE_API_KEY}

        # send the request
        response = requests.get(api_url, headers=headers, timeout=self.REQUEST_TIMEOUT)

        if response.status_code == 200:
            return response.json()

        failed_sources.append(self.SOURCE)
        return None

    @utils.handle_exceptions
    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """
        Extract the list of hits from the raw JSON response. Should return an iterable of hit dicts.

        Also logs how many records matched in total and how many were actually returned.
        """
        hits = raw['results']
        total_hits = raw['totalHits']
        total_results = len(hits)

        utils.log_event(type="info", message=f"{self.SOURCE} - {total_hits} records matched; pulled top {total_results}")

        return hits

    @utils.handle_exceptions
    def map_hit(self, hit: Dict[str, Any]):
        """
        Map a single hit dict from the source to a object from objects.py (e.g., Article, CreativeWork).

        Returns the populated Article, or None when the hit carries no DOI.
        """
        title = hit.get("title", "")

        # publications may not always have a DOI!
        # if we don't find one, we do NOT create a result object for the hit
        doi = hit.get("doi", None)
        if not doi:
            print("No DOI found for publication:", title)
            return None

        publication = Article()
        publication.additionalType = hit.get("documentType", "")
        publication.name = title

        # go through the links and find the one with type: display
        # 'or []' also covers a key that is present but null
        for link in hit.get("links") or []:
            if link.get("type", "") == "display":
                publication.url = link.get("url", "")
                break

        publication.encoding_contentUrl = hit.get("downloadUrl", "")

        publication.identifier = doi
        publication.datePublished = hit.get("publishedDate", "")
        # the language entry may be null, in which case .get() on it would fail
        publication.inLanguage.append((hit.get("language") or {}).get("code", ""))

        # abstracts may also be empty
        abstract = hit.get("abstract") or ""
        publication.description = utils.remove_html_tags(abstract)
        publication.abstract = publication.description

        publication.citationCount = hit.get("citationCount", "")

        publisher_name = hit.get("publisher", "")
        if publisher_name:
            _publisher = Organization()
            _publisher.name = publisher_name
            publication.publisher = _publisher

        for author in hit.get("authors") or []:
            _author = Author()
            _author.additionalType = 'Person'
            _author.name = author.get("name", "")
            publication.author.append(_author)

        # record where this publication was found
        _source = thing()
        _source.name = self.SOURCE
        _source.identifier = publication.identifier
        _source.url = publication.url
        publication.source.append(_source)

        return publication

    @utils.handle_exceptions
    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """
        Fetch json from the source, extract hits, map them to objects, and insert them in-place into the results dict.

        source_name is accepted for signature compatibility but not used here; the class constant SOURCE is used instead.
        """
        raw = self.fetch(search_term, failed_sources)

        # fetch() yields None when the request did not succeed
        if raw is None:
            return

        hits = self.extract_hits(raw)

        # nothing to map; presumably also the case when extract_hits failed inside its decorator - verify
        if not hits:
            return

        for hit in hits:
            digitalObj = self.map_hit(hit)

            # we only create a result object if we found a DOI, otherwise None
            if digitalObj:
                results['publications'].append(digitalObj)
131+
@utils.handle_exceptions
def search(source: str, search_term: str, results, failed_sources):
    """
    Module-level entrypoint for searching CORE publications.

    Creates a CORE instance and forwards all arguments to its search() method.
    """
    core_source = CORE()
    core_source.search(source, search_term, results, failed_sources)
0 commit comments