1- from objects import thing , Article , Author
1+ from objects import thing , Article , Author , Organization
22from sources import data_retriever
3+ from typing import Iterable , Dict , Any , List
34import utils
4- from main import app
55from string import Template
66from datetime import datetime
77from dateutil import parser
8+ from sources .base import BaseSource
9+ from main import app
10+
class WIKIDATA_Publication(BaseSource):
    """
    Source adapter that searches WIKIDATA for scholarly publications via its
    SPARQL endpoint and maps each hit to an ``Article`` object from objects.py.
    """

    SOURCE = 'WIKIDATA - Publications'

    @utils.handle_exceptions
    def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
        """
        Fetch raw JSON from the source using the given search term.

        Builds a SPARQL query (via the mwapi full-text search generator),
        substitutes the search term and the configured record limit, and
        delegates the HTTP call to ``data_retriever.retrieve_data``.
        """
        query_template = Template('''
            SELECT DISTINCT ?item ?label ?date ?doi
            (group_concat(DISTINCT ?authorsName; separator=",") as ?authorsLabel)
            (group_concat(DISTINCT ?authors2; separator=",") as ?authorsString)
            WHERE
            {
              SERVICE wikibase:mwapi
              {
                bd:serviceParam wikibase:endpoint "www.wikidata.org";
                                wikibase:limit "once";
                                wikibase:api "Generator";
                                mwapi:generator "search";
                                mwapi:gsrsearch "$search_string";
                                mwapi:gsrlimit "max".
                ?item wikibase:apiOutputItem mwapi:title.
              }
              ?item rdfs:label ?label. FILTER( LANG(?label)="en" )
              ?item wdt:P31/wdt:P279* wd:Q11826511.
              ?item wdt:P577 ?date .
              ?item wdt:P356 ?doi .
              ?item wdt:P50 ?authors.
              ?authors rdfs:label ?authorsName . FILTER( LANG(?authorsName)="en" )
              optional {?item wdt:P2093 ?authors2.}
            }
            GROUP BY ?item ?label ?date ?doi
            LIMIT $number_of_records
        ''')
        replacement_dict = {
            "search_string": search_term,
            "number_of_records": app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
        }
        query = query_template.substitute(replacement_dict)
        # Collapse all whitespace so the query travels safely as a URL parameter.
        query = ' '.join(query.split())
        search_result = data_retriever.retrieve_data(
            source=self.SOURCE,
            base_url=app.config['DATA_SOURCES'][self.SOURCE].get('search-endpoint', ''),
            search_term=query,
            failed_sources=failed_sources,
        )
        return search_result

    @utils.handle_exceptions
    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """
        Extract the list of hits (SPARQL result bindings) from the raw JSON
        response. Returns an empty list when nothing matched.
        """
        hits = raw.get("results", {}).get("bindings", [])
        total_hits = len(hits)
        utils.log_event(
            type="info",
            message=f"{self.SOURCE} - {total_hits} records matched; pulled top {total_hits}",
        )
        # NOTE: len() already yields an int, so the previous int(total_hits)
        # guard was redundant; an empty bindings list is simply returned as-is.
        return hits

    def _build_author(self, name: str) -> Author:
        """Build an Author (Person) record for one author name string."""
        _author = Author()
        _author.additionalType = 'Person'
        _author.name = name
        _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
        author_source = thing(
            name=self.SOURCE,
            identifier=_author.identifier,
        )
        _author.source.append(author_source)
        return _author

    @utils.handle_exceptions
    def map_hit(self, hit: Dict[str, Any]):
        """
        Map a single hit dict from the source to an ``Article`` object.
        """
        publication = Article()
        publication.name = hit.get("label", {}).get("value", "")
        publication.url = hit.get("item", {}).get("value", "")
        # DOI is available for few; we need to update the sparql query to fetch this information
        publication.identifier = hit.get("doi", {}).get("value", "")
        publication.datePublished = datetime.strftime(
            parser.parse(hit.get('date', {}).get('value', "")), '%Y-%m-%d'
        )

        # Two group_concat columns carry authors: ?authorsLabel (items with an
        # English label) and ?authorsString (plain-text authors, P2093). Both
        # are comma-separated; empty segments are skipped so we never emit a
        # nameless Author (previously "" split to [""] and produced one).
        for author_field in ("authorsLabel", "authorsString"):
            joined_names = hit.get(author_field, {}).get("value", "")
            for author_name in joined_names.rstrip(",").split(","):
                if author_name:
                    publication.author.append(self._build_author(author_name))

        _source = thing()
        _source.name = self.SOURCE
        # Remove the base url and only keep the ID (e.g. "entity/Q12345").
        _source.identifier = hit['item'].get('value', "").replace("http://www.wikidata.org/", "")
        _source.url = hit['item'].get('value', "")
        publication.source.append(_source)
        return publication

    @utils.handle_exceptions
    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """
        Fetch JSON from the source, extract hits, map them to objects, and
        insert them in-place into the results dict: hits with a DOI go under
        'publications', the rest under 'others'.
        """
        raw = self.fetch(search_term, failed_sources)
        # extract_hits is wrapped by handle_exceptions and may yield None on
        # failure; fall back to an empty list instead of crashing on len/iter.
        hits = self.extract_hits(raw) or []
        for hit in hits:
            publication = self.map_hit(hit=hit)
            if publication.identifier != "":
                results['publications'].append(publication)
            else:
                results['others'].append(publication)
def search(source_name: str, search_term: str, results: dict, failed_sources: list):
    """
    Module-level entrypoint to search WIKIDATA publications.

    Instantiates the source adapter and delegates to its ``search`` method,
    which mutates ``results`` and ``failed_sources`` in place.
    """
    source = WIKIDATA_Publication()
    source.search(source_name, search_term, results, failed_sources)
# (removed non-code residue from a web page copy: "0 commit comments")