Skip to content

Commit 24e2b44

Browse files
authored
Merge pull request #379 from semantic-systems/develop
Push updates from dev to main
2 parents ebbc0bb + 2598631 commit 24e2b44

File tree

2 files changed

+332
-164
lines changed

2 files changed

+332
-164
lines changed

sources/wikidata_publications.py

Lines changed: 127 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,103 +1,138 @@
1-
from objects import thing, Article, Author
1+
from objects import thing, Article, Author, Organization
22
from sources import data_retriever
3+
from typing import Iterable, Dict, Any, List
34
import utils
4-
from main import app
55
from string import Template
66
from datetime import datetime
77
from dateutil import parser
8+
from sources.base import BaseSource
9+
from main import app
10+
11+
class WIKIDATA_Publication(BaseSource):
    """
    Search source for scholarly publications in Wikidata, queried through its
    public SPARQL endpoint. Implements the BaseSource pipeline:
    fetch -> extract_hits -> map_hit, driven by search().
    """

    SOURCE = 'WIKIDATA - Publications'

    @utils.handle_exceptions
    def fetch(self, search_term: str, failed_sources) -> Dict[str, Any]:
        """
        Fetch raw json from the source using the given search term.

        :param search_term: free-text search string substituted into the SPARQL query.
        :param failed_sources: mutable list collecting the names of sources that failed
            (passed through to data_retriever).
        :return: the raw SPARQL JSON response as returned by data_retriever.retrieve_data.
        """
        # Full-text search via the mwapi "search" generator, restricted to items
        # that are (subclasses of) wd:Q11826511 (scholarly article).
        # P577 = publication date, P356 = DOI, P50 = author (linked item),
        # P2093 = author name string (plain text, optional).
        query_template = Template('''
            SELECT DISTINCT ?item ?label ?date ?doi
            (group_concat(DISTINCT ?authorsName; separator=",") as ?authorsLabel)
            (group_concat(DISTINCT ?authors2; separator=",") as ?authorsString)
            WHERE
            {
              SERVICE wikibase:mwapi
              {
                bd:serviceParam wikibase:endpoint "www.wikidata.org";
                                wikibase:limit "once";
                                wikibase:api "Generator";
                                mwapi:generator "search";
                                mwapi:gsrsearch "$search_string";
                                mwapi:gsrlimit "max".
                ?item wikibase:apiOutputItem mwapi:title.
              }
              ?item rdfs:label ?label. FILTER( LANG(?label)="en" )
              ?item wdt:P31/wdt:P279* wd:Q11826511.
              ?item wdt:P577 ?date .
              ?item wdt:P356 ?doi .
              ?item wdt:P50 ?authors.
              ?authors rdfs:label ?authorsName . FILTER( LANG(?authorsName)="en" )
              optional {?item wdt:P2093 ?authors2.}
            }
            GROUP BY ?item ?label ?date ?doi
            LIMIT $number_of_records
        ''')
        replacement_dict = {
            "search_string": search_term,
            "number_of_records": app.config['NUMBER_OF_RECORDS_FOR_SEARCH_ENDPOINT'],
        }
        query = query_template.substitute(replacement_dict)
        # Collapse all whitespace so the query travels as a single request parameter.
        query = ' '.join(query.split())
        search_result = data_retriever.retrieve_data(
            source=self.SOURCE,
            base_url=app.config['DATA_SOURCES'][self.SOURCE].get('search-endpoint', ''),
            search_term=query,
            failed_sources=failed_sources)
        return search_result

    @utils.handle_exceptions
    def extract_hits(self, raw: Dict[str, Any]) -> Iterable[Dict[str, Any]]:
        """
        Extract the list of hits from the raw JSON response.

        :param raw: raw SPARQL JSON response ("results" -> "bindings").
        :return: list of binding dicts; empty list when nothing matched.
        """
        hits = raw.get("results", {}).get("bindings", [])
        total_hits = len(hits)
        utils.log_event(type="info", message=f"{self.SOURCE} - {total_hits} records matched; pulled top {total_hits}")
        # len() already yields an int; an empty list is returned as-is, so the
        # original `if int(total_hits) > 0` branch was redundant.
        return hits

    @utils.handle_exceptions
    def map_hit(self, hit: Dict[str, Any]):
        """
        Map a single hit dict from the source to an Article from objects.py.

        :param hit: one SPARQL result binding.
        :return: a populated Article.
        """
        publication = Article()
        publication.name = hit.get("label", {}).get("value", "")
        publication.url = hit.get("item", {}).get("value", "")
        # DOI is available for few; we need to update the sparql query to fetch this information
        publication.identifier = hit.get("doi", {}).get("value", "")
        publication.datePublished = datetime.strftime(parser.parse(hit.get('date', {}).get('value', "")), '%Y-%m-%d')

        # Both author variants arrive as comma-joined group_concat strings:
        # ?authorsLabel from linked items (P50), ?authorsString from plain
        # name strings (P2093, optional). Both map to Person authors.
        self._append_authors(publication, hit.get("authorsLabel", {}).get("value", ""))
        self._append_authors(publication, hit.get("authorsString", {}).get("value", ""))

        _source = thing()
        _source.name = self.SOURCE  # 'WIKIDATA'
        _source.identifier = hit['item'].get('value', "").replace("http://www.wikidata.org/", "")  # remove the base url and only keep the ID
        _source.url = hit['item'].get('value', "")
        publication.source.append(_source)
        return publication

    def _append_authors(self, publication, joined_names: str) -> None:
        """
        Append one Person Author to *publication* per comma-separated name
        in *joined_names*. Shared by the P50 and P2093 author variants.
        """
        for name in joined_names.rstrip(",").split(","):
            # Skip empties: when the optional ?authorsString binding is absent,
            # "".split(",") yields [""] and would create a phantom nameless author.
            if not name:
                continue
            _author = Author()
            _author.additionalType = 'Person'
            _author.name = name
            _author.identifier = ""  # ORCID is available for few; we need to update the sparql query to pull this information
            author_source = thing(
                name=self.SOURCE,
                identifier=_author.identifier,
            )
            _author.source.append(author_source)
            publication.author.append(_author)

    @utils.handle_exceptions
    def search(self, source_name: str, search_term: str, results: dict, failed_sources: list) -> None:
        """
        Fetch json from the source, extract hits, map them to objects, and insert them in-place into the results dict.

        :param source_name: unused here (kept for entrypoint signature compatibility).
        :param search_term: free-text search string.
        :param results: dict with 'publications' and 'others' lists, mutated in place.
        :param failed_sources: mutable list collecting failed source names.
        """
        raw = self.fetch(search_term, failed_sources)
        # `or []` guards against None if the handle_exceptions decorator
        # swallowed an error inside extract_hits — presumably it returns None
        # in that case; TODO confirm against utils.handle_exceptions.
        for hit in self.extract_hits(raw) or []:
            publication = self.map_hit(hit=hit)
            # Hits with a DOI are treated as publications; the rest as 'others'.
            if publication.identifier != "":
                results["publications"].append(publication)
            else:
                results['others'].append(publication)
def search(source_name: str, search_term: str, results: dict, failed_sources: list):
    """
    Module-level entrypoint: delegate a WIKIDATA publications search to a
    fresh WIKIDATA_Publication instance, which mutates *results* in place.
    """
    source = WIKIDATA_Publication()
    source.search(source_name, search_term, results, failed_sources)

0 commit comments

Comments
 (0)