-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathbaseentity.py
More file actions
415 lines (333 loc) · 16.5 KB
/
baseentity.py
File metadata and controls
415 lines (333 loc) · 16.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
from __future__ import annotations
import logging
import re
from copy import copy
from typing import TYPE_CHECKING, Any
import requests
from entityshape import EntityShape, Result
from pydantic import BaseModel
from pyshex import ShExEvaluator
from pyshex.shex_evaluator import EvaluationResult
from rdflib import Graph
from wikibaseintegrator import wbi_fastrun
from wikibaseintegrator.datatypes import BaseDataType
from wikibaseintegrator.models.claims import Claim, Claims
from wikibaseintegrator.wbi_config import config
from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_exceptions import EntitySchemaDownloadError, MissingEntityException, TtlDownloadError
from wikibaseintegrator.wbi_helpers import delete_page, edit_entity, mediawiki_api_call_helper
from wikibaseintegrator.wbi_login import _Login
if TYPE_CHECKING:
from wikibaseintegrator import WikibaseIntegrator
# Module-level logger, named after this module; used below to record write failures.
log = logging.getLogger(__name__)
class PyshexResult(BaseModel):
    # Simplified outcome of a PyShEx evaluation: only the verdict and its explanation are kept.
    # reason: explanation text taken from the evaluator's result
    # valid: whether the evaluated data conformed to the schema
    reason: str
    valid: bool

    def __str__(self):
        """Return a two-line summary: the validity flag first, then the reason."""
        return f"Valid: {self.valid}\nReason: {self.reason}"
class BaseEntity:
    """
    Base class for Wikibase entity objects.

    It stores the page metadata (title, pageid, lastrevid), the entity type and ID and the entity's claims, and
    provides helpers to fetch, write, clear, delete and validate the entity.

    Every subclass is appended to ``BaseEntity.subclasses`` at class-definition time.
    """

    ETYPE = 'base-entity'
    subclasses: list[type[BaseEntity]] = []

    # Accepted string forms of an EntitySchema ID: optional 'prefix:', optional 'E', then digits.
    # Compiled once here instead of on every call of _get_valid_entity_schema_id().
    _ENTITY_SCHEMA_ID_PATTERN = re.compile(r'^(?:[a-zA-Z]+:)?E?([0-9]+)$')

    def __init__(self, api: WikibaseIntegrator | None = None, title: str | None = None, pageid: int | None = None, lastrevid: int | None = None, type: str | None = None,
                 id: str | None = None, claims: Claims | None = None, is_bot: bool | None = None, login: _Login | None = None):
        """
        :param api: The WikibaseIntegrator object to use. A shallow copy is stored; a new one is created if not given.
        :param title: The page title of the entity
        :param pageid: The page ID of the entity
        :param lastrevid: The last revision ID of the entity
        :param type: The entity type. Defaults to the class attribute ETYPE.
        :param id: The entity ID
        :param claims: The Claims of the entity. An empty Claims object is created if not given.
        :param is_bot: Override the bot flag of the api object (only applied when not None)
        :param login: Override the login of the api object
        """
        if not api:
            # Imported locally: at module level this name is only imported under TYPE_CHECKING.
            from wikibaseintegrator import WikibaseIntegrator
            self.api = WikibaseIntegrator()
        else:
            # Work on a copy so that the overrides below are not written to the caller's object.
            self.api = copy(api)

        # Test against None (as _get() and _write() do) so that an explicit is_bot=False is honoured.
        if is_bot is not None:
            self.api.is_bot = is_bot
        self.api.login = login or self.api.login

        self.title = title
        self.pageid = pageid
        self.lastrevid = lastrevid
        self.type = str(type or self.ETYPE)
        self.id = id
        self.claims = claims or Claims()

    def __init_subclass__(cls, **kwargs):
        """Register every subclass of BaseEntity into BaseEntity.subclasses."""
        super().__init_subclass__(**kwargs)
        cls.subclasses.append(cls)

    @property
    def api(self) -> WikibaseIntegrator:
        """The WikibaseIntegrator object used by this entity."""
        return self.__api

    @api.setter
    def api(self, value: WikibaseIntegrator):
        # Imported locally: at module level this name is only imported under TYPE_CHECKING.
        from wikibaseintegrator import WikibaseIntegrator
        if not isinstance(value, WikibaseIntegrator):
            raise TypeError
        self.__api = value

    @property
    def title(self) -> str | None:
        """The page title of the entity."""
        return self.__title

    @title.setter
    def title(self, value: str | None):
        self.__title = value

    @property
    def pageid(self) -> str | int | None:
        """The page ID of the entity. A string given to the setter is converted with int()."""
        return self.__pageid

    @pageid.setter
    def pageid(self, value: str | int | None):
        self.__pageid = int(value) if isinstance(value, str) else value

    @property
    def lastrevid(self) -> int | None:
        """The last revision ID of the entity."""
        return self.__lastrevid

    @lastrevid.setter
    def lastrevid(self, value: int | None):
        self.__lastrevid = value

    @property
    def type(self) -> str:
        """The entity type."""
        return self.__type

    @type.setter
    def type(self, value: str):
        self.__type = value

    @property
    def id(self) -> str | None:
        """The entity ID."""
        return self.__id

    @id.setter
    def id(self, value: str | None):
        self.__id = value

    @property
    def claims(self) -> Claims:
        """The Claims object of the entity. The setter raises TypeError for anything that is not a Claims instance."""
        return self.__claims

    @claims.setter
    def claims(self, value: Claims):
        if not isinstance(value, Claims):
            raise TypeError
        self.__claims = value

    def add_claims(self, claims: Claim | list[Claim] | Claims, action_if_exists: ActionIfExists = ActionIfExists.APPEND_OR_REPLACE) -> BaseEntity:
        """
        Add one or several claims to the Claims object of this entity.

        :param claims: A Claim, list of Claim or just a Claims object to add to this Claims object.
        :param action_if_exists: Replace or append the statement. You can force an addition if the declaration already exists.
            KEEP: The original claim will be kept and the new one will not be added (because there is already one with this property number)
            APPEND_OR_REPLACE: The new claim will be added only if the new one is different (by comparing values)
            FORCE_APPEND: The new claim will be added even if already exists
            REPLACE_ALL: The new claim will replace the old one
        :return: Return the updated entity object.
        """
        self.claims.add(claims=claims, action_if_exists=action_if_exists)
        return self

    def get_json(self) -> dict[str, str | dict[str, list]]:
        """
        To get the dict equivalent of the JSON representation of the entity.

        :return: A dictionary with the keys 'type' and 'claims', plus 'id' when the entity has an ID.
        """
        json_data: dict = {'type': self.type, 'claims': self.claims.get_json()}
        if self.id:
            json_data['id'] = self.id
        return json_data

    def from_json(self, json_data: dict[str, Any]) -> BaseEntity:
        """
        Import a dictionary into BaseEntity attributes.

        :param json_data: A specific dictionary from MediaWiki API. 'lastrevid', 'type' and 'id' are read unconditionally.
        :return: The entity object itself
        :raises MissingEntityException: If the dictionary contains the key 'missing'
        """
        if 'missing' in json_data:  # TODO: 1.35 compatibility
            raise MissingEntityException('The MW API returned that the entity was missing.')

        if 'title' in json_data:  # TODO: 1.35 compatibility
            self.title = str(json_data['title'])
        if 'pageid' in json_data:  # TODO: 1.35 compatibility
            self.pageid = int(json_data['pageid'])

        self.lastrevid = int(json_data['lastrevid'])
        self.type = str(json_data['type'])
        self.id = str(json_data['id'])

        if 'claims' in json_data:  # 'claims' is named 'statements' in Wikimedia Commons MediaInfo
            self.claims = Claims().from_json(json_data['claims'])

        return self

    def _get(self, entity_id: str, login: _Login | None = None, allow_anonymous: bool = True, is_bot: bool | None = None, **kwargs: Any) -> dict:
        """
        Retrieve an entity in json representation from the Wikibase instance (API action 'wbgetentities').

        :param entity_id: The ID of the entity to retrieve
        :param login: A login instance. Defaults to the login of the api object.
        :param allow_anonymous: Force a check if the query can be anonymous or not
        :param is_bot: Add the bot flag to the query. Defaults to the is_bot value of the api object.
        :param kwargs: More arguments for Python requests
        :return: python complex dictionary representation of a json
        """
        params = {
            'action': 'wbgetentities',
            'ids': entity_id,
            'format': 'json',
        }
        login = login or self.api.login
        if is_bot is None:
            is_bot = self.api.is_bot

        return mediawiki_api_call_helper(data=params, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs)

    def clear(self, **kwargs: Any) -> dict[str, Any]:
        """
        Use the `clear` parameter of `wbeditentity` API call to clear the content of the entity.
        The entity will be updated with an empty dictionary.

        :param kwargs: More arguments for _write() and Python requests
        :return: A dictionary representation of the edited Entity
        """
        return self._write(data={}, clear=True, **kwargs)

    def _write(self, data: dict | None = None, summary: str | None = None, login: _Login | None = None, allow_anonymous: bool = False, limit_claims: list[str | int] | None = None,
               clear: bool = False, as_new: bool = False, is_bot: bool | None = None, **kwargs: Any) -> dict[str, Any]:
        """
        Writes the entity JSON to the Wikibase instance and after successful write, returns the "entity" part of the response.

        The given data dictionary is never modified: filtering and the 'id' reset are applied to a shallow copy.

        :param data: The serialized object that is used as the data source. A newly created entity will be assigned an 'id'.
        :param summary: A summary of the edit
        :param login: A login instance. Defaults to the login of the api object.
        :param allow_anonymous: Force a check if the query can be anonymous or not
        :param limit_claims: Limit to a list of specific claims to reduce the data sent and avoid sending the complete entity.
            Integers are turned into property IDs by prefixing them with 'P'.
        :param clear: If set, the complete entity is emptied before proceeding. The entity will not be saved before it is filled with the "data", possibly with parts excluded.
        :param as_new: Write the entity as a new one
        :param is_bot: Add the bot flag to the query. Defaults to the is_bot value of the api object.
        :param kwargs: More arguments for Python requests
        :return: A dictionary representation of the edited Entity
        """
        # Shallow copy: the top-level keys 'claims' and 'id' are reassigned below.
        data = dict(data) if data else {}

        if limit_claims:
            if not isinstance(limit_claims, list):
                limit_claims = [limit_claims]

            # A payload without any 'claims' key is treated as having no claims instead of raising KeyError.
            available_claims = data.get('claims', {})
            wanted = (f'P{claim}' if isinstance(claim, int) else claim for claim in limit_claims)
            data['claims'] = {prop: available_claims[prop] for prop in wanted if prop in available_claims}

        if is_bot is None:
            is_bot = self.api.is_bot
        login = login or self.api.login

        if as_new:
            entity_id = None
            data['id'] = None
        else:
            entity_id = self.id

        try:
            json_result: dict = edit_entity(data=data, id=entity_id, type=self.type, summary=summary, clear=clear, is_bot=is_bot, allow_anonymous=allow_anonymous,
                                            login=login, **kwargs)
        except Exception:
            # Log with traceback, then let the caller handle the original exception.
            log.exception('Error while writing to the Wikibase instance')
            raise

        return json_result['entity']

    def delete(self, login: _Login | None = None, allow_anonymous: bool = False, is_bot: bool | None = None, **kwargs: Any):
        """
        Delete the current entity. Use the pageid first if available and fallback to the page title.

        :param login: A wbi_login.Login instance. Defaults to the login of the api object.
        :param allow_anonymous: Allow an unidentified edit to the MediaWiki API (default False)
        :param is_bot: Flag the edit as a bot. Passed to delete_page() as given.
        :param kwargs: Any additional keyword arguments to pass to delete_page(), for example:
            reason: Reason for the deletion. If not set, an automatically generated reason will be used.
            deletetalk: Delete the talk page, if it exists.
        :return: The data returned by the API as a dictionary
        :raises ValueError: If neither pageid nor title is set, or if pageid is set but is not an integer
        """
        login = login or self.api.login

        if not self.pageid and not self.title:
            raise ValueError("A pageid or a page title attribute must be set before deleting an entity object.")

        # If there is no pageid, fallback to using the page title. It's not the preferred method.
        if not self.pageid:
            return delete_page(title=self.title, pageid=None, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs)

        if not isinstance(self.pageid, int):
            raise ValueError(f"The entity must have a pageid attribute correctly set ({self.pageid})")

        return delete_page(title=None, pageid=self.pageid, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs)

    def write_required(self, base_filter: list[BaseDataType | list[BaseDataType]] | None = None, action_if_exists: ActionIfExists = ActionIfExists.REPLACE_ALL,
                       **kwargs: Any) -> bool:
        """
        Ask the fastrun container whether the claims of this entity need to be written.

        :param base_filter: The filter given to wbi_fastrun.get_fastrun_container(). Only the claims whose main snak
            property number is found in this list are checked.
        :param action_if_exists: The action passed to the fastrun container
        :param kwargs: More arguments for wbi_fastrun.get_fastrun_container()
        :return: The result of the write_required() method of the fastrun container
        """
        # The container receives base_filter as given (possibly None); only the local filtering uses [].
        fastrun_container = wbi_fastrun.get_fastrun_container(base_filter=base_filter, **kwargs)

        if base_filter is None:
            base_filter = []

        # NOTE(review): the membership test relies on how base_filter elements compare with a property number — confirm.
        claims_to_check = [claim for claim in self.claims if claim.mainsnak.property_number in base_filter]

        # TODO: Add check_language_data

        return fastrun_container.write_required(data=claims_to_check, cqid=self.id, action_if_exists=action_if_exists)

    def get_entity_url(self, wikibase_url: str | None = None) -> str:
        """
        Build the URL of the entity: '<wikibase_url>/entity/<id>'.

        :param wikibase_url: The base URL of the Wikibase instance. Defaults to config['WIKIBASE_URL'].
        :return: The URL of the entity
        :raises ValueError: If no base URL is available or if the entity has no ID
        """
        # Check for emptiness before converting to str, otherwise an unset config value becomes the string 'None'.
        base_url = wikibase_url or config['WIKIBASE_URL']
        if base_url and self.id:
            return str(base_url) + '/entity/' + self.id

        raise ValueError('wikibase_url or entity ID is null.')

    def _get_valid_entity_schema_id(self, entity_schema_id) -> str:
        """
        Normalize an EntitySchema ID to the form 'E<digits>'.

        :param entity_schema_id: A string ('E123', '123' or 'prefix:E123') or a non-negative integer
        :return: The normalized EntitySchema ID
        :raises ValueError: If the value has another type or does not match the expected format
        """
        if isinstance(entity_schema_id, str):
            matches = self._ENTITY_SCHEMA_ID_PATTERN.match(entity_schema_id)
            if matches:
                return f'E{matches.group(1)}'
        # bool is a subclass of int: exclude it explicitly, along with negative numbers, which would give 'ETrue' or 'E-5'.
        elif isinstance(entity_schema_id, int) and not isinstance(entity_schema_id, bool) and entity_schema_id >= 0:
            return f'E{entity_schema_id}'

        raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'")

    def _get_ttl_data(self) -> str:
        """
        Download the entity data in turtle format (ttl)

        The Wikidata Special:EntityData endpoint and a timeout of 10 seconds are hard-coded.

        :return: The body of the response
        :raises TtlDownloadError: If the HTTP status code is not 200
        """
        api_endpoint = 'https://www.wikidata.org/wiki/Special:EntityData/'
        api_url = f'{api_endpoint}{self.id}.ttl'
        # TODO fix timeout
        response = requests.get(api_url, timeout=10)
        if response.status_code != 200:
            raise TtlDownloadError()

        return response.text

    def _get_schema_text(self, entity_schema_id) -> str:
        """
        Downloads the schema from wikidata

        :param entity_schema_id: the entityschema id to be downloaded
        :return: The value of the key "schemaText" of the JSON response
        :raises EntitySchemaDownloadError: If the HTTP status code is not 200
        """
        url: str = f"https://www.wikidata.org/wiki/EntitySchema:{entity_schema_id}?action=raw"
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            raise EntitySchemaDownloadError()

        json_text: dict = response.json()
        return json_text["schemaText"]

    # TODO make an interface for the validator so the user
    # does not have to think about how the internals of the validators work
    # The users should get similar output no matter which validator they choose
    def entityshape_schema_validator(self, entity_schema_id: str, language: str | None = None) -> Result:
        """
        Validate this entity against an EntitySchema with the entityshape library.

        :param entity_schema_id: The EntitySchema ID, normalized with _get_valid_entity_schema_id()
        :param language: The language given to EntityShape. Defaults to config['DEFAULT_LANGUAGE'].
        :return: The Result object returned by EntityShape
        :raises ValueError: If the EntitySchema ID is invalid
        """
        entity_schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id)
        language = str(language or config['DEFAULT_LANGUAGE'])
        return EntityShape(qid=self.id, eid=entity_schema_id, lang=language).validate_and_get_result()

    def pyshex_schema_validator(self, entity_schema_id: str) -> PyshexResult:
        """
        Validate this entity against an EntitySchema with the PyShEx library.

        :param entity_schema_id: The EntitySchema ID, normalized with _get_valid_entity_schema_id()
        :return: A PyshexResult instance
        :raises ValueError: If the EntitySchema ID is invalid
        """
        entity_schema_id = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id)
        return self._check_shex_conformance(entity_schema_id=entity_schema_id)

    def _check_shex_conformance(self, entity_schema_id: str = "", data: str = "") -> PyshexResult:
        """
        Check the conformance of this entity to an EntitySchema with ShExEvaluator.

        :param entity_schema_id: The ID of the EntitySchema whose schema text is downloaded
        :param data: Turtle data to be validated (Optional). If empty, the data of the entity is downloaded.
        :return: A PyshexResult built from the first result of the evaluator. If the evaluator gives no result at all,
            the method falls through and returns None.
        """
        if not data:
            # This downloads the ttl data
            data = self._get_ttl_data()

        # load the string of ttl data into a rdf graph to please ShExEvaluator
        rdfdata = Graph()
        rdfdata.parse(data=data)

        evaluator = ShExEvaluator(rdf=rdfdata, schema=self._get_schema_text(entity_schema_id=entity_schema_id), focus=f"http://www.wikidata.org/entity/{self.id}")

        # Each result is an EvaluationResult named tuple: (result: bool, focus, start, reason: Optional[str]).
        # It is converted to a pydantic class; focus and start are ignored for now as they seem overcomplicated.
        for result in evaluator.evaluate():
            # We return early because we expect only one result from ShExEvaluator
            return PyshexResult(
                valid=result.result,
                # reason is optional in EvaluationResult but a plain str in PyshexResult
                reason=result.reason or "",
            )

    def __repr__(self):
        """A mixin implementing a simple __repr__."""
        attrs = "\r\n\t ".join(f"{k}={v!r}" for k, v in self.__dict__.items())
        return f"<{self.__class__.__name__} @{id(self) & 0xFFFFFF:x} {attrs}>"