WikibaseIntegrator/wikibaseintegrator/entities/baseentity.py at 7270888b696c8dbfce683122e0c7bc2a7b45c5f6 · LeMyst/WikibaseIntegrator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
from __future__ import annotations

import logging
import re
from copy import copy
from typing import TYPE_CHECKING, Any

import requests
from entityshape import EntityShape, Result
from pydantic import BaseModel
from pyshex import ShExEvaluator
from pyshex.shex_evaluator import EvaluationResult
from rdflib import Graph

from wikibaseintegrator import wbi_fastrun
from wikibaseintegrator.datatypes import BaseDataType
from wikibaseintegrator.models.claims import Claim, Claims
from wikibaseintegrator.wbi_config import config
from wikibaseintegrator.wbi_enums import ActionIfExists
from wikibaseintegrator.wbi_exceptions import EntitySchemaDownloadError, MissingEntityException, TtlDownloadError
from wikibaseintegrator.wbi_helpers import delete_page, edit_entity, mediawiki_api_call_helper
from wikibaseintegrator.wbi_login import _Login

if TYPE_CHECKING:
    from wikibaseintegrator import WikibaseIntegrator

log = logging.getLogger(__name__)


class PyshexResult(BaseModel):
    # Outcome of a PyShEx validation run, as built by BaseEntity._check_shex_conformance.
    # reason: explanation text taken from the evaluator's result
    reason: str
    # valid: verdict taken from the evaluator's result
    valid: bool

    def __str__(self):
        """Render the verdict and the reason on two separate lines."""
        lines = (f"Valid: {self.valid}", f"Reason: {self.reason}")
        return "\n".join(lines)


class BaseEntity:
    ETYPE = 'base-entity'
    subclasses: list[type[BaseEntity]] = []

    def __init__(self, api: WikibaseIntegrator | None = None, title: str | None = None, pageid: int | None = None, lastrevid: int | None = None, type: str | None = None,
                 id: str | None = None, claims: Claims | None = None, is_bot: bool | None = None, login: _Login | None = None):
        """
        Create an entity bound to its own WikibaseIntegrator instance.

        :param api: The WikibaseIntegrator instance to use. A shallow copy is stored, so the per-entity `is_bot` and `login` overrides below do not modify the instance passed in.
            When omitted, a new WikibaseIntegrator() is created.
        :param title: The page title of the entity
        :param pageid: The page id of the entity
        :param lastrevid: The last revision id of the entity
        :param type: The entity type. Defaults to the class attribute ETYPE.
        :param id: The entity ID
        :param claims: A Claims object. Defaults to a new, empty Claims().
        :param is_bot: Override the bot flag of the api instance. None keeps the api value; an explicit True or False replaces it.
        :param login: Override the login of the api instance. A falsy value keeps the api login.
        """
        if not api:
            from wikibaseintegrator import WikibaseIntegrator
            self.api = WikibaseIntegrator()
        else:
            self.api = copy(api)

        # Use an explicit None test (as _get() and _write() do) so that is_bot=False can switch off a bot flag inherited from the api.
        self.api.is_bot = is_bot if is_bot is not None else self.api.is_bot
        self.api.login = login or self.api.login

        self.title = title
        self.pageid = pageid
        self.lastrevid = lastrevid
        self.type = str(type or self.ETYPE)
        self.id = id
        self.claims = claims or Claims()

    # Allow registration of subclasses of BaseEntity into BaseEntity.subclasses
    def __init_subclass__(cls, **kwargs):
        """
        Hook run when a subclass is defined: record the new class in the `subclasses` list.

        :param kwargs: Class keyword arguments, forwarded unchanged to the parent hook
        """
        super().__init_subclass__(**kwargs)
        # `subclasses` is looked up through cls, so this is the list declared on BaseEntity unless a subclass rebinds the attribute
        cls.subclasses.append(cls)

    @property
    def api(self) -> WikibaseIntegrator:
        """The WikibaseIntegrator instance this entity uses for its login and bot-flag defaults."""
        return self.__api

    @api.setter
    def api(self, value: WikibaseIntegrator):
        """
        Set the WikibaseIntegrator instance.

        :param value: A WikibaseIntegrator instance
        :raises TypeError: If value is not a WikibaseIntegrator instance
        """
        # Imported here rather than at module level, where it is only available under TYPE_CHECKING
        from wikibaseintegrator import WikibaseIntegrator
        if not isinstance(value, WikibaseIntegrator):
            raise TypeError(f"api must be a WikibaseIntegrator instance, not {value.__class__.__name__}")
        self.__api = value

    @property
    def title(self) -> str | None:
        """The page title of the entity, or None when it has not been set."""
        return self.__title

    @title.setter
    def title(self, value: str | None):
        """Store the page title as given; no validation or conversion is applied."""
        self.__title = value

    @property
    def pageid(self) -> str | int | None:
        """The page id of the entity, or None when it has not been set."""
        return self.__pageid

    @pageid.setter
    def pageid(self, value: str | int | None):
        """
        Store the page id.

        :param value: A string is converted with int() (raising ValueError if it is not numeric); any other value is stored unchanged.
        """
        if isinstance(value, str):
            self.__pageid: str | int | None = int(value)
        else:
            self.__pageid = value

    @property
    def lastrevid(self) -> int | None:
        """The last revision id of the entity, or None when it has not been set."""
        return self.__lastrevid

    @lastrevid.setter
    def lastrevid(self, value: int | None):
        """Store the last revision id as given; no validation or conversion is applied."""
        self.__lastrevid = value

    @property
    def type(self) -> str:
        """The entity type string (initialised from the ETYPE class attribute when no type is passed to the constructor)."""
        return self.__type

    @type.setter
    def type(self, value: str):
        """Store the entity type as given; no validation or conversion is applied."""
        self.__type = value

    @property
    def id(self) -> str | None:
        """The entity ID, or None when it has not been set."""
        return self.__id

    @id.setter
    def id(self, value: str | None):
        """Store the entity ID as given; no validation or conversion is applied."""
        self.__id = value

    @property
    def claims(self) -> Claims:
        """The Claims container holding the claims of this entity."""
        return self.__claims

    @claims.setter
    def claims(self, value: Claims):
        """
        Replace the Claims container of this entity.

        :param value: A Claims instance
        :raises TypeError: If value is not a Claims instance
        """
        if not isinstance(value, Claims):
            raise TypeError(f"claims must be a Claims instance, not {value.__class__.__name__}")
        self.__claims = value

    def add_claims(self, claims: Claim | list[Claim] | Claims, action_if_exists: ActionIfExists = ActionIfExists.APPEND_OR_REPLACE) -> BaseEntity:
        """
        Add one or more claims to this entity by delegating to `self.claims.add()`.

        :param claims: A Claim, list of Claim or just a Claims object to add to this Claims object.
        :param action_if_exists: Replace or append the statement. You can force an addition if the declaration already exists.
            KEEP: The original claim will be kept and the new one will not be added (because there is already one with this property number)
            APPEND_OR_REPLACE: The new claim will be added only if the new one is different (by comparing values)
            FORCE_APPEND: The new claim will be added even if already exists
            REPLACE_ALL: The new claim will replace the old one
        :return: Return the updated entity object (self), which allows call chaining.
        """

        self.claims.add(claims=claims, action_if_exists=action_if_exists)

        return self

    def get_json(self) -> dict[str, str | dict[str, list]]:
        """
        Build the dictionary form of the entity's JSON representation.

        The result always holds 'type' and 'claims'; 'id' is only included when the entity has a truthy ID.

        :return: A dictionary with the keys 'type', 'claims' and, when available, 'id'
        """
        payload: dict = {'type': self.type, 'claims': self.claims.get_json()}

        entity_id = self.id
        if not entity_id:
            return payload

        payload['id'] = entity_id
        return payload

    def from_json(self, json_data: dict[str, Any]) -> BaseEntity:
        """
        Import a dictionary into BaseEntity attributes.

        :param json_data: A specific dictionary from MediaWiki API
        :return:
        """
        if 'missing' in json_data:  # TODO: 1.35 compatibility
            raise MissingEntityException('The MW API returned that the entity was missing.')

        if 'title' in json_data:  # TODO: 1.35 compatibility
            self.title = str(json_data['title'])
        if 'pageid' in json_data:  # TODO: 1.35 compatibility
            self.pageid = int(json_data['pageid'])
        self.lastrevid = int(json_data['lastrevid'])
        self.type = str(json_data['type'])
        self.id = str(json_data['id'])
        if 'claims' in json_data:  # 'claims' is named 'statements' in Wikimedia Commons MediaInfo
            self.claims = Claims().from_json(json_data['claims'])

        return self

    # noinspection PyMethodMayBeStatic
    def _get(self, entity_id: str, login: _Login | None = None, allow_anonymous: bool = True, is_bot: bool | None = None, **kwargs: Any) -> dict:  # pylint: disable=no-self-use
        """
        Fetch the JSON representation of an entity through a `wbgetentities` API call.

        :param entity_id: The ID of the entity to retrieve
        :param login: A login instance. Falls back to the login of the api instance when falsy.
        :param allow_anonymous: Force a check if the query can be anonymous or not
        :param is_bot: Add the bot flag to the query. Falls back to the bot flag of the api instance when None.
        :param kwargs: More arguments for Python requests
        :return: python complex dictionary representation of a json
        """

        effective_login = login or self.api.login
        bot_flag = self.api.is_bot if is_bot is None else is_bot

        query = dict(action='wbgetentities', ids=entity_id, format='json')

        return mediawiki_api_call_helper(data=query, login=effective_login, allow_anonymous=allow_anonymous, is_bot=bot_flag, **kwargs)

    def clear(self, **kwargs: Any) -> dict[str, Any]:
        """
        Use the `clear` parameter of `wbeditentity` API call to clear the content of the entity.
        The entity will be updated with an empty dictionary.

        This is a thin wrapper: it calls `_write()` with `data={}` and `clear=True`.

        :param kwargs: More arguments for _write() and Python requests
        :return: A dictionary representation of the edited Entity
        """
        return self._write(data={}, clear=True, **kwargs)

    def _write(self, data: dict | None = None, summary: str | None = None, login: _Login | None = None, allow_anonymous: bool = False, limit_claims: list[str | int] | None = None,
               clear: bool = False, as_new: bool = False, is_bot: bool | None = None, **kwargs: Any) -> dict[str, Any]:
        """
        Writes the entity JSON to the Wikibase instance and after successful write, returns the "entity" part of the response.

        The dictionary passed as `data` is never modified: the filtering done for `limit_claims` and the 'id' reset done
        for `as_new` are applied to a shallow copy.

        :param data: The serialized object that is used as the data source. A newly created entity will be assigned an 'id'.
        :param summary: A summary of the edit
        :param login: A login instance. Falls back to the login of the api instance when falsy.
        :param allow_anonymous: Force a check if the query can be anonymous or not
        :param limit_claims: Limit to a list of specific claims to reduce the data sent and avoid sending the complete entity.
            Integers are turned into property numbers by prefixing them with 'P'. A single value is accepted as well as a list.
        :param clear: If set, the complete entity is emptied before proceeding. The entity will not be saved before it is filled with the "data", possibly with parts excluded.
        :param as_new: Write the entity as a new one
        :param is_bot: Add the bot flag to the query. Falls back to the bot flag of the api instance when None.
        :param kwargs: More arguments for Python requests
        :return: A dictionary representation of the edited Entity
        """

        # Shallow copy: the keys rewritten below ('claims', 'id') must not leak back into the caller's dictionary
        data = dict(data) if data else {}

        if limit_claims:
            if not isinstance(limit_claims, list):
                limit_claims = [limit_claims]

            # `data` may carry no claims at all (for example when called from clear()); treat that as "nothing to keep"
            available_claims = data.get('claims') or {}

            new_claims = {}
            for claim in limit_claims:
                if isinstance(claim, int):
                    claim = 'P' + str(claim)

                if claim in available_claims:
                    new_claims[claim] = available_claims[claim]

            data['claims'] = new_claims

        is_bot = is_bot if is_bot is not None else self.api.is_bot
        login = login or self.api.login

        if as_new:
            # Drop any existing ID so that the write does not target an existing entity
            entity_id = None
            data['id'] = None
        else:
            entity_id = self.id

        try:
            json_result: dict = edit_entity(data=data, id=entity_id, type=self.type, summary=summary, clear=clear, is_bot=is_bot, allow_anonymous=allow_anonymous,
                                            login=login, **kwargs)
        except Exception:
            # Log with traceback for diagnosis, then let the caller handle the original exception
            log.exception('Error while writing to the Wikibase instance')
            raise

        return json_result['entity']

    def delete(self, login: _Login | None = None, allow_anonymous: bool = False, is_bot: bool | None = None, **kwargs: Any):
        """
        Delete the current entity. Use the pageid first if available and fallback to the page title.

        :param login: A wbi_login.Login instance
        :param allow_anonymous: Allow an unidentified edit to the MediaWiki API (default False)
        :param is_bot: Flag the edit as a bot
        :param reason: Reason for the deletion. If not set, an automatically generated reason will be used.
        :param deletetalk: Delete the talk page, if it exists.
        :param kwargs: Any additional keyword arguments to pass to mediawiki_api_call_helper and requests.request
        :return: The data returned by the API as a dictionary
        """

        login = login or self.api.login

        if not self.pageid and not self.title:
            raise ValueError("A pageid or a page title attribute must be set before deleting an entity object.")

        # If there is no pageid, fallback to using the page title. It's not the preferred method.
        if not self.pageid:
            return delete_page(title=self.title, pageid=None, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs)
        else:
            if not isinstance(self.pageid, int):
                raise ValueError(f"The entity must have a pageid attribute correctly set ({self.pageid})")

            return delete_page(title=None, pageid=self.pageid, login=login, allow_anonymous=allow_anonymous, is_bot=is_bot, **kwargs)

    def write_required(self, base_filter: list[BaseDataType | list[BaseDataType]] | None = None, action_if_exists: ActionIfExists = ActionIfExists.REPLACE_ALL,
                       **kwargs: Any) -> bool:
        """
        Ask the fastrun container whether this entity needs to be written.

        Only the claims whose main snak property number is found in `base_filter` are handed to the container.

        :param base_filter: The filter used to obtain the fastrun container and to select the claims to check. None selects no claim.
        :param action_if_exists: Forwarded to the fastrun container's write_required()
        :param kwargs: More arguments for wbi_fastrun.get_fastrun_container()
        :return: The value returned by the fastrun container's write_required()
        """
        # The container is requested with the filter exactly as received (possibly None)
        container = wbi_fastrun.get_fastrun_container(base_filter=base_filter, **kwargs)

        selected_filter = [] if base_filter is None else base_filter
        candidates = [claim for claim in self.claims if claim.mainsnak.property_number in selected_filter]

        # TODO: Add check_language_data

        return container.write_required(data=candidates, cqid=self.id, action_if_exists=action_if_exists)

    def get_entity_url(self, wikibase_url: str | None = None) -> str:
        from wikibaseintegrator.wbi_config import config
        wikibase_url = wikibase_url or str(config['WIKIBASE_URL'])
        if wikibase_url and self.id:
            return wikibase_url + '/entity/' + self.id

        raise ValueError('wikibase_url or entity ID is null.')

    def _get_valid_entity_schema_id(self, entity_schema_id) -> str:
        if isinstance(entity_schema_id, str):
            pattern = re.compile(r'^(?:[a-zA-Z]+:)?E?([0-9]+)$')
            matches = pattern.match(entity_schema_id)

            if not matches:
                raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'")

            entity_schema_id = f'E{matches.group(1)}'
        elif isinstance(entity_schema_id, int):
            entity_schema_id = f'E{entity_schema_id}'
        else:
            raise ValueError(f"Invalid EntitySchema ID ({entity_schema_id}), format must be 'E[0-9]+'")
        return entity_schema_id

    def _get_ttl_data(self) -> str:
        """
        Download the entity data in turtle format (ttl).

        The request always targets Special:EntityData on www.wikidata.org, using the ID of this entity.

        :return: The body of the response as text
        :raises TtlDownloadError: If the response status code is not 200
        """
        # TODO fix timeout
        response = requests.get(f'https://www.wikidata.org/wiki/Special:EntityData/{self.id}.ttl', timeout=10)
        if response.status_code != 200:
            raise TtlDownloadError()

        return response.text

    def _get_schema_text(self, entity_schema_id) -> str:
        """
        Download an EntitySchema from www.wikidata.org and return its schema text.

        :param entity_schema_id: the entityschema id to be downloaded
        :return: The value stored under 'schemaText' in the JSON response
        :raises EntitySchemaDownloadError: If the response status code is not 200
        """
        response = requests.get(f"https://www.wikidata.org/wiki/EntitySchema:{entity_schema_id}?action=raw", timeout=10)
        if response.status_code != 200:
            raise EntitySchemaDownloadError()

        schema_document: dict = response.json()
        return schema_document["schemaText"]

    # TODO make an interface for the validator so the user
    #  does not have to think about how the internals of the validators work
    # The users should get similar output no matter which validator they choose
    def entityshape_schema_validator(self, entity_schema_id: str, language: str | None = None) -> Result:
        """
        Validate this entity against an EntitySchema with the entityshape library.

        :param entity_schema_id: The EntitySchema ID; it is normalised with _get_valid_entity_schema_id() first
        :param language: The language passed to EntityShape. Falls back to config['DEFAULT_LANGUAGE'] when falsy.
        :return: The Result returned by EntityShape.validate_and_get_result()
        :raises ValueError: If the EntitySchema ID is invalid
        """
        eid = self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id)
        lang = str(language or config['DEFAULT_LANGUAGE'])
        shape = EntityShape(qid=self.id, eid=eid, lang=lang)
        return shape.validate_and_get_result()

    def pyshex_schema_validator(self, entity_schema_id: str) -> PyshexResult:
        """
        Validate this entity against an EntitySchema with PyShEx.

        :param entity_schema_id: The EntitySchema ID; it is normalised with _get_valid_entity_schema_id() first
        :return: The PyshexResult produced by _check_shex_conformance()
        :raises ValueError: If the EntitySchema ID is invalid
        """
        return self._check_shex_conformance(
            entity_schema_id=self._get_valid_entity_schema_id(entity_schema_id=entity_schema_id),
        )

    def _check_shex_conformance(self, entity_schema_id: str = "", data: str = "") -> PyshexResult:
        """
        Check the conformance of this entity to an EntitySchema using ShExEvaluator.

        The schema text is downloaded with _get_schema_text() and the evaluation is focused on
        `http://www.wikidata.org/entity/<id of this entity>`.

        :param entity_schema_id: The ID of the EntitySchema to validate against
        :param data: Turtle data to be validated (Optional). When empty, it is downloaded with _get_ttl_data().
        :return: A PyshexResult built from the first result of the evaluator. If the evaluator yields no result at all,
            an invalid PyshexResult explaining so is returned, so that callers always receive a PyshexResult.
        """
        if not data:
            # This downloads the ttl data
            data = self._get_ttl_data()

        # load the string of ttl data into a rdf graph to please ShExEvaluator
        rdfdata = Graph()
        rdfdata.parse(data=data)

        schema_text = self._get_schema_text(entity_schema_id=entity_schema_id)
        evaluator = ShExEvaluator(rdf=rdfdata, schema=schema_text, focus=f"http://www.wikidata.org/entity/{self.id}")

        # Only the first result is used because a single result is expected from ShExEvaluator
        first_result = next(iter(evaluator.evaluate()), None)
        if first_result is None:
            return PyshexResult(valid=False, reason="ShExEvaluator did not return any result")

        # EvaluationResult is a named tuple of (result, focus, start, reason).
        # focus and start are ignored for now as they seem overcomplicated.
        valid, _focus, _start, reason = first_result

        # reason is Optional[str] in EvaluationResult while PyshexResult.reason must be a str
        return PyshexResult(valid=valid, reason=reason or "")

    def __repr__(self):
        """A mixin implementing a simple __repr__."""
        return "<{klass} @{id:x} {attrs}>".format(  # pylint: disable=consider-using-f-string
            klass=self.__class__.__name__,
            id=id(self) & 0xFFFFFF,
            attrs="\r\n\t ".join(f"{k}={v!r}" for k, v in self.__dict__.items()),
        )