Skip to content

Commit d91a485

Browse files
committed
Optimization fastrun
1 parent 187d9a6 commit d91a485

File tree

1 file changed

+57
-28
lines changed

1 file changed

+57
-28
lines changed

wikibaseintegrator/wbi_fastrun.py

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -206,8 +206,13 @@ def _load_qualifiers(self, sid: str, limit: int | None = None) -> Qualifiers:
206206
"""
207207
offset = 0
208208

209+
if not isinstance(sid, str):
210+
raise ValueError('sid must be a string')
211+
209212
limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore
210213

214+
# TODO: Add cache
215+
211216
# We force a refresh of the data, remove the previous results
212217
qualifiers: Qualifiers = Qualifiers()
213218
while True:
@@ -261,6 +266,10 @@ def _load_references(self, sid: str, limit: int = 10000) -> References:
261266
if not isinstance(sid, str):
262267
raise ValueError('sid must be a string')
263268

269+
limit = limit or int(config['SPARQL_QUERY_LIMIT']) # type: ignore
270+
271+
# TODO: Add cache
272+
264273
# We force a refresh of the data, remove the previous results
265274
references: References = References()
266275
while True:
@@ -326,6 +335,10 @@ def _load_rank(self, sid: str) -> WikibaseRank | None:
326335
if not isinstance(sid, str):
327336
raise ValueError('sid must be a string')
328337

338+
# TODO: Add limit?
339+
340+
# TODO: Add cache
341+
329342
query = f'''
330343
#Tool: WikibaseIntegrator wbi_fastrun._load_rank
331344
SELECT ?rank WHERE {{
@@ -501,44 +514,60 @@ def contains(in_list, lambda_filter):
501514

502515
# If the property is already found, load it completely to compare deeply
503516
for claim in claims:
517+
# Check if the property is in the filter
504518
if claim.mainsnak.property_number in property_filter:
505519
sparql_value = claim.get_sparql_value()
520+
# If the value exists in the cache
506521
if sparql_value and claim.mainsnak.property_number in self.data and sparql_value in self.data[claim.mainsnak.property_number]:
507-
for statement in self.data[claim.mainsnak.property_number][sparql_value]:
508-
if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter:
509-
continue
510-
if statement['entity'] in common_entities:
511-
if use_qualifiers:
512-
qualifiers = self._load_qualifiers(statement['sid'], limit=100)
513-
514-
if len(qualifiers) != len(claim.qualifiers):
515-
logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers))
516-
return True
517-
518-
for qualifier in qualifiers:
519-
if qualifier not in claim.qualifiers:
520-
logging.debug("Difference between two qualifiers")
522+
entity_cache = [statement['entity'].rsplit('/', 1)[-1] for statement in self.data[claim.mainsnak.property_number][sparql_value]]
523+
if entity_filter:
524+
common_cache_filter = [value for value in entity_cache if value in entity_filter]
525+
else:
526+
common_cache_filter = entity_cache
527+
# If there are common entities between the cache and the entity_filter
528+
if common_cache_filter:
529+
for statement in self.data[claim.mainsnak.property_number][sparql_value]:
530+
if entity_filter and statement['entity'].rsplit('/', 1)[-1] not in entity_filter:
531+
continue
532+
533+
if statement['entity'] in common_entities:
534+
if use_qualifiers:
535+
qualifiers = self._load_qualifiers(statement['sid'], limit=100)
536+
537+
if len(qualifiers) != len(claim.qualifiers):
538+
logging.debug("Difference in number of qualifiers, '%i' != '%i'", len(qualifiers), len(claim.qualifiers))
521539
return True
522540

523-
if use_references:
524-
references = self._load_references(statement['sid'], limit=100)
541+
for qualifier in qualifiers:
542+
if qualifier not in claim.qualifiers:
543+
logging.debug("Difference between two qualifiers")
544+
return True
525545

526-
if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references):
527-
logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references))
528-
return True
546+
if use_references:
547+
references = self._load_references(statement['sid'], limit=100)
529548

530-
for reference in references:
531-
if reference not in claim.references:
532-
logging.debug("Difference between two references")
549+
if sum(len(ref) for ref in references) != sum(len(x) for x in claim.references):
550+
logging.debug("Difference in number of references, '%i' != '%i'", sum(len(ref) for ref in references), sum(len(x) for x in claim.references))
533551
return True
534552

535-
if use_rank:
536-
rank = self._load_rank(statement['sid'])
553+
for reference in references:
554+
if reference not in claim.references:
555+
logging.debug("Difference between two references")
556+
return True
537557

538-
if claim.rank != rank:
539-
logging.debug("Difference with the rank")
540-
return True
541-
# TODO: Add use_rank to compare rank ?
558+
if use_rank:
559+
rank = self._load_rank(statement['sid'])
560+
561+
if claim.rank != rank:
562+
logging.debug("Difference with the rank")
563+
return True
564+
else:
565+
logging.debug("No common entities between cache and entity_filter")
566+
return True
567+
# Enable this if the value doesn't exist ?
568+
else:
569+
logging.debug("Value doesn't already exist in an entity")
570+
return True
542571

543572
return False
544573

0 commit comments

Comments
 (0)