diff --git a/wikiquote_wikidata_matches.py b/wikiquote_wikidata_matches.py
index f6ff8b3..d80c01f 100644
--- a/wikiquote_wikidata_matches.py
+++ b/wikiquote_wikidata_matches.py
@@ -3,6 +3,7 @@
 # import pip
 import os
 import json
+import re
 # pip.main(['list'])
 import pywikibot
 import mysql.connector
@@ -155,14 +156,30 @@
         mydb.commit()
         # exit()
         continue
-
-    skip = 0
+    # no need to create tile if link already exists
     try:
         sitelink = candidate_item_dict['sitelinks'][myresult[3]+'wikiquote']
-    except:
-        #{"type": "files","files": files, "q":myresult[1],'deferred_decision':'yes'}
-        tile = {"id": myresult[0], "sections": [ {"type": "item", "q":myresult[1]}, {"type": "text","title": myresult[2],"url": 'https://'+myresult[3]+'.wikiquote.org/wiki/'+myresult[2].replace(' ','_'),'text':target.text[0:500]}], "controls": [{"type":"buttons", "entries":[{"type": "green","decision": "yes","label": "Match", "api_action": {'action': "wbsetsitelink", "id": myresult[1],"linksite": myresult[3]+'wikiquote',"linktitle": myresult[2]}}, {"type": "white", "decision": "skip", "label": "Skip"}, {"type": "blue", "decision": "no", "label": "No"}]}]}#'q:'+myresult[3]
-        # tile = {"id": myresult[0], "sections": [ {"type": "item", "q":myresult[1]}, {"type": "wikipage","title": myresult[2],"wiki": 'enwikiquote'}], "controls": [{"type":"buttons", "entries":[{"type": "green","decision": "yes","label": "Match", "api_action": {'action': "wbsetsitelink", "id": myresult[1],"linksite": myresult[3]+'wikiquote',"linktitle": myresult[2]}}, {"type": "white", "decision": "skip", "label": "Skip"}, {"type": "blue", "decision": "no", "label": "No"}]}]}#'q:'+myresult[3]
+    except KeyError:
+        cleaned_text = target.text[0:500]
+        # strip template calls: {{...}}
+        cleaned_text = re.sub(r'\{\{[^}]*\}\}', '', cleaned_text)
+        # strip [[Category:...]] links; the colon is required so ordinary links such as [[Categorical imperative]] survive
+        cleaned_text = re.sub(r'\[\[(?:Category|Kategorie|Categoria)\s*:[^\]]*\]\]', '', cleaned_text, flags=re.IGNORECASE)
+        # strip [[File:...]] / [[Image:...]] embeds; the colon is required so ordinary links such as [[Image of God]] survive
+        cleaned_text = re.sub(r'\[\[(?:File|Image|Datei|Fichier|Archivo|Imagem)\s*:[^\]]*\]\]', '', cleaned_text, flags=re.IGNORECASE)
+        # strip behaviour switches such as __NOTOC__
+        cleaned_text = re.sub(r'__[A-Z]+__', '', cleaned_text)
+        # [[w:Page|Display]] -> Display (interwiki links); the target excludes ']' so one match cannot span two links
+        cleaned_text = re.sub(r'\[\[[wW]:[^\]|]+\|([^\]]+)\]\]', r'\1', cleaned_text)
+        # [[Page|Display]] -> Display (piped links)
+        cleaned_text = re.sub(r'\[\[[^\]|]+\|([^\]]+)\]\]', r'\1', cleaned_text)
+        # [[Page]] -> Page (simple links without pipes)
+        cleaned_text = re.sub(r'\[\[([^\]|]+)\]\]', r'\1', cleaned_text)
+        # drop section headers (== Title ==)
+        cleaned_text = re.sub(r'={2,}[^=]+={2,}', '', cleaned_text)
+        # collapse runs of whitespace and trim the ends
+        cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+        tile = {"id": myresult[0], "sections": [ {"type": "item", "q":myresult[1]}, {"type": "text","title": myresult[2],"url": 'https://'+myresult[3]+'.wikiquote.org/wiki/'+myresult[2].replace(' ','_'),'text':cleaned_text}], "controls": [{"type":"buttons", "entries":[{"type": "green","decision": "yes","label": "Match", "api_action": {'action': "wbsetsitelink", "id": myresult[1],"linksite": myresult[3]+'wikiquote',"linktitle": myresult[2]}}, {"type": "white", "decision": "skip", "label": "Skip"}, {"type": "blue", "decision": "no", "label": "No"}]}]}#'q:'+myresult[3]
         tiles.append(tile)
         i += 1
         if i >= int(num):