-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_sentencediff_parser.py
More file actions
138 lines (120 loc) · 5.28 KB
/
data_sentencediff_parser.py
File metadata and controls
138 lines (120 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import requests
import re
import spacy
from bs4 import BeautifulSoup
from wikitextparser import parse
# Shared spaCy pipeline, loaded once at import time; is_valid_sentence() runs
# every input text through it for sentence splitting and part-of-speech tags.
nlp = spacy.load("en_core_web_sm")
def is_valid_sentence(text):
    """
    Split the input text into sentences and return the valid ones as strings.

    Despite the name, the result is a list (possibly empty), not a boolean.
    A sentence is kept when:
    - its first token is title-cased
    - its last token is punctuation
    - it contains at least one NOUN/PROPN/PRON token and at least one VERB token
    """
    nominal_tags = ("NOUN", "PROPN", "PRON")
    accepted = []
    for span in nlp(text).sents:
        # Guard clause: reject sentences with the wrong opening/closing token.
        if not (span[0].is_title and span[-1].is_punct):
            continue
        tags = [token.pos_ for token in span]
        has_nominal = any(tag in nominal_tags for tag in tags)
        has_verb = "VERB" in tags
        if has_nominal and has_verb:
            accepted.append(str(span))
    return accepted
def call_wikipedia_api(revid, parentid, timeout=30):
    """
    Calls Wikipedia API to get HTML diff between two revisions.

    The request uses ``action=compare`` with ``fromrev=parentid`` and
    ``torev=revid`` against the English Wikipedia endpoint.

    Args:
        revid: Revision ID sent as ``torev``.
        parentid: Revision ID sent as ``fromrev``.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled connection would block the caller
            indefinitely.

    Returns:
        The value stored under ``compare`` -> ``*`` in the JSON response,
        or None when the response has no such entry.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        'action': "compare",
        'format': "json",
        'fromrev': parentid,
        'torev': revid,
    }
    response = requests.get(url, params=params, timeout=timeout)
    # A response without a 'compare' object yields None rather than a KeyError.
    return response.json().get('compare', {}).get('*')
def eachrev(revid, parentid):
    """
    Parses the revision diff to extract sentences with link edits and citation edits.

    The HTML diff is fetched with ``call_wikipedia_api``. Every element that
    directly contains an inline deletion
    (``<del class="diffchange diffchange-inline">``) is processed separately:
    wiki links are collected from its text, deleted spans are wrapped in
    ``((`` ... ``))``, references are swapped for placeholders, and the result
    is split into sentences with ``is_valid_sentence``. A sentence is reported
    only if it contains a ``((``/``))`` marker and the label of at least one
    collected link.

    Returns:
        ssum (int): Number of reported sentences
        sl (list): Reported sentences (markers and placeholders still included)
        slinklist (list): Per sentence, the " | "-joined link targets
        slinklist2 (list): Per sentence, the " | "-joined link display texts
        alllinks (list): All link targets, flattened across sentences
        allscores (list): Score per entry of alllinks (1 / links in its sentence)
        refcl (list): Per sentence, the " | "-joined changed references
        refdl (list): Per sentence, the " | "-joined deleted references

        ``(0, [], [], [], [], [], [], [])`` is returned when the API gives no
        diff, or when sentence extraction raises for any processed element.
    """
    text = call_wikipedia_api(revid, parentid)
    if not text:
        return 0, [], [], [], [], [], [], []

    soup = BeautifulSoup(text, "lxml")
    deletions = soup.find_all("del", class_="diffchange diffchange-inline")
    # De-duplicate the containing elements while keeping document order, so the
    # order of the returned lists is reproducible between runs.
    parents = list(dict.fromkeys(s.parent for s in deletions))

    ssum, alllinks, allscores = 0, [], []
    sl, slinklist, slinklist2 = [], [], []
    refcl, refdl = [], []
    refcdict, refddict = {}, {}
    count_refc = count_refd = 0
    media_markers = ("file:", "image:")

    for parent in parents:
        text_html = str(parent)
        raw_text = parent.text

        # Extract links: keep [[target]] and [[target|label]], drop links with
        # more than one pipe and anything mentioning file:/image:.
        links = re.findall(r"\[\[(.*?)\]\]", raw_text)
        flat_links = [
            link for link in links
            if link.count("|") <= 1
            and not any(k in link.lower() for k in media_markers)
        ]
        labels, targets = [], []
        for link in flat_links:
            target, sep, label = link.partition("|")
            targets.append(target)
            labels.append(label if sep else target)
        # With duplicate labels the last target wins.
        link_dict = dict(zip(labels, targets))

        # Replace the <del> tags with text markers that survive tag stripping.
        changed = text_html.replace('<del class="diffchange diffchange-inline">', "-(d(-").replace("</del>", "-)d)-")
        for r in links:
            if any(k in r.lower() for k in media_markers):
                changed = changed.replace(f"[[{r}]]", "")

        parsed_text = BeautifulSoup(changed, "lxml").text
        refs = parse(parsed_text).get_tags()
        ref_strings = [r.string for r in refs]

        # Detect changed citations (tags that contain a deletion marker).
        for ref in [r for r in ref_strings if "-(d(-" in r or "-)d)-" in r]:
            # `ref` comes from the unescaped text, but `changed` is serialized
            # HTML in which &, < and > appear as entities; escape before
            # searching, otherwise the placeholder is never inserted.
            safe = ref.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")
            marker = f"-((refc{count_refc}refc))-"
            changed = changed.replace(safe, marker)
            refcdict[marker] = ref.replace("-(d(-", "((").replace("-)d)-", "))")
            count_refc += 1

        # Detect fully deleted citations
        # NOTE(review): `d` and `changed` are serialized HTML while `r` is
        # unescaped wikitext, so the literal "<"/">" test and `r in d` may
        # never match on real diff output - confirm and escape if so.
        for d in re.findall(r'<del class=\"diffchange diffchange-inline\">(.*?)</del>', text_html):
            if "<" in d and ">" in d and "ref" in d:
                for r in ref_strings:
                    if r in d:
                        marker = f"-((refdel{count_refd}refdel))-"
                        changed = changed.replace(r, marker)
                        refddict[marker] = r
                        count_refd += 1

        # Clean markup to prepare for sentence parsing: move sentence-ending
        # punctuation behind any tags (and template) that directly follow it.
        # NOTE(review): these patterns look for literal "<...>" in serialized
        # HTML - confirm they match the intended reference markup.
        changed = re.sub(r'(\.|[?!])((<.*?>)+(\{\{.*?\}\}))(\s)', r'\2\1 ', changed)
        changed = re.sub(r'(\.|[?!])((<.*?>)+)(\s)', r'\2\1 ', changed)
        changed = changed.replace("-(d(-", "((").replace("-)d)-", "))")

        try:
            sentences = is_valid_sentence(parse(changed).plain_text())
        except Exception:
            # Exception (not a bare except) so Ctrl-C and interpreter exit
            # still propagate; the best-effort empty result is kept.
            print("error: no sentence")
            return 0, [], [], [], [], [], [], []

        for sent in sentences:
            # Only sentences touched by a deletion marker are of interest.
            if "((" not in sent and "))" not in sent:
                continue
            clean = sent.replace("((", "").replace("))", "")
            matched_labels = [label for label in labels if label in clean]
            if not matched_labels:
                continue
            in_links = [link_dict[label] for label in matched_labels]
            ssum += 1
            sl.append(sent)
            slinklist.append(" | ".join(in_links))
            slinklist2.append(" | ".join(matched_labels))
            alllinks += in_links
            # Every link in a sentence gets an equal share of weight 1.
            allscores += [1 / len(in_links)] * len(in_links)
            refcl.append(" | ".join([refcdict[r] for r in re.findall(r'\-\(\(refc\d+refc\)\)\-', sent)]))
            refdl.append(" | ".join([refddict[r] for r in re.findall(r'\-\(\(refdel\d+refdel\)\)\-', sent)]))

    return ssum, sl, slinklist, slinklist2, alllinks, allscores, refcl, refdl
if __name__ == "__main__":
    # Script entry point: run eachrev on one fixed pair of revision IDs and
    # print the resulting tuple.
    print(eachrev("794386526", "852661894"))