-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_candidates.py
More file actions
46 lines (36 loc) · 1.47 KB
/
extract_candidates.py
File metadata and controls
46 lines (36 loc) · 1.47 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import nltk
import re
import operator
from textblob import TextBlob
def extract_candidates(comparison_object, sentences):
    """Count the noun phrases that are compared against *comparison_object*.

    Every sentence is parsed with TextBlob. A noun phrase is counted (once
    per occurrence) when it is not the comparison object itself, is not a
    bare "vs"/"vs.", and ``is_candidate`` reports that the sentence pairs it
    with the comparison object.

    Args:
        comparison_object: The object the candidates are compared against.
        sentences: Any iterable of sentence strings (a list, a generator, ...).

    Returns:
        A list of ``(candidate, count)`` tuples sorted by count, highest
        first. Candidates with equal counts keep first-seen order because
        the sort is stable.
    """
    from collections import Counter

    # Built once instead of once per noun phrase. The lower-cased form is
    # excluded as well: is_candidate matches case-insensitively, and TextBlob
    # appears to return noun phrases lower-cased (TODO confirm against the
    # TextBlob docs), so the object itself must not slip through on case.
    excluded = {comparison_object, comparison_object.lower(), 'vs', 'vs.'}

    counts = Counter()
    for sentence in sentences:
        for candidate in TextBlob(sentence).noun_phrases:
            if candidate in excluded:
                continue
            if is_candidate(candidate, comparison_object, sentence):
                counts[candidate] += 1

    return sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
def is_candidate(candidate, comparison_object, sentence):
    """Tell whether *sentence* opens with a "vs" pairing of the two objects.

    The sentence must begin with ``<candidate> vs <comparison_object>`` or
    ``<comparison_object> vs <candidate>``, where "vs" may be followed by a
    period. Matching ignores case.

    Args:
        candidate: Noun phrase being tested; treated as literal text.
        comparison_object: Object it is compared with; treated as literal text.
        sentence: The sentence to inspect.

    Returns:
        ``True`` if the sentence starts with such a pairing, else ``False``.
    """
    # Both operands are plain text, not patterns: escape them so names such
    # as "c++" or "c (lang" match literally instead of raising re.error or
    # matching the wrong thing.
    candidate = re.escape(candidate)
    comparison_object = re.escape(comparison_object)

    # "vs" with an optional literal period; an unescaped "." would also
    # accept "vsx", "vs,", and so on.
    versus = r' vs\.? '

    pattern = (
        '(?:' + candidate + versus + comparison_object
        + '|' + comparison_object + versus + candidate + ')'
    )

    # NOTE(review): re.match only matches at the very start of the sentence.
    # If pairings in the middle of a sentence should count too, this needs
    # re.search (plus word boundaries) -- confirm against the callers' data.
    return re.match(pattern, sentence, re.IGNORECASE) is not None
def tag_sentence(sentence):
    """Tokenize *sentence* and tag each token with its part of speech.

    The sentence is split into words with ``nltk.word_tokenize`` and the
    resulting tokens are handed to ``nltk.pos_tag``. The tagger's output is
    returned unchanged: one entry per word, combining the word with its
    NLTK part-of-speech code, for example ``('apple', 'NN')``.
    """
    # Special characters are left in place; the tokenizer sees the raw text.
    return nltk.pos_tag(nltk.word_tokenize(sentence))