-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathJS.py
More file actions
120 lines (89 loc) · 3.3 KB
/
JS.py
File metadata and controls
120 lines (89 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding:utf-8
# Jensen-Shannon divergence utilities for summary evaluation.
# Relies on nlp_util for sentence_token, stopset, stemmer and normalize_word
# (brought in by the star import below) — TODO confirm against nlp_util.
from nlp_util import *
import math
import numpy as np
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance reused by every call in this module.
wordnet_lemmatizer = WordNetLemmatizer()
def get_all_content_words_lemmatized(sentences, N=1):
    """Lemmatize every token in *sentences* and return content n-grams.

    For N == 1 stopwords are removed and a list of normalized unigrams is
    returned; for N > 1 all tokens are kept and the normalized N-grams are
    returned as a list of tuples.
    """
    all_words = []
    for s in sentences:
        all_words.extend([wordnet_lemmatizer.lemmatize(r) for r in sentence_token(s)])
    if N == 1:
        # Dropping stopwords only makes sense for unigrams.
        content_words = [w for w in all_words if w not in stopset]
    else:
        # BUGFIX: the original never assigned content_words when N > 1,
        # raising NameError; mirror get_all_content_words_stemmed.
        content_words = all_words
    # Materialize to a list: a bare map() iterator would be silently
    # exhausted after one pass under Python 3.
    normalized_content_words = list(map(normalize_word, content_words))
    if N > 1:
        return list(ngrams(normalized_content_words, N))
    return normalized_content_words
def get_all_content_words_stemmed(sentences, N=1):
    """Stem every token in *sentences* and return content n-grams.

    For N == 1 stopwords are dropped and a list of normalized unigrams is
    returned; for N > 1 all tokens are kept and only N-grams containing at
    least one non-stopword survive.
    """
    def is_ngram_content(g):
        # An n-gram is "content" if any of its tokens is not a stopword.
        for a in g:
            if a not in stopset:
                return True
        return False

    all_words = []
    for s in sentences:
        all_words.extend([stemmer.stem(r) for r in sentence_token(s)])
    if N == 1:
        content_words = [w for w in all_words if w not in stopset]
    else:
        content_words = all_words
    # BUGFIX: return a list, not a map() iterator. Under Python 3 the
    # iterator is exhausted after one pass, so callers that test membership
    # repeatedly (e.g. compute_tf_doc) saw an empty sequence.
    normalize_content_words = list(map(normalize_word, content_words))
    if N > 1:
        return [gram for gram in ngrams(normalize_content_words, N) if is_ngram_content(gram)]
    return normalize_content_words
def get_content_words_in_sentence(sentence):
    """Tokenize *sentence* and return its tokens with stopwords removed."""
    return [token for token in sentence_token(sentence) if token not in stopset]
def compute_tf_doc(docs, N=1):
    """Score each content n-gram across *docs*.

    Each doc is a (title, sentences) pair. A term's raw score is the number
    of documents (title + body) containing it; the final score divides by
    the size of the surviving vocabulary. NOTE(review): despite the name,
    this is a document-frequency style weight, not a classic TF.

    Returns a dict mapping n-gram -> score.
    """
    # Pool every document's title + body to build the global vocabulary.
    sentences = []
    for title, doc in docs:
        sentences.append(title)
        sentences.extend(doc)
    content_words = list(set(get_all_content_words_stemmed(sentences, N)))
    # Per-document term sets: set() both materializes the helper's output
    # (guarding against a lazy iterator) and makes the membership tests
    # below O(1) instead of O(len(doc)).
    docs_words = []
    for title, doc in docs:
        s_tmp = [title]
        s_tmp.extend(doc)
        docs_words.append(set(get_all_content_words_stemmed(s_tmp, N)))
    word_freq = {}
    for w in content_words:
        w_score = sum(1 for d in docs_words if w in d)
        if w_score != 0:
            word_freq[w] = w_score
    # Normalize by vocabulary size (terms seen in at least one document).
    vocab_size = float(len(word_freq))
    content_word_tf = dict((w, f / vocab_size) for w, f in word_freq.items())
    return content_word_tf
def compute_word_freq(words):
    """Return a dict mapping each item of *words* to its occurrence count."""
    freq = {}
    for token in words:
        if token in freq:
            freq[token] += 1
        else:
            freq[token] = 1
    return freq
def compute_tf(sentences, N=1):
    """Term-frequency distribution of the content n-grams in *sentences*.

    Returns a dict mapping n-gram -> count / total content n-grams.
    Returns {} when there are no content words (instead of dividing by zero).
    """
    # list() guards against the helper returning a lazy iterator — len()
    # would raise TypeError on a map object under Python 3.
    content_words = list(get_all_content_words_stemmed(sentences, N))
    content_words_count = len(content_words)
    if content_words_count == 0:
        # BUGFIX: empty input previously raised ZeroDivisionError.
        return {}
    content_words_freq = compute_word_freq(content_words)
    total = float(content_words_count)
    return dict((w, f / total) for w, f in content_words_freq.items())
def compute_average_freq(l_freq_1, l_freq_2):
    """Element-wise mean of two frequency dicts (missing keys count as 0)."""
    average_freq = {}
    keys = set(l_freq_1.keys()) | set(l_freq_2.keys())
    for k in keys:
        s_1 = l_freq_1.get(k, 0)
        s_2 = l_freq_2.get(k, 0)
        # BUGFIX: use explicit true division; under Python 2 (which the
        # file's float() casts elsewhere suggest) `/ 2` truncated integer
        # frequencies to 0.
        average_freq[k] = (s_1 + s_2) / 2.0
    return average_freq
def kl_divergence(summary_freq, doc_freq):
    """Sum of f * log(f / doc_freq[w]) over the summary distribution.

    Terms absent from *doc_freq* are silently skipped, so this is KL
    divergence restricted to the shared support.
    """
    return sum(
        freq * math.log(freq / float(doc_freq[word]))
        for word, freq in summary_freq.items()
        if word in doc_freq
    )
def js_divergence(sys_summary, doc_freq):
    """Jensen-Shannon divergence between a system summary's unigram
    distribution and a precomputed document distribution *doc_freq*."""
    summary_freq = compute_tf(sys_summary)
    mixture = compute_average_freq(summary_freq, doc_freq)
    total = kl_divergence(summary_freq, mixture)
    total += kl_divergence(doc_freq, mixture)
    return total / 2.0