-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathJS.py
More file actions
120 lines (89 loc) · 3.3 KB
/
JS.py
File metadata and controls
120 lines (89 loc) · 3.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding:utf-8
# Jensen-Shannon divergence utilities for summary evaluation.
# Relies on nlp_util for sentence_token, stopset, stemmer and normalize_word
# (brought in by the star import below) — TODO confirm against nlp_util.
from nlp_util import *
import math
import numpy as np
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance reused by every call in this module.
wordnet_lemmatizer = WordNetLemmatizer()
def get_all_content_words_lemmatized(sentences, N=1):
    """Lemmatize every token in *sentences* and return content n-grams.

    For N == 1 stopwords are removed and a list of normalized unigrams is
    returned; for N > 1 all tokens are kept and the normalized N-grams are
    returned as a list of tuples.
    """
    all_words = []
    for s in sentences:
        all_words.extend([wordnet_lemmatizer.lemmatize(r) for r in sentence_token(s)])
    if N == 1:
        # Dropping stopwords only makes sense for unigrams.
        content_words = [w for w in all_words if w not in stopset]
    else:
        # BUGFIX: the original never assigned content_words when N > 1,
        # raising NameError; mirror get_all_content_words_stemmed.
        content_words = all_words
    # Materialize to a list: a bare map() iterator would be silently
    # exhausted after one pass under Python 3.
    normalized_content_words = list(map(normalize_word, content_words))
    if N > 1:
        return list(ngrams(normalized_content_words, N))
    return normalized_content_words
def get_all_content_words_stemmed(sentences, N=1):
    """Stem every token in *sentences* and return content n-grams.

    For N == 1 stopwords are dropped and a list of normalized unigrams is
    returned; for N > 1 all tokens are kept and only N-grams containing at
    least one non-stopword survive.
    """
    def is_ngram_content(g):
        # An n-gram is "content" if any of its tokens is not a stopword.
        for a in g:
            if a not in stopset:
                return True
        return False

    all_words = []
    for s in sentences:
        all_words.extend([stemmer.stem(r) for r in sentence_token(s)])
    if N == 1:
        content_words = [w for w in all_words if w not in stopset]
    else:
        content_words = all_words
    # BUGFIX: return a list, not a map() iterator. Under Python 3 the
    # iterator is exhausted after one pass, so callers that test membership
    # repeatedly (e.g. compute_tf_doc) saw an empty sequence.
    normalize_content_words = list(map(normalize_word, content_words))
    if N > 1:
        return [gram for gram in ngrams(normalize_content_words, N) if is_ngram_content(gram)]
    return normalize_content_words
def get_content_words_in_sentence(sentence):
    """Tokenize *sentence* and return its tokens with stopwords removed."""
    return [token for token in sentence_token(sentence) if token not in stopset]
def compute_tf_doc(docs, N=1):
    """Score each content n-gram across *docs*.

    Each doc is a (title, sentences) pair. A term's raw score is the number
    of documents (title + body) containing it; the final score divides by
    the size of the surviving vocabulary. NOTE(review): despite the name,
    this is a document-frequency style weight, not a classic TF.

    Returns a dict mapping n-gram -> score.
    """
    # Pool every document's title + body to build the global vocabulary.
    sentences = []
    for title, doc in docs:
        sentences.append(title)
        sentences.extend(doc)
    content_words = list(set(get_all_content_words_stemmed(sentences, N)))
    # Per-document term sets: set() both materializes the helper's output
    # (guarding against a lazy iterator) and makes the membership tests
    # below O(1) instead of O(len(doc)).
    docs_words = []
    for title, doc in docs:
        s_tmp = [title]
        s_tmp.extend(doc)
        docs_words.append(set(get_all_content_words_stemmed(s_tmp, N)))
    word_freq = {}
    for w in content_words:
        w_score = sum(1 for d in docs_words if w in d)
        if w_score != 0:
            word_freq[w] = w_score
    # Normalize by vocabulary size (terms seen in at least one document).
    vocab_size = float(len(word_freq))
    content_word_tf = dict((w, f / vocab_size) for w, f in word_freq.items())
    return content_word_tf
def compute_word_freq(words):
    """Return a dict mapping each item of *words* to its occurrence count."""
    freq = {}
    for token in words:
        if token in freq:
            freq[token] += 1
        else:
            freq[token] = 1
    return freq
def compute_tf(sentences, N=1):
    """Term-frequency distribution of the content n-grams in *sentences*.

    Returns a dict mapping n-gram -> count / total content n-grams.
    Returns {} when there are no content words (instead of dividing by zero).
    """
    # list() guards against the helper returning a lazy iterator — len()
    # would raise TypeError on a map object under Python 3.
    content_words = list(get_all_content_words_stemmed(sentences, N))
    content_words_count = len(content_words)
    if content_words_count == 0:
        # BUGFIX: empty input previously raised ZeroDivisionError.
        return {}
    content_words_freq = compute_word_freq(content_words)
    total = float(content_words_count)
    return dict((w, f / total) for w, f in content_words_freq.items())
def compute_average_freq(l_freq_1, l_freq_2):
    """Element-wise mean of two frequency dicts (missing keys count as 0)."""
    average_freq = {}
    keys = set(l_freq_1.keys()) | set(l_freq_2.keys())
    for k in keys:
        s_1 = l_freq_1.get(k, 0)
        s_2 = l_freq_2.get(k, 0)
        # BUGFIX: use explicit true division; under Python 2 (which the
        # file's float() casts elsewhere suggest) `/ 2` truncated integer
        # frequencies to 0.
        average_freq[k] = (s_1 + s_2) / 2.0
    return average_freq
def kl_divergence(summary_freq, doc_freq):
    """Sum of f * log(f / doc_freq[w]) over the summary distribution.

    Terms absent from *doc_freq* are silently skipped, so this is KL
    divergence restricted to the shared support.
    """
    return sum(
        freq * math.log(freq / float(doc_freq[word]))
        for word, freq in summary_freq.items()
        if word in doc_freq
    )
def js_divergence(sys_summary, doc_freq):
    """Jensen-Shannon divergence between a system summary's unigram
    distribution and a precomputed document distribution *doc_freq*."""
    summary_freq = compute_tf(sys_summary)
    mixture = compute_average_freq(summary_freq, doc_freq)
    total = kl_divergence(summary_freq, mixture)
    total += kl_divergence(doc_freq, mixture)
    return total / 2.0