Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions sumy/summarizers/fast_kl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

try:
    import numpy as np
except ImportError:
    # The import above binds the name `np`, so the fallback must assign `np`
    # as well -- `_ensure_dependencies_installed` checks `np is None`.
    np = None
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable naming inconsistency: The variable is named 'numpy' but the import statement uses 'np'. Since the import uses 'as np', this line should be 'np = None' instead of 'numpy = None'. This inconsistency will cause issues when checking if numpy is installed.

Suggested change
numpy = None
np = None

Copilot uses AI. Check for mistakes.

from sumy.summarizers._summarizer import AbstractSummarizer


class KLSummarizer(AbstractSummarizer):
"""
Method that greedily adds sentences to a summary so long as it decreases the
KL Divergence.
Source: http://www.aclweb.org/anthology/N09-1041
"""
MISSING_WORD_VAL = 42.0 # placeholder value used for missing words in document
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Magic number used for MISSING_WORD_VAL. While the comment explains its purpose, the choice of 42.0 appears arbitrary and could potentially conflict with actual word frequencies. Consider using a more distinctive value like -1.0 or float('inf'), or adding a comment explaining why 42.0 was chosen specifically.

Suggested change
MISSING_WORD_VAL = 42.0 # placeholder value used for missing words in document
# Sentinel value used for words that are missing from the document's frequency
# representation. We use a numeric value that is assumed to lie outside the
# range of any valid word frequency observed in typical documents so that it
# can be distinguished from real counts wherever it is used.
MISSING_WORD_VAL = 42.0

Copilot uses AI. Check for mistakes.
stop_words = frozenset()

def __call__(self, document, sentences_count):
    """Summarize *document*, returning its *sentences_count* best sentences."""
    self._ensure_dependencies_installed()

    all_sentences = document.sentences
    sentence_ratings = self._compute_ratings(all_sentences)
    return self._get_best_sentences(all_sentences, sentences_count, sentence_ratings)

@staticmethod
def _ensure_dependencies_installed():
if np is None:
raise ValueError("Fast KL-Sum summarizer requires NumPy."
"Please, install it by command 'pip install numpy'.")
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space in error message. There should be a space after the period and before "Please". The message should read "Fast KL-Sum summarizer requires NumPy. Please, install it by command 'pip install numpy'."

Suggested change
"Please, install it by command 'pip install numpy'.")
" Please, install it by command 'pip install numpy'.")

Copilot uses AI. Check for mistakes.

@staticmethod
def _get_all_words_in_doc(sentences):
return [w for s in sentences for w in s.words]

def _get_content_words_in_sentence(self, sentence):
    """Return the sentence's words, normalized and with stop words removed."""
    return self._filter_out_stop_words(self._normalize_words(sentence.words))

def _normalize_words(self, words):
    """Apply the summarizer's word normalization to every word in *words*."""
    return list(map(self.normalize_word, words))

def _filter_out_stop_words(self, words):
return [w for w in words if w not in self.stop_words]

@staticmethod
def _old_compute_word_freq(list_of_words, d=None):
word_freq = {} if d is None else d
for w in list_of_words:
word_freq[w] = word_freq.get(w, 0) + 1
return word_freq
Comment on lines +52 to +57
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confusing method name. The name "_old_compute_word_freq" suggests deprecated code, but it's actively used in compute_tf. Consider renaming to something more descriptive like "_compute_word_freq_dict" to indicate it returns a dictionary, distinguishing it from the array-based _compute_word_freq method.

Copilot uses AI. Check for mistakes.

@staticmethod
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing docstring for the _compute_word_freq method. This method differs from the original implementation's signature and behavior (takes arrays instead of returning a dict), so it should be documented to explain the parameters and the in-place modification of word_freq_arr.

Suggested change
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
"""
Update a word-frequency array in place based on the given list of words.
Unlike :meth:`_old_compute_word_freq`, which builds and returns a dict,
this variant increments counts directly in an existing numeric array.
:param list_of_words: Iterable of word tokens whose frequencies should be counted.
:param word_freq_arr: Mutable numeric array (e.g., NumPy array) where each
position corresponds to a word index; this array is modified in place.
:param word_to_ind: Mapping from word token to integer index into
``word_freq_arr``.
:return: The same ``word_freq_arr`` instance, after in-place updates.
"""

Copilot uses AI. Check for mistakes.
for w in list_of_words:
word_freq_arr[word_to_ind[w]] += 1
return word_freq_arr

def _get_all_content_words_in_doc(self, sentences):
    """Return all document words, normalized and stop-word-filtered.

    NOTE(review): normalization runs before stop-word filtering here; if
    normalization can change whether a word matches a stop word, results may
    differ from filtering first -- confirm against the original KL summarizer.
    """
    every_word = self._get_all_words_in_doc(sentences)
    return self._filter_out_stop_words(self._normalize_words(every_word))
Comment on lines +65 to +69
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The order of operations differs from the original KL summarizer. In the original, stop words are filtered before normalization, but here normalization happens before filtering. This could cause differences in behavior if the normalization process affects whether a word matches a stop word. Consider matching the original order: filter stop words first, then normalize.

Copilot uses AI. Check for mistakes.

def compute_tf(self, sentences):
    """
    Computes the normalized term frequency as explained in http://www.tfidf.com/

    :type sentences: [sumy.models.dom.Sentence]
    """
    content_words = self._get_all_content_words_in_doc(sentences)
    total = len(content_words)
    frequencies = self._old_compute_word_freq(content_words)
    # An empty document yields an empty dict, so the division never runs on 0.
    return {word: count / total for word, count in frequencies.items()}

@staticmethod
def _joint_freq(wc1, wc2, total_len):
if total_len == 0:
return np.zeros_like(wc1)
joint_sum = wc1 + wc2
return joint_sum / total_len

@staticmethod
def _kl_divergence(summary_freq, doc_freq, doc_missing_word_mask):
summary_freq = np.where((summary_freq != 0.0) & doc_missing_word_mask, summary_freq, doc_freq)
return (doc_freq * np.log(doc_freq / summary_freq)).sum()
Comment on lines +83 to +93
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing docstrings for _joint_freq and _kl_divergence methods. These are key algorithmic components that differ from the original implementation by using numpy arrays instead of dictionaries. They should be documented to explain parameters, especially the doc_missing_word_mask parameter which is not self-explanatory.

Copilot uses AI. Check for mistakes.

@staticmethod
def _find_index_of_best_sentence(kls):
"""
the best sentence is the one with the smallest kl_divergence
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent naming in comment. The comment uses lowercase "kl_divergence" which appears to be referring to the concept of KL divergence rather than the method name. For consistency with other parts of the codebase (e.g., line 16 "KL Divergence"), consider using "KL divergence" or "KL-divergence".

Suggested change
the best sentence is the one with the smallest kl_divergence
the best sentence is the one with the smallest KL divergence

Copilot uses AI. Check for mistakes.
"""
return kls.index(min(kls))

def _compute_ratings(self, sentences):
    """Rate sentences by greedily minimizing KL divergence against the document.

    Each pass picks the sentence whose addition to the running summary gives
    the smallest KL divergence and removes it from the candidate pool.

    :param sentences: sequence of document sentences.
    :return: dict mapping each sentence to a rating; the sentence picked
        first (most important) gets 0, the next -1, and so on.
    """
    word_to_freq = self.compute_tf(sentences)

    # NOTE(review): this union mixes raw document words (not normalized,
    # stop words included) with the normalized content words from the TF
    # dict, so both forms of a word can coexist -- confirm this is intended.
    vocabulary = set(self._get_all_words_in_doc(sentences)).union(word_to_freq.keys())
    word_to_ind = {word: index for index, word in enumerate(vocabulary)}

    # Words absent from the TF dict keep the sentinel value; the mask marks
    # the positions that hold a real document frequency.
    word_freq = np.repeat(self.MISSING_WORD_VAL, len(vocabulary))
    for k, v in word_to_freq.items():
        word_freq[word_to_ind[k]] = v
    missing_word_mask = word_freq != self.MISSING_WORD_VAL

    ratings = {}

    # Keep track of number of words in summary and word frequency
    summary_word_list_len = 0
    summary_word_freq = np.repeat(0.0, len(vocabulary))

    # make it a list so that it can be indexed
    sentences_list = list(sentences)

    # get all content words once for efficiency
    sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]

    # calculate all sentence lengths and word frequencies once for efficiency
    i_to_sent_word_freq = {}
    i_to_sent_len = {}
    for i, s in enumerate(sentences_as_words):
        sent_word_freq = np.zeros_like(word_freq)
        i_to_sent_word_freq[i] = self._compute_word_freq(s, sent_word_freq, word_to_ind)
        i_to_sent_len[i] = len(s)

    indices = list(range(len(sentences_as_words)))
    # Removes one sentence per iteration by adding it to the summary
    while len(indices) > 0:
        # will store all the KL divergence values for this pass
        kls = []

        for i in indices:
            # joint frequency of the candidate sentence combined with the summary
            joint_freq = self._joint_freq(i_to_sent_word_freq[i], summary_word_freq,
                                          i_to_sent_len[i] + summary_word_list_len)

            # adds the calculated KL divergence to the list, index = sentence used
            kls.append(self._kl_divergence(joint_freq, word_freq, missing_word_mask))

        # pick the best candidate and move it into the summary
        index_to_remove = self._find_index_of_best_sentence(kls)
        best_sentence = sentences_list[indices[index_to_remove]]
        del indices[index_to_remove]
        best_sentence_word_list = self._get_all_words_in_doc([best_sentence])
        # update summary length and word frequencies
        summary_word_list_len += len(best_sentence_word_list)
        summary_word_freq = self._compute_word_freq(best_sentence_word_list, summary_word_freq, word_to_ind)

        # value is the iteration in which it was removed multiplied by -1 so that
        # the first sentences removed (the most important) have highest values
        ratings[best_sentence] = -1 * len(ratings)
    return ratings
207 changes: 207 additions & 0 deletions tests/test_summarizers/test_fast_kl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import pytest
import numpy as np

import sumy.summarizers.fast_kl as fast_kl_module
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Module 'sumy.summarizers.fast_kl' is imported with both 'import' and 'import from'.

Copilot uses AI. Check for mistakes.
from sumy.models.dom._sentence import Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.fast_kl import KLSummarizer
from ..utils import build_document


@pytest.fixture
def empty_stop_words():
    """A stop-word collection with no entries."""
    return list()


@pytest.fixture
def stop_words():
    """A small sample of English stop words."""
    return list(("the", "and", "i"))


@pytest.fixture
def summarizer(stop_words):
    """A KLSummarizer configured with the sample stop words."""
    instance = KLSummarizer()
    instance.stop_words = stop_words
    return instance


def test_numpy_not_installed():
    """The summarizer must raise ValueError when NumPy is unavailable."""
    summarizer = KLSummarizer()

    saved_numpy = fast_kl_module.np
    fast_kl_module.np = None
    try:
        with pytest.raises(ValueError):
            summarizer(build_document(), 10)
    finally:
        # Restore the module global even if the assertion above fails,
        # otherwise later tests in this session would see np as None.
        fast_kl_module.np = saved_numpy


def test_empty_document(summarizer):
    """An empty document yields an empty summary."""
    summary = summarizer(build_document(), 10)
    assert len(summary) == 0


def test_single_sentence(summarizer):
    """A one-sentence document is summarized by that single sentence."""
    sentence = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([sentence])

    summary = summarizer(document, 10)

    assert len(summary) == 1


def test_compute_word_freq(summarizer):
    """_compute_word_freq counts occurrences into the provided array."""
    unique_words = ["one", "two", "three", "four"]
    counts = np.zeros(len(unique_words))
    positions = {word: index for index, word in enumerate(unique_words)}
    result = summarizer._compute_word_freq(unique_words, counts, positions)

    assert np.all(result == 1)

    repeated_words = ["one", "one", "two", "two"]
    distinct = set(repeated_words)
    counts = np.zeros(len(distinct))
    positions = {word: index for index, word in enumerate(distinct)}
    result = summarizer._compute_word_freq(repeated_words, counts, positions)

    assert np.all(result == 2)


def test_joint_freq(summarizer):
    """_joint_freq normalizes the combined word counts of two word lists."""
    w1 = ["one", "two", "three", "four"]
    w2 = ["one", "two", "three", "four"]

    word_freq1 = np.zeros(len(w1))
    word_freq2 = np.zeros_like(word_freq1)
    word_to_ind = {word: index for index, word in enumerate(w1)}
    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
    # Fixed: previously counted w1 again; the second frequency vector must
    # be built from w2 to test the intended scenario.
    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)

    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))

    assert np.all(freq == 1.0 / 4)

    w1 = ["one", "two", "three", "four"]
    w2 = ["one", "one", "three", "five"]

    vocabulary = set(w1).union(set(w2))
    word_freq1 = np.zeros(len(vocabulary))
    word_freq2 = np.zeros_like(word_freq1)
    word_to_ind = {word: index for index, word in enumerate(vocabulary)}
    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)

    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))

    assert freq[word_to_ind["one"]] == 3.0 / 8
    assert freq[word_to_ind["two"]] == 1.0 / 8
    assert freq[word_to_ind["three"]] == 1.0 / 4
    assert freq[word_to_ind["four"]] == 1.0 / 8
    assert freq[word_to_ind["five"]] == 1.0 / 8


def test_kl_divergence(summarizer):
    """_kl_divergence matches scipy.stats.entropy reference values."""
    EPS = 0.00001

    words = ["one", "two", "three"]
    word_to_ind = {word: index for index, word in enumerate(words)}
    missing_word_mask = np.repeat(True, len(words))

    summary = np.zeros(len(words))
    document = np.zeros_like(summary)
    for word, value in [("one", 0.35), ("two", 0.5), ("three", 0.15)]:
        summary[word_to_ind[word]] = value
    document[:] = 1.0 / 3.0

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.11475080798005841
    assert abs(summarizer._kl_divergence(summary, document, missing_word_mask) - kl_correct) < EPS

    for word, value in [("one", 0.1), ("two", 0.2), ("three", 0.7)]:
        summary[word_to_ind[word]] = value
    for word, value in [("one", 0.2), ("two", 0.4), ("three", 0.4)]:
        document[word_to_ind[word]] = value

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.1920419931617981
    assert abs(summarizer._kl_divergence(summary, document, missing_word_mask) - kl_correct) < EPS


def test_missing_word_in_document_during_kl_divergence_computation(summarizer):
    """
    Missing word should not affect the result.
    See https://github.com/miso-belica/sumy/issues/41
    """
    EPS = 0.00001

    words = ["one", "two", "three", "four"]
    word_to_ind = {word: index for index, word in enumerate(words)}

    summary_frequencies = np.zeros(len(words))
    for word, value in [("one", 0.35), ("two", 0.5), ("three", 0.15), ("four", 0.9)]:
        summary_frequencies[word_to_ind[word]] = value

    # "four" keeps the sentinel value, i.e. it is missing from the document.
    document_frequencies = np.repeat(summarizer.MISSING_WORD_VAL, len(words))
    present_words = ["one", "two", "three"]
    for word in present_words:
        document_frequencies[word_to_ind[word]] = 1.0 / 3.0

    missing_word_mask = np.repeat(False, len(words))
    for word in present_words:
        missing_word_mask[word_to_ind[word]] = True

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.11475080798005841
    assert abs(summarizer._kl_divergence(summary_frequencies, document_frequencies,
                                         missing_word_mask) - kl_correct) < EPS


def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    sentence = Sentence("There are five words, jop.", Tokenizer("english"))

    frequencies = summarizer.compute_tf([sentence])

    expected = dict.fromkeys(["there", "are", "five", "words", "jop"], 0.2)
    assert frequencies == expected


def test_the_sentences_should_be_in_different_order(summarizer):
    """https://github.com/miso-belica/sumy/issues/146"""
    paragraphs = [
        ["This is 1st sentence.", "This is 2nd sentence."],
        ["This is 3rd sentence.", "This is 4th sentence."],
        ["This is 5th sentence."],
    ]
    forward_document = build_document(*paragraphs)
    backward_document = build_document(*(reversed(p) for p in reversed(paragraphs)))

    forward_summary = summarizer(forward_document, "100%")
    backward_summary = summarizer(backward_document, "100%")

    assert tuple(reversed(forward_summary)) == backward_summary