Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions sumy/summarizers/fast_kl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

try:
    import numpy as np
except ImportError:
    # The import above binds the name `np`, so the fallback must assign `np`
    # as well -- `_ensure_dependencies_installed` checks `np is None`.
    np = None
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable naming inconsistency: The variable is named 'numpy' but the import statement uses 'np'. Since the import uses 'as np', this line should be 'np = None' instead of 'numpy = None'. This inconsistency will cause issues when checking if numpy is installed.

Suggested change
numpy = None
np = None

Copilot uses AI. Check for mistakes.

from sumy.summarizers._summarizer import AbstractSummarizer


class KLSummarizer(AbstractSummarizer):
"""
Method that greedily adds sentences to a summary so long as it decreases the
KL Divergence.
Source: http://www.aclweb.org/anthology/N09-1041
"""
MISSING_WORD_VAL = 42.0 # placeholder value used for missing words in document
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Magic number used for MISSING_WORD_VAL. While the comment explains its purpose, the choice of 42.0 appears arbitrary and could potentially conflict with actual word frequencies. Consider using a more distinctive value like -1.0 or float('inf'), or adding a comment explaining why 42.0 was chosen specifically.

Suggested change
MISSING_WORD_VAL = 42.0 # placeholder value used for missing words in document
# Sentinel value used for words that are missing from the document's frequency
# representation. We use a numeric value that is assumed to lie outside the
# range of any valid word frequency observed in typical documents so that it
# can be distinguished from real counts wherever it is used.
MISSING_WORD_VAL = 42.0

Copilot uses AI. Check for mistakes.
stop_words = frozenset()

def __call__(self, document, sentences_count):
    """Summarize *document*, returning its *sentences_count* best sentences."""
    self._ensure_dependencies_installed()

    all_sentences = document.sentences
    sentence_ratings = self._compute_ratings(all_sentences)
    return self._get_best_sentences(all_sentences, sentences_count, sentence_ratings)

@staticmethod
def _ensure_dependencies_installed():
if np is None:
raise ValueError("Fast KL-Sum summarizer requires NumPy."
"Please, install it by command 'pip install numpy'.")
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing space in error message. There should be a space after the period and before "Please". The message should read "Fast KL-Sum summarizer requires NumPy. Please, install it by command 'pip install numpy'."

Suggested change
"Please, install it by command 'pip install numpy'.")
" Please, install it by command 'pip install numpy'.")

Copilot uses AI. Check for mistakes.

@staticmethod
def _get_all_words_in_doc(sentences):
return [w for s in sentences for w in s.words]

def _get_content_words_in_sentence(self, sentence):
    """Return the sentence's words, normalized and with stop words removed."""
    return self._filter_out_stop_words(self._normalize_words(sentence.words))

def _normalize_words(self, words):
    """Apply the summarizer's word normalization to every word in *words*."""
    return list(map(self.normalize_word, words))

def _filter_out_stop_words(self, words):
return [w for w in words if w not in self.stop_words]

@staticmethod
def _old_compute_word_freq(list_of_words, d=None):
word_freq = {} if d is None else d
for w in list_of_words:
word_freq[w] = word_freq.get(w, 0) + 1
return word_freq
Comment on lines +52 to +57
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confusing method name. The name "_old_compute_word_freq" suggests deprecated code, but it's actively used in compute_tf. Consider renaming to something more descriptive like "_compute_word_freq_dict" to indicate it returns a dictionary, distinguishing it from the array-based _compute_word_freq method.

Copilot uses AI. Check for mistakes.

@staticmethod
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing docstring for the _compute_word_freq method. This method differs from the original implementation's signature and behavior (takes arrays instead of returning a dict), so it should be documented to explain the parameters and the in-place modification of word_freq_arr.

Suggested change
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
def _compute_word_freq(list_of_words, word_freq_arr, word_to_ind):
"""
Update a word-frequency array in place based on the given list of words.
Unlike :meth:`_old_compute_word_freq`, which builds and returns a dict,
this variant increments counts directly in an existing numeric array.
:param list_of_words: Iterable of word tokens whose frequencies should be counted.
:param word_freq_arr: Mutable numeric array (e.g., NumPy array) where each
position corresponds to a word index; this array is modified in place.
:param word_to_ind: Mapping from word token to integer index into
``word_freq_arr``.
:return: The same ``word_freq_arr`` instance, after in-place updates.
"""

Copilot uses AI. Check for mistakes.
for w in list_of_words:
word_freq_arr[word_to_ind[w]] += 1
return word_freq_arr

def _get_all_content_words_in_doc(self, sentences):
    """Return all document words, normalized and stop-word-filtered.

    NOTE(review): normalization runs before stop-word filtering here; if
    normalization can change whether a word matches a stop word, results may
    differ from filtering first -- confirm against the original KL summarizer.
    """
    every_word = self._get_all_words_in_doc(sentences)
    return self._filter_out_stop_words(self._normalize_words(every_word))
Comment on lines +65 to +69
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The order of operations differs from the original KL summarizer. In the original, stop words are filtered before normalization, but here normalization happens before filtering. This could cause differences in behavior if the normalization process affects whether a word matches a stop word. Consider matching the original order: filter stop words first, then normalize.

Copilot uses AI. Check for mistakes.

def compute_tf(self, sentences):
    """
    Computes the normalized term frequency as explained in http://www.tfidf.com/

    :type sentences: [sumy.models.dom.Sentence]
    """
    content_words = self._get_all_content_words_in_doc(sentences)
    total = len(content_words)
    frequencies = self._old_compute_word_freq(content_words)
    # An empty document yields an empty dict, so the division never runs on 0.
    return {word: count / total for word, count in frequencies.items()}

@staticmethod
def _joint_freq(wc1, wc2, total_len):
if total_len == 0:
return np.zeros_like(wc1)
joint_sum = wc1 + wc2
return joint_sum / total_len

@staticmethod
def _kl_divergence(summary_freq, doc_freq, doc_missing_word_mask):
summary_freq = np.where((summary_freq != 0.0) & doc_missing_word_mask, summary_freq, doc_freq)
return (doc_freq * np.log(doc_freq / summary_freq)).sum()
Comment on lines +83 to +93
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing docstrings for _joint_freq and _kl_divergence methods. These are key algorithmic components that differ from the original implementation by using numpy arrays instead of dictionaries. They should be documented to explain parameters, especially the doc_missing_word_mask parameter which is not self-explanatory.

Copilot uses AI. Check for mistakes.

@staticmethod
def _find_index_of_best_sentence(kls):
"""
the best sentence is the one with the smallest kl_divergence
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inconsistent naming in comment. The comment uses lowercase "kl_divergence" which appears to be referring to the concept of KL divergence rather than the method name. For consistency with other parts of the codebase (e.g., line 16 "KL Divergence"), consider using "KL divergence" or "KL-divergence".

Suggested change
the best sentence is the one with the smallest kl_divergence
the best sentence is the one with the smallest KL divergence

Copilot uses AI. Check for mistakes.
"""
return kls.index(min(kls))

def _compute_ratings(self, sentences):
    """Rate sentences by greedily minimizing KL divergence against the document.

    Each pass picks the sentence whose addition to the running summary gives
    the smallest KL divergence and removes it from the candidate pool.

    :param sentences: sequence of document sentences.
    :return: dict mapping each sentence to a rating; the sentence picked
        first (most important) gets 0, the next -1, and so on.
    """
    word_to_freq = self.compute_tf(sentences)

    # NOTE(review): this union mixes raw document words (not normalized,
    # stop words included) with the normalized content words from the TF
    # dict, so both forms of a word can coexist -- confirm this is intended.
    vocabulary = set(self._get_all_words_in_doc(sentences)).union(word_to_freq.keys())
    word_to_ind = {word: index for index, word in enumerate(vocabulary)}

    # Words absent from the TF dict keep the sentinel value; the mask marks
    # the positions that hold a real document frequency.
    word_freq = np.repeat(self.MISSING_WORD_VAL, len(vocabulary))
    for k, v in word_to_freq.items():
        word_freq[word_to_ind[k]] = v
    missing_word_mask = word_freq != self.MISSING_WORD_VAL

    ratings = {}

    # Keep track of number of words in summary and word frequency
    summary_word_list_len = 0
    summary_word_freq = np.repeat(0.0, len(vocabulary))

    # make it a list so that it can be indexed
    sentences_list = list(sentences)

    # get all content words once for efficiency
    sentences_as_words = [self._get_content_words_in_sentence(s) for s in sentences]

    # calculate all sentence lengths and word frequencies once for efficiency
    i_to_sent_word_freq = {}
    i_to_sent_len = {}
    for i, s in enumerate(sentences_as_words):
        sent_word_freq = np.zeros_like(word_freq)
        i_to_sent_word_freq[i] = self._compute_word_freq(s, sent_word_freq, word_to_ind)
        i_to_sent_len[i] = len(s)

    indices = list(range(len(sentences_as_words)))
    # Removes one sentence per iteration by adding it to the summary
    while len(indices) > 0:
        # will store all the KL divergence values for this pass
        kls = []

        for i in indices:
            # joint frequency of the candidate sentence combined with the summary
            joint_freq = self._joint_freq(i_to_sent_word_freq[i], summary_word_freq,
                                          i_to_sent_len[i] + summary_word_list_len)

            # adds the calculated KL divergence to the list, index = sentence used
            kls.append(self._kl_divergence(joint_freq, word_freq, missing_word_mask))

        # pick the best candidate and move it into the summary
        index_to_remove = self._find_index_of_best_sentence(kls)
        best_sentence = sentences_list[indices[index_to_remove]]
        del indices[index_to_remove]
        best_sentence_word_list = self._get_all_words_in_doc([best_sentence])
        # update summary length and word frequencies
        summary_word_list_len += len(best_sentence_word_list)
        summary_word_freq = self._compute_word_freq(best_sentence_word_list, summary_word_freq, word_to_ind)

        # value is the iteration in which it was removed multiplied by -1 so that
        # the first sentences removed (the most important) have highest values
        ratings[best_sentence] = -1 * len(ratings)
    return ratings
207 changes: 207 additions & 0 deletions tests/test_summarizers/test_fast_kl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

import pytest
import numpy as np

import sumy.summarizers.fast_kl as fast_kl_module
Copy link

Copilot AI Dec 28, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Module 'sumy.summarizers.fast_kl' is imported with both 'import' and 'import from'.

Copilot uses AI. Check for mistakes.
from sumy.models.dom._sentence import Sentence
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.fast_kl import KLSummarizer
from ..utils import build_document


@pytest.fixture
def empty_stop_words():
    """A stop-word collection with no entries."""
    return list()


@pytest.fixture
def stop_words():
    """A small sample of English stop words."""
    return list(("the", "and", "i"))


@pytest.fixture
def summarizer(stop_words):
    """A KLSummarizer configured with the sample stop words."""
    instance = KLSummarizer()
    instance.stop_words = stop_words
    return instance


def test_numpy_not_installed():
    """The summarizer must raise ValueError when NumPy is unavailable."""
    summarizer = KLSummarizer()

    saved_numpy = fast_kl_module.np
    fast_kl_module.np = None
    try:
        with pytest.raises(ValueError):
            summarizer(build_document(), 10)
    finally:
        # Restore the module global even if the assertion above fails,
        # otherwise later tests in this session would see np as None.
        fast_kl_module.np = saved_numpy


def test_empty_document(summarizer):
    """An empty document yields an empty summary."""
    summary = summarizer(build_document(), 10)
    assert len(summary) == 0


def test_single_sentence(summarizer):
    """A one-sentence document is summarized by that single sentence."""
    sentence = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([sentence])

    summary = summarizer(document, 10)

    assert len(summary) == 1


def test_compute_word_freq(summarizer):
    """_compute_word_freq counts occurrences into the provided array."""
    unique_words = ["one", "two", "three", "four"]
    counts = np.zeros(len(unique_words))
    positions = {word: index for index, word in enumerate(unique_words)}
    result = summarizer._compute_word_freq(unique_words, counts, positions)

    assert np.all(result == 1)

    repeated_words = ["one", "one", "two", "two"]
    distinct = set(repeated_words)
    counts = np.zeros(len(distinct))
    positions = {word: index for index, word in enumerate(distinct)}
    result = summarizer._compute_word_freq(repeated_words, counts, positions)

    assert np.all(result == 2)


def test_joint_freq(summarizer):
    """_joint_freq normalizes the combined word counts of two word lists."""
    w1 = ["one", "two", "three", "four"]
    w2 = ["one", "two", "three", "four"]

    word_freq1 = np.zeros(len(w1))
    word_freq2 = np.zeros_like(word_freq1)
    word_to_ind = {word: index for index, word in enumerate(w1)}
    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
    # Fixed: previously counted w1 again; the second frequency vector must
    # be built from w2 to test the intended scenario.
    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)

    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))

    assert np.all(freq == 1.0 / 4)

    w1 = ["one", "two", "three", "four"]
    w2 = ["one", "one", "three", "five"]

    vocabulary = set(w1).union(set(w2))
    word_freq1 = np.zeros(len(vocabulary))
    word_freq2 = np.zeros_like(word_freq1)
    word_to_ind = {word: index for index, word in enumerate(vocabulary)}
    freq1 = summarizer._compute_word_freq(w1, word_freq1, word_to_ind)
    freq2 = summarizer._compute_word_freq(w2, word_freq2, word_to_ind)

    freq = summarizer._joint_freq(freq1, freq2, len(w1) + len(w2))

    assert freq[word_to_ind["one"]] == 3.0 / 8
    assert freq[word_to_ind["two"]] == 1.0 / 8
    assert freq[word_to_ind["three"]] == 1.0 / 4
    assert freq[word_to_ind["four"]] == 1.0 / 8
    assert freq[word_to_ind["five"]] == 1.0 / 8


def test_kl_divergence(summarizer):
    """_kl_divergence matches scipy.stats.entropy reference values."""
    EPS = 0.00001

    words = ["one", "two", "three"]
    word_to_ind = {word: index for index, word in enumerate(words)}
    missing_word_mask = np.repeat(True, len(words))

    summary = np.zeros(len(words))
    document = np.zeros_like(summary)
    for word, value in [("one", 0.35), ("two", 0.5), ("three", 0.15)]:
        summary[word_to_ind[word]] = value
    document[:] = 1.0 / 3.0

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.11475080798005841
    assert abs(summarizer._kl_divergence(summary, document, missing_word_mask) - kl_correct) < EPS

    for word, value in [("one", 0.1), ("two", 0.2), ("three", 0.7)]:
        summary[word_to_ind[word]] = value
    for word, value in [("one", 0.2), ("two", 0.4), ("three", 0.4)]:
        document[word_to_ind[word]] = value

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.1920419931617981
    assert abs(summarizer._kl_divergence(summary, document, missing_word_mask) - kl_correct) < EPS


def test_missing_word_in_document_during_kl_divergence_computation(summarizer):
    """
    Missing word should not affect the result.
    See https://github.com/miso-belica/sumy/issues/41
    """
    EPS = 0.00001

    words = ["one", "two", "three", "four"]
    word_to_ind = {word: index for index, word in enumerate(words)}

    summary_frequencies = np.zeros(len(words))
    for word, value in [("one", 0.35), ("two", 0.5), ("three", 0.15), ("four", 0.9)]:
        summary_frequencies[word_to_ind[word]] = value

    # "four" keeps the sentinel value, i.e. it is missing from the document.
    document_frequencies = np.repeat(summarizer.MISSING_WORD_VAL, len(words))
    present_words = ["one", "two", "three"]
    for word in present_words:
        document_frequencies[word_to_ind[word]] = 1.0 / 3.0

    missing_word_mask = np.repeat(False, len(words))
    for word in present_words:
        missing_word_mask[word_to_ind[word]] = True

    # This value comes from scipy.stats.entropy(w2_, w1_)
    # Note: the order of params is different
    kl_correct = 0.11475080798005841
    assert abs(summarizer._kl_divergence(summary_frequencies, document_frequencies,
                                         missing_word_mask) - kl_correct) < EPS


def test_tf_idf_metric_should_be_real_number():
    """https://github.com/miso-belica/sumy/issues/41"""
    summarizer = KLSummarizer()
    sentence = Sentence("There are five words, jop.", Tokenizer("english"))

    frequencies = summarizer.compute_tf([sentence])

    expected = dict.fromkeys(["there", "are", "five", "words", "jop"], 0.2)
    assert frequencies == expected


def test_the_sentences_should_be_in_different_order(summarizer):
    """https://github.com/miso-belica/sumy/issues/146"""
    paragraphs = [
        ["This is 1st sentence.", "This is 2nd sentence."],
        ["This is 3rd sentence.", "This is 4th sentence."],
        ["This is 5th sentence."],
    ]
    forward_document = build_document(*paragraphs)
    backward_document = build_document(*(reversed(p) for p in reversed(paragraphs)))

    forward_summary = summarizer(forward_document, "100%")
    backward_summary = summarizer(backward_document, "100%")

    assert tuple(reversed(forward_summary)) == backward_summary