Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions commit_msg.txt
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove this file

Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Fix KeyError in SumBasic summarizer

The _get_all_content_words_in_doc method was processing words in a
different order (stem -> filter -> normalize) compared to
_get_content_words_in_sentence (normalize -> filter -> stem). This
mismatch meant some words would appear in the per-sentence word
lists but not in the document frequency table, causing a KeyError
during summarization.

Aligned both methods to use the same processing order:
normalize -> filter stop words -> stem.

Also fixed _get_all_words_in_doc to return raw words instead of
pre-stemmed words, since stemming is now handled consistently
in _get_all_content_words_in_doc.

Fixes #176
9 changes: 5 additions & 4 deletions sumy/summarizers/sum_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def __call__(self, document, sentences_count):
return self._get_best_sentences(document.sentences, sentences_count, ratings)

def _get_all_words_in_doc(self, sentences):
return self._stem_words([w for s in sentences for w in s.words])
return [w for s in sentences for w in s.words]

def _get_content_words_in_sentence(self, sentence):
normalized_words = self._normalize_words(sentence.words)
Expand All @@ -54,9 +54,10 @@ def _compute_word_freq(list_of_words):

def _get_all_content_words_in_doc(self, sentences):
all_words = self._get_all_words_in_doc(sentences)
content_words = self._filter_out_stop_words(all_words)
normalized_content_words = self._normalize_words(content_words)
return normalized_content_words
normalized_words = self._normalize_words(all_words)
content_words = self._filter_out_stop_words(normalized_words)
stemmed_content_words = self._stem_words(content_words)
return stemmed_content_words

def _compute_tf(self, sentences):
"""
Expand Down