diff --git a/commit_msg.txt b/commit_msg.txt new file mode 100644 index 00000000..5d4bdbcf --- /dev/null +++ b/commit_msg.txt @@ -0,0 +1,17 @@ +Fix KeyError in SumBasic summarizer + +The _get_all_content_words_in_doc method was processing words in a +different order (stem -> filter -> normalize) compared to +_get_content_words_in_sentence (normalize -> filter -> stem). This +mismatch meant some words would appear in the per-sentence word +lists but not in the document frequency table, causing a KeyError +during summarization. + +Aligned both methods to use the same processing order: +normalize -> filter stop words -> stem. + +Also fixed _get_all_words_in_doc to return raw words instead of +pre-stemmed words, since stemming is now handled consistently +in _get_all_content_words_in_doc. + +Fixes #176 diff --git a/sumy/summarizers/sum_basic.py b/sumy/summarizers/sum_basic.py index c54cf678..aa82b31c 100644 --- a/sumy/summarizers/sum_basic.py +++ b/sumy/summarizers/sum_basic.py @@ -28,7 +28,7 @@ def __call__(self, document, sentences_count): return self._get_best_sentences(document.sentences, sentences_count, ratings) def _get_all_words_in_doc(self, sentences): - return self._stem_words([w for s in sentences for w in s.words]) + return [w for s in sentences for w in s.words] def _get_content_words_in_sentence(self, sentence): normalized_words = self._normalize_words(sentence.words) @@ -54,9 +54,10 @@ def _compute_word_freq(list_of_words): def _get_all_content_words_in_doc(self, sentences): all_words = self._get_all_words_in_doc(sentences) - content_words = self._filter_out_stop_words(all_words) - normalized_content_words = self._normalize_words(content_words) - return normalized_content_words + normalized_words = self._normalize_words(all_words) + content_words = self._filter_out_stop_words(normalized_words) + stemmed_content_words = self._stem_words(content_words) + return stemmed_content_words def _compute_tf(self, sentences): """