Skip to content

Commit b7d5fea

Browse files
committed
improved the keyword extraction #1334
1 parent d047097 commit b7d5fea

File tree

1 file changed

+16
-9
lines changed

1 file changed

+16
-9
lines changed

NLP/textsummary.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,9 @@
22
from sentence_transformers import SentenceTransformer
33
from sklearn.metrics.pairwise import cosine_similarity
44
import numpy as np
5-
from nltk.tokenize import sent_tokenize
5+
from nltk.tokenize import sent_tokenize, word_tokenize
66
import nltk
77
from nltk.corpus import stopwords
8-
from nltk.tokenize import word_tokenize
98
from collections import Counter
109

1110
MODEL_NAME = 'all-MiniLM-L6-v2'
@@ -37,21 +36,29 @@ def extract_keywords(text, model, top_n=10):
3736
# Tokenize the text
3837
words = word_tokenize(text.lower())
3938

40-
# Remove stopwords
39+
# Remove stopwords and non-alphanumeric tokens
4140
stop_words = set(stopwords.words('english'))
4241
filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
4342

43+
# Count word frequencies
44+
word_freq = Counter(filtered_words)
45+
46+
# Get unique words
47+
unique_words = list(set(filtered_words))
48+
4449
# Get word embeddings
45-
word_embeddings = model.encode(filtered_words)
50+
word_embeddings = model.encode(unique_words)
4651

47-
# Calculate importance scores (you can use different methods here)
52+
# Calculate importance scores
4853
importance_scores = np.mean(word_embeddings, axis=1)
4954

50-
# Get top N words based on importance scores
51-
top_indices = np.argsort(importance_scores)[-top_n:]
52-
keywords = [filtered_words[i] for i in top_indices[::-1]]
55+
# Combine frequency and importance
56+
combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]
57+
58+
# Sort by combined score and get top N
59+
top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]
5360

54-
return keywords
61+
return [word for word, _ in top_keywords]
5562

5663
def summarize_text(text, model, num_sentences=3):
5764
# Split the text into sentences

0 commit comments

Comments (0)