|
2 | 2 | from sentence_transformers import SentenceTransformer
|
3 | 3 | from sklearn.metrics.pairwise import cosine_similarity
|
4 | 4 | import numpy as np
|
5 |
| -from nltk.tokenize import sent_tokenize |
| 5 | +from nltk.tokenize import sent_tokenize, word_tokenize |
6 | 6 | import nltk
|
7 | 7 | from nltk.corpus import stopwords
|
8 |
| -from nltk.tokenize import word_tokenize |
9 | 8 | from collections import Counter
|
10 | 9 |
|
11 | 10 | MODEL_NAME = 'all-MiniLM-L6-v2'
|
@@ -37,21 +36,29 @@ def extract_keywords(text, model, top_n=10):
|
37 | 36 | # Tokenize the text
|
38 | 37 | words = word_tokenize(text.lower())
|
39 | 38 |
|
40 |
| - # Remove stopwords |
| 39 | + # Remove stopwords and non-alphanumeric tokens |
41 | 40 | stop_words = set(stopwords.words('english'))
|
42 | 41 | filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
|
43 | 42 |
|
| 43 | + # Count word frequencies |
| 44 | + word_freq = Counter(filtered_words) |
| 45 | + |
| 46 | + # Get unique words |
| 47 | + unique_words = list(set(filtered_words)) |
| 48 | + |
44 | 49 | # Get word embeddings
|
45 |
| - word_embeddings = model.encode(filtered_words) |
| 50 | + word_embeddings = model.encode(unique_words) |
46 | 51 |
|
47 |
| - # Calculate importance scores (you can use different methods here) |
| 52 | + # Calculate importance scores: the mean over axis 1 of the word embeddings (one value per encoded word) |
48 | 53 | importance_scores = np.mean(word_embeddings, axis=1)
|
49 | 54 |
|
50 |
| - # Get top N words based on importance scores |
51 |
| - top_indices = np.argsort(importance_scores)[-top_n:] |
52 |
| - keywords = [filtered_words[i] for i in top_indices[::-1]] |
| 55 | + # Combine frequency and importance: multiply each unique word's frequency by its importance score |
| 56 | + combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)] |
| 57 | + |
| 58 | + # Sort by combined score and get top N |
| 59 | + top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n] |
53 | 60 |
|
54 |
| - return keywords |
| 61 | + return [word for word, _ in top_keywords] |
55 | 62 |
|
56 | 63 | def summarize_text(text, model, num_sentences=3):
|
57 | 64 | # Split the text into sentences
|
|
0 commit comments