import os

import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'

def load_or_download_model():
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)
    else:
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model

def download_nltk_resources():
    # 'punkt' lives under tokenizers/, 'stopwords' under corpora/ in the NLTK data tree
    resources = {'punkt': 'tokenizers/punkt', 'stopwords': 'corpora/stopwords'}
    for name, path in resources.items():
        try:
            nltk.data.find(path)
        except LookupError:
            print(f"Downloading {name}...")
            nltk.download(name, quiet=True)

def extract_keywords(text, model, top_n=10):
    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords and punctuation, de-duplicating so each word is scored once
    stop_words = set(stopwords.words('english'))
    filtered_words = list(dict.fromkeys(
        word for word in words if word.isalnum() and word not in stop_words
    ))
    if not filtered_words:
        return []

    # Get word embeddings
    word_embeddings = model.encode(filtered_words)

    # Calculate importance scores (you can use different methods here)
    importance_scores = np.mean(word_embeddings, axis=1)

    # Get top N words based on importance scores
    top_indices = np.argsort(importance_scores)[-top_n:]
    keywords = [filtered_words[i] for i in top_indices[::-1]]

    return keywords

def summarize_text(text, model, num_sentences=3):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Encode sentences
    sentence_embeddings = model.encode(sentences)

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Calculate sentence scores
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # Get top sentences, kept in their original order
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

    return ' '.join(top_sentences)

def main():
    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    # Load or download the model
    model = load_or_download_model()

    # Read input file
    input_file = 'input.txt'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
        return

    # Extract keywords
    keywords = extract_keywords(text, model)

    # Generate summary
    summary = summarize_text(text, model)

    # Print results
    print("Keywords:")
    for i, word in enumerate(keywords, 1):
        print(f"{i}. {word}")

    print("\nSummary:")
    print(summary)

if __name__ == "__main__":
    main()
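
# Usage sketch (the script filename below is an assumption, not given in the source):
#   1. Install the dependencies used above: sentence-transformers, scikit-learn, numpy, nltk.
#   2. Place the text to analyze in input.txt next to the script.
#   3. Run `python keyword_summary.py`; the script prints the top keywords and a
#      three-sentence extractive summary of the input text.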