import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

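# Assumed setup (not specified in the original commit): the third-party imports
# above come from these pip packages.
#   pip install nltk sentence-transformers scikit-learn numpy
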
MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'
NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')

def load_or_download_model():
    """Load the sentence-transformer from the local model folder,
    downloading and saving it there on first use."""
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)
    else:
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model

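# Illustrative behaviour (assumes network access on the very first call):
#   model = load_or_download_model()  # first run: downloads, saves to model/all-MiniLM-L6-v2
#   model = load_or_download_model()  # later runs: loads the saved copy, no network needed
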
def download_nltk_resources():
    """Ensure the NLTK tokenizer and stopword data are available locally."""
    nltk.data.path.append(NLTK_DATA_FOLDER)
    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)

    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
    for resource, folder in resources:
        try:
            nltk.data.find(f'{folder}/{resource}')
            print(f"{resource} is already available.")
        except LookupError:
            print(f"Downloading {resource}...")
            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)

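# Note (version-dependent, worth checking against your NLTK install): newer NLTK
# releases ship the Punkt sentence-tokenizer data as a separate 'punkt_tab'
# resource. If sent_tokenize raises a LookupError even though 'punkt' is present,
# adding ('punkt_tab', 'tokenizers') to the resources list above should fix it.
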
def extract_keywords(text, model, top_n=10):
    """Return the top_n keywords, ranked by frequency weighted by a rough
    embedding-based importance score."""
    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords and non-alphanumeric tokens
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

    # Count word frequencies
    word_freq = Counter(filtered_words)

    # Get unique words
    unique_words = list(set(filtered_words))

    # Get word embeddings
    word_embeddings = model.encode(unique_words)

    # Calculate importance scores (a rough heuristic: the mean value of each
    # word's embedding vector)
    importance_scores = np.mean(word_embeddings, axis=1)

    # Combine frequency and importance
    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]

    # Sort by combined score and get top N
    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]

    return [word for word, _ in top_keywords]

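# Illustrative call (hypothetical text; the actual ranking depends on the
# embedding model):
#   extract_keywords("Cats purr. Cats nap. Dogs bark loudly.", model, top_n=3)
# might return something like ['cats', 'dogs', 'purr'].
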
def summarize_text(text, model, num_sentences=3):
    """Extractive summary: pick the num_sentences sentences most similar to
    the rest of the text, and return them in their original order."""
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Encode sentences
    sentence_embeddings = model.encode(sentences)

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Calculate sentence scores: each row sum measures how central a sentence
    # is to the whole text
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # Get top sentences, restoring document order
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

    return ' '.join(top_sentences)

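# The row-sum scoring above is a simple degree-centrality heuristic (a
# non-iterative relative of TextRank): a sentence similar to many other
# sentences is treated as representative of the text. Illustrative call:
#   summarize_text(text, model, num_sentences=2)
# returns the two most central sentences, joined in document order.
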
def main():
    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    # Load or download the model
    model = load_or_download_model()

    # Read input file
    input_file = 'input.txt'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
        return

    # Extract keywords
    keywords = extract_keywords(text, model)

    # Generate summary
    summary = summarize_text(text, model)

    # Print results
    print("Keywords:")
    for i, word in enumerate(keywords, 1):
        print(f"{i}. {word}")

    print("\nSummary:")
    print(summary)

if __name__ == "__main__":
    main()
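
# Example run (the script's filename isn't given in the commit; assuming it is
# saved as, say, keyword_summarizer.py with an input.txt alongside it):
#   $ python keyword_summarizer.py
# prints a numbered keyword list followed by a three-sentence summary.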