
Commit d047097

Add a script that can find keywords and generate a summary #1334

1 parent b5c786b

File tree: 1 file changed, +110 −0 lines


NLP/textsummary.py

@@ -0,0 +1,110 @@
import os

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'


def load_or_download_model():
    # Load the sentence-transformer model from disk, downloading it on first use
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)

    print(f"Downloading model {MODEL_NAME}")
    model = SentenceTransformer(MODEL_NAME)
    os.makedirs(MODEL_FOLDER, exist_ok=True)
    model.save(model_path)
    print(f"Model saved to {model_path}")
    return model


def download_nltk_resources():
    # Each resource lives under a different NLTK data directory, so look it
    # up by its full path ('punkt' is a tokenizer, 'stopwords' is a corpus)
    resources = {'punkt': 'tokenizers/punkt', 'stopwords': 'corpora/stopwords'}
    for name, path in resources.items():
        try:
            nltk.data.find(path)
        except LookupError:
            print(f"Downloading {name}...")
            nltk.download(name, quiet=True)


def extract_keywords(text, model, top_n=10):
    # Tokenize the lowercased text
    words = word_tokenize(text.lower())

    # Drop stopwords and punctuation; deduplicate while preserving order
    stop_words = set(stopwords.words('english'))
    filtered_words = list(dict.fromkeys(
        word for word in words if word.isalnum() and word not in stop_words
    ))
    if not filtered_words:
        return []

    # Embed each candidate word
    word_embeddings = model.encode(filtered_words)

    # Score each word by the mean of its embedding dimensions.
    # This is a rough heuristic; other scoring methods can be swapped in here.
    importance_scores = np.mean(word_embeddings, axis=1)

    # Return the top N words, highest score first
    top_indices = np.argsort(importance_scores)[-top_n:]
    return [filtered_words[i] for i in top_indices[::-1]]


def summarize_text(text, model, num_sentences=3):
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Encode sentences
    sentence_embeddings = model.encode(sentences)

    # Pairwise cosine similarity between all sentences
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Score each sentence by how similar it is to the rest of the text
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # Keep the top-scoring sentences, restored to their original order
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

    return ' '.join(top_sentences)


def main():
    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    # Load or download the model
    model = load_or_download_model()

    # Read input file
    input_file = 'input.txt'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading {input_file}: {e}")
        return

    # Extract keywords
    keywords = extract_keywords(text, model)

    # Generate summary
    summary = summarize_text(text, model)

    # Print results
    print("Keywords:")
    for i, word in enumerate(keywords, 1):
        print(f"{i}. {word}")

    print("\nSummary:")
    print(summary)


if __name__ == "__main__":
    main()
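
A note on the approach: summarize_text scores each sentence by summing its cosine similarities to every other sentence, so the summary keeps the sentences most central to the document, a simplified degree-centrality scheme in the spirit of LexRank-style extractive summarizers. The keyword score in extract_keywords (the mean over embedding dimensions) is a rough placeholder, as its inline comment says. Below is a minimal sketch of one common alternative, scoring each candidate word by cosine similarity to the whole-document embedding; the function name and signature are illustrative, not part of this commit:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def rank_keywords_by_doc_similarity(text, candidates, model, top_n=10):
    # Illustrative sketch, not part of the commit.
    # Embed the full document once, and each candidate word.
    doc_embedding = model.encode([text])        # shape (1, dim)
    word_embeddings = model.encode(candidates)  # shape (n, dim)

    # Score each word by its similarity to the document as a whole.
    scores = cosine_similarity(word_embeddings, doc_embedding).ravel()

    # Highest-scoring candidates first.
    top = np.argsort(scores)[::-1][:top_n]
    return [candidates[i] for i in top]

Called with the filtered_words list from extract_keywords and the same SentenceTransformer model, this would replace the mean-based score without changing anything else in the script.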
