Commit 923ac5a

Merge pull request #1504 from A-Akhil/main
Keyword Extraction Feature
2 parents 2b5928b + 20b6338 commit 923ac5a

File tree

NLP/dummysentence.py
NLP/textsummary.py

2 files changed: +154 -65 lines changed

NLP/dummysentence.py

Lines changed: 32 additions & 65 deletions
@@ -1,66 +1,33 @@
-import os
-from sentence_transformers import SentenceTransformer, util
-
-MODEL_NAME = 'all-MiniLM-L6-v2'
-MODEL_FOLDER = 'model'
-
-def load_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return [line.strip() for line in file if line.strip()]
-
-def load_or_download_model():
-    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
-    if os.path.exists(model_path):
-        print(f"Loading model from {model_path}")
-        return SentenceTransformer(model_path)
-    else:
-        print(f"Downloading model {MODEL_NAME}")
-        model = SentenceTransformer(MODEL_NAME)
-        os.makedirs(MODEL_FOLDER, exist_ok=True)
-        model.save(model_path)
-        print(f"Model saved to {model_path}")
-        return model
-
-def find_similar_sentences(query, file_path, top_n=5):
-    # Load the pre-trained model
-    model = load_or_download_model()
-
-    # Load and encode the sentences from the file
-    sentences = load_file(file_path)
-    sentence_embeddings = model.encode(sentences)
-
-    # Encode the query
-    query_embedding = model.encode([query])
-
-    # Calculate cosine similarities
-    cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
-
-    # Get top N results
-    top_results = sorted(zip(sentences, cosine_scores), key=lambda x: x[1], reverse=True)[:top_n]
-
-    return top_results
-
-def main():
-    print("Welcome to the Sentence Similarity Search Tool!")
-
-    # Get user input for query
-    query = input("Enter your query: ")
+import ollama as client
+
+# Function to get response from Ollama API with system prompt
+def get_ollama_response(sentence_number):
+    system_prompt = "You are a bot and speak in one line. Keep your responses short and to the point."
+    stream = client.chat(
+        model="llama3.2",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Generate a unique sentence randomly for sentence number {sentence_number}."}
+        ],
+        stream=True
+    )
 
-    # Get user input for file path
-    file_name = input("Enter the name of your text file (without .txt extension): ")
-    file_path = f"{file_name}.txt"
-
-    try:
-        results = find_similar_sentences(query, file_path)
-
-        print(f"\nTop 5 similar sentences for query: '{query}'\n")
-        for sentence, score in results:
-            print(f"Similarity: {score:.4f}")
-            print(f"Sentence: {sentence}\n")
-    except FileNotFoundError:
-        print(f"Error: The file '{file_path}' was not found. Please check the file name and try again.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-
-if __name__ == "__main__":
-    main()
+    response = ''
+    for chunk in stream:
+        response += chunk['message']['content']
+    return response.strip()  # Strip any leading/trailing spaces
+
+# Open the file in write mode
+with open("generated_sentences.txt", "w") as file:
+    # Loop to generate 100 sentences one by one
+    for i in range(100):
+        # Get the sentence using the function
+        sentence = get_ollama_response(i + 1)
+
+        # Write the sentence to the file on a new line
+        file.write(sentence + "\n")
+
+        # Print the sentence to the console
+        print(f"Sentence {i+1}: {sentence}")
+
+print("File 'generated_sentences.txt' created with 100 sentences, each on a new line.")

NLP/textsummary.py

Lines changed: 122 additions & 0 deletions
@@ -0,0 +1,122 @@
+import os
+import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from collections import Counter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+MODEL_NAME = 'all-MiniLM-L6-v2'
+MODEL_FOLDER = 'model'
+NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')
+
+def load_or_download_model():
+    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
+    if os.path.exists(model_path):
+        print(f"Loading model from {model_path}")
+        return SentenceTransformer(model_path)
+    else:
+        print(f"Downloading model {MODEL_NAME}")
+        model = SentenceTransformer(MODEL_NAME)
+        os.makedirs(MODEL_FOLDER, exist_ok=True)
+        model.save(model_path)
+        print(f"Model saved to {model_path}")
+        return model
+
+def download_nltk_resources():
+    nltk.data.path.append(NLTK_DATA_FOLDER)
+    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
+
+    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
+    for resource, folder in resources:
+        try:
+            nltk.data.find(f'{folder}/{resource}')
+            print(f"{resource} is being Loaded.")
+        except LookupError:
+            print(f"Downloading {resource}...")
+            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)
+
+def extract_keywords(text, model, top_n=10):
+    # Tokenize the text
+    words = word_tokenize(text.lower())
+
+    # Remove stopwords and non-alphanumeric tokens
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
+
+    # Count word frequencies
+    word_freq = Counter(filtered_words)
+
+    # Get unique words
+    unique_words = list(set(filtered_words))
+
+    # Get word embeddings
+    word_embeddings = model.encode(unique_words)
+
+    # Calculate importance scores
+    importance_scores = np.mean(word_embeddings, axis=1)
+
+    # Combine frequency and importance
+    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]
+
+    # Sort by combined score and get top N
+    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]
+
+    return [word for word, _ in top_keywords]
+
+def summarize_text(text, model, num_sentences=3):
+    # Split the text into sentences
+    sentences = sent_tokenize(text)
+
+    # Encode sentences
+    sentence_embeddings = model.encode(sentences)
+
+    # Calculate similarity matrix
+    similarity_matrix = cosine_similarity(sentence_embeddings)
+
+    # Calculate sentence scores
+    sentence_scores = np.sum(similarity_matrix, axis=1)
+
+    # Get top sentences
+    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
+    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]
+
+    return ' '.join(top_sentences)
+
+def main():
+    # Ensure NLTK resources are downloaded
+    download_nltk_resources()
+
+    # Load or download the model
+    model = load_or_download_model()
+
+    # Read input file
+    input_file = 'input.txt'
+    if not os.path.exists(input_file):
+        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
+        return
+
+    try:
+        with open(input_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+    except Exception as e:
+        print(f"Error reading {input_file}: {str(e)}")
+        return
+
+    # Extract keywords
+    keywords = extract_keywords(text, model)
+
+    # Generate summary
+    summary = summarize_text(text, model)
+
+    # Print results
+    print("Keywords:")
+    for i, word in enumerate(keywords, 1):
+        print(f"{i}. {word}")
+
+    print("\nSummary:")
+    print(summary)
+
+if __name__ == "__main__":
+    main()
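
For context, a minimal usage sketch of the new module, not part of this commit: extract_keywords ranks words by frequency multiplied by the mean of their embedding dimensions, and summarize_text keeps the sentences with the highest summed cosine similarity to the rest of the text. The sketch assumes NLP/textsummary.py is on the import path and that the 'all-MiniLM-L6-v2' model and the NLTK 'punkt'/'stopwords' data can be downloaded on first run; the sample text is illustrative:

    # Usage sketch -- the sample text and import path are assumptions for illustration.
    from textsummary import (
        download_nltk_resources,
        load_or_download_model,
        extract_keywords,
        summarize_text,
    )

    download_nltk_resources()          # fetch punkt/stopwords into model/nltk_data if missing
    model = load_or_download_model()   # load or download all-MiniLM-L6-v2

    text = (
        "Sentence transformers map sentences to dense vectors. "
        "Cosine similarity between those vectors measures semantic overlap. "
        "The most central sentences are kept as an extractive summary."
    )

    print(extract_keywords(text, model, top_n=5))        # top-5 keyword list
    print(summarize_text(text, model, num_sentences=2))  # two-sentence extractive summary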
