From b5c786b4ca85cb55ae795809a618deff45449ce0 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:25:09 +0530
Subject: [PATCH 1/4] used to make dummy text using ollama #1334

---
 NLP/dummysentence.py | 97 +++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 65 deletions(-)

diff --git a/NLP/dummysentence.py b/NLP/dummysentence.py
index b07d7b46b..530387c61 100644
--- a/NLP/dummysentence.py
+++ b/NLP/dummysentence.py
@@ -1,66 +1,33 @@
-import os
-from sentence_transformers import SentenceTransformer, util
-
-MODEL_NAME = 'all-MiniLM-L6-v2'
-MODEL_FOLDER = 'model'
-
-def load_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return [line.strip() for line in file if line.strip()]
-
-def load_or_download_model():
-    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
-    if os.path.exists(model_path):
-        print(f"Loading model from {model_path}")
-        return SentenceTransformer(model_path)
-    else:
-        print(f"Downloading model {MODEL_NAME}")
-        model = SentenceTransformer(MODEL_NAME)
-        os.makedirs(MODEL_FOLDER, exist_ok=True)
-        model.save(model_path)
-        print(f"Model saved to {model_path}")
-        return model
-
-def find_similar_sentences(query, file_path, top_n=5):
-    # Load the pre-trained model
-    model = load_or_download_model()
-
-    # Load and encode the sentences from the file
-    sentences = load_file(file_path)
-    sentence_embeddings = model.encode(sentences)
-
-    # Encode the query
-    query_embedding = model.encode([query])
-
-    # Calculate cosine similarities
-    cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
-
-    # Get top N results
-    top_results = sorted(zip(sentences, cosine_scores), key=lambda x: x[1], reverse=True)[:top_n]
-
-    return top_results
-
-def main():
-    print("Welcome to the Sentence Similarity Search Tool!")
-
-    # Get user input for query
-    query = input("Enter your query: ")
+import ollama as client
+
+# Function to get response from Ollama API with system prompt
+def get_ollama_response(sentence_number):
+    system_prompt = "You are a bot and speak in one line. Keep your responses short and to the point."
+    stream = client.chat(
+        model="llama3.2",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Generate a unique sentence randomly for sentence number {sentence_number}."}
+        ],
+        stream=True
+    )
 
-    # Get user input for file path
-    file_name = input("Enter the name of your text file (without .txt extension): ")
-    file_path = f"{file_name}.txt"
-
-    try:
-        results = find_similar_sentences(query, file_path)
-
-        print(f"\nTop 5 similar sentences for query: '{query}'\n")
-        for sentence, score in results:
-            print(f"Similarity: {score:.4f}")
-            print(f"Sentence: {sentence}\n")
-    except FileNotFoundError:
-        print(f"Error: The file '{file_path}' was not found. Please check the file name and try again.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+    response = ''
+    for chunk in stream:
+        response += chunk['message']['content']
+    return response.strip()  # Strip any leading/trailing spaces
+
+# Open the file in write mode
+with open("generated_sentences.txt", "w") as file:
+    # Loop to generate 100 sentences one by one
+    for i in range(100):
+        # Get the sentence using the function
+        sentence = get_ollama_response(i + 1)
+
+        # Write the sentence to the file on a new line
+        file.write(sentence + "\n")
+
+        # Print the sentence to the console
+        print(f"Sentence {i+1}: {sentence}")
+
+print("File 'generated_sentences.txt' created with 100 sentences, each on a new line.")
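Not part of the patch series: the prompt in dummysentence.py asks the model for a "unique" sentence on every call, but nothing in the script enforces uniqueness across the 100 responses. A minimal post-processing sketch, assuming the generated_sentences.txt produced above, that keeps only the first occurrence of each sentence:

    # hypothetical helper, not included in the patch
    seen = set()
    unique_sentences = []
    with open("generated_sentences.txt", "r") as src:
        for line in src:
            sentence = line.strip()
            # keep the first occurrence of each sentence, ignoring case
            if sentence and sentence.lower() not in seen:
                seen.add(sentence.lower())
                unique_sentences.append(sentence)

    with open("generated_sentences_unique.txt", "w") as dst:
        dst.write("\n".join(unique_sentences) + "\n")

    print(f"Kept {len(unique_sentences)} unique sentences.")
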
From d047097050d6e4c53f2f1d781d155d217c21e989 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:41:40 +0530
Subject: [PATCH 2/4] it can find the keywords and generate a summary #1334

---
 NLP/textsummary.py | 110 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 NLP/textsummary.py

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
new file mode 100644
index 000000000..97d873a61
--- /dev/null
+++ b/NLP/textsummary.py
@@ -0,0 +1,110 @@
+import os
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.tokenize import sent_tokenize
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from collections import Counter
+
+MODEL_NAME = 'all-MiniLM-L6-v2'
+MODEL_FOLDER = 'model'
+
+def load_or_download_model():
+    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
+    if os.path.exists(model_path):
+        print(f"Loading model from {model_path}")
+        return SentenceTransformer(model_path)
+    else:
+        print(f"Downloading model {MODEL_NAME}")
+        model = SentenceTransformer(MODEL_NAME)
+        os.makedirs(MODEL_FOLDER, exist_ok=True)
+        model.save(model_path)
+        print(f"Model saved to {model_path}")
+        return model
+
+def download_nltk_resources():
+    resources = ['punkt', 'stopwords']
+    for resource in resources:
+        try:
+            nltk.data.find(f'tokenizers/{resource}')
+        except LookupError:
+            print(f"Downloading {resource}...")
+            nltk.download(resource, quiet=True)
+
+def extract_keywords(text, model, top_n=10):
+    # Tokenize the text
+    words = word_tokenize(text.lower())
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
+
+    # Get word embeddings
+    word_embeddings = model.encode(filtered_words)
+
+    # Calculate importance scores (you can use different methods here)
+    importance_scores = np.mean(word_embeddings, axis=1)
+
+    # Get top N words based on importance scores
+    top_indices = np.argsort(importance_scores)[-top_n:]
+    keywords = [filtered_words[i] for i in top_indices[::-1]]
+
+    return keywords
+
+def summarize_text(text, model, num_sentences=3):
+    # Split the text into sentences
+    sentences = sent_tokenize(text)
+
+    # Encode sentences
+    sentence_embeddings = model.encode(sentences)
+
+    # Calculate similarity matrix
+    similarity_matrix = cosine_similarity(sentence_embeddings)
+
+    # Calculate sentence scores
+    sentence_scores = np.sum(similarity_matrix, axis=1)
+
+    # Get top sentences
+    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
+    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]
+
+    return ' '.join(top_sentences)
+
+def main():
+    # Ensure NLTK resources are downloaded
+    download_nltk_resources()
+
+    # Load or download the model
+    model = load_or_download_model()
+
+    # Read input file
+    input_file = 'input.txt'
+    if not os.path.exists(input_file):
+        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
+        return
+
+    try:
+        with open(input_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+    except Exception as e:
+        print(f"Error reading {input_file}: {str(e)}")
+        return
+
+    # Extract keywords
+    keywords = extract_keywords(text, model)
+
+    # Generate summary
+    summary = summarize_text(text, model)
+
+    # Print results
+    print("Keywords:")
+    for i, word in enumerate(keywords, 1):
+        print(f"{i}. {word}")
+
+    print("\nSummary:")
+    print(summary)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
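For reference, a minimal usage sketch (not in the patches) that reuses the helpers from NLP/textsummary.py on an in-memory string instead of input.txt; it assumes the module is importable from the working directory and that all-MiniLM-L6-v2 is downloadable or already cached under model/:

    from textsummary import (download_nltk_resources, load_or_download_model,
                             extract_keywords, summarize_text)

    download_nltk_resources()           # fetch punkt/stopwords if missing
    model = load_or_download_model()    # cached under model/ after the first run

    text = (
        "Sentence transformers encode sentences into dense vectors. "
        "The vectors can be compared with cosine similarity. "
        "Sentences similar to many other sentences are treated as central, "
        "so they are picked for the summary."
    )
    print("Keywords:", extract_keywords(text, model, top_n=5))
    print("Summary:", summarize_text(text, model, num_sentences=2))
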
From b7d5feaf81f9c1f31672d03e583da85cdcbea1e9 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:42:55 +0530
Subject: [PATCH 3/4] improved the keyword extraction #1334

---
 NLP/textsummary.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
index 97d873a61..71d566b28 100644
--- a/NLP/textsummary.py
+++ b/NLP/textsummary.py
@@ -2,10 +2,9 @@
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize import sent_tokenize, word_tokenize
 import nltk
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from collections import Counter
 
 MODEL_NAME = 'all-MiniLM-L6-v2'
 MODEL_FOLDER = 'model'
@@ -37,21 +36,29 @@ def extract_keywords(text, model, top_n=10):
     # Tokenize the text
     words = word_tokenize(text.lower())
 
-    # Remove stopwords
+    # Remove stopwords and non-alphanumeric tokens
     stop_words = set(stopwords.words('english'))
     filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
 
+    # Count word frequencies
+    word_freq = Counter(filtered_words)
+
+    # Get unique words
+    unique_words = list(set(filtered_words))
+
     # Get word embeddings
-    word_embeddings = model.encode(filtered_words)
+    word_embeddings = model.encode(unique_words)
 
-    # Calculate importance scores (you can use different methods here)
+    # Calculate importance scores
    importance_scores = np.mean(word_embeddings, axis=1)
 
-    # Get top N words based on importance scores
-    top_indices = np.argsort(importance_scores)[-top_n:]
-    keywords = [filtered_words[i] for i in top_indices[::-1]]
+    # Combine frequency and importance
+    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]
+
+    # Sort by combined score and get top N
+    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]
 
-    return keywords
+    return [word for word, _ in top_keywords]
 
 def summarize_text(text, model, num_sentences=3):
     # Split the text into sentences
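Illustrative only (not in the patches): a toy sketch of the scoring this commit introduces, where each unique word's score is its frequency multiplied by the mean of its embedding dimensions; random vectors stand in for model.encode(unique_words):

    import numpy as np
    from collections import Counter

    words = ["model", "data", "model", "training", "data", "model"]
    word_freq = Counter(words)
    unique_words = list(set(words))

    rng = np.random.default_rng(0)
    # stand-in for model.encode(unique_words); all-MiniLM-L6-v2 vectors are 384-dimensional
    word_embeddings = rng.random((len(unique_words), 384))
    importance_scores = np.mean(word_embeddings, axis=1)

    combined_scores = [(word, word_freq[word] * importance_scores[i])
                       for i, word in enumerate(unique_words)]
    print(sorted(combined_scores, key=lambda x: x[1], reverse=True))
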
From 20b63385bbce255c7a0123f55612d4bfe679e475 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:51:51 +0530
Subject: [PATCH 4/4] Can fetch from local dir #1334

---
 NLP/textsummary.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
index 71d566b28..b11d3f872 100644
--- a/NLP/textsummary.py
+++ b/NLP/textsummary.py
@@ -1,14 +1,15 @@
 import os
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from nltk.tokenize import sent_tokenize, word_tokenize
 import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from collections import Counter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 
 MODEL_NAME = 'all-MiniLM-L6-v2'
 MODEL_FOLDER = 'model'
+NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')
 
 def load_or_download_model():
     model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
@@ -24,13 +25,17 @@ def load_or_download_model():
         return model
 
 def download_nltk_resources():
-    resources = ['punkt', 'stopwords']
-    for resource in resources:
+    nltk.data.path.append(NLTK_DATA_FOLDER)
+    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
+
+    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
+    for resource, folder in resources:
         try:
-            nltk.data.find(f'tokenizers/{resource}')
+            nltk.data.find(f'{folder}/{resource}')
+            print(f"{resource} is being Loaded.")
         except LookupError:
             print(f"Downloading {resource}...")
-            nltk.download(resource, quiet=True)
+            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)
 
 def extract_keywords(text, model, top_n=10):
     # Tokenize the text
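A small verification sketch (not in the patches), assuming the same folder layout the commit sets up: after the local directory is appended to nltk.data.path and the resources are fetched, both lookups used by textsummary.py should resolve from model/nltk_data on a machine with no prior NLTK downloads:

    import os
    import nltk

    NLTK_DATA_FOLDER = os.path.join('model', 'nltk_data')
    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
    nltk.data.path.append(NLTK_DATA_FOLDER)

    nltk.download('punkt', download_dir=NLTK_DATA_FOLDER, quiet=True)
    nltk.download('stopwords', download_dir=NLTK_DATA_FOLDER, quiet=True)

    # prints the resolved on-disk locations of the two resources
    print(nltk.data.find('tokenizers/punkt'))
    print(nltk.data.find('corpora/stopwords'))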