From b5c786b4ca85cb55ae795809a618deff45449ce0 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:25:09 +0530
Subject: [PATCH 1/4] used to make dummy text using ollama #1334

---
 NLP/dummysentence.py | 97 +++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 65 deletions(-)

diff --git a/NLP/dummysentence.py b/NLP/dummysentence.py
index b07d7b46b..530387c61 100644
--- a/NLP/dummysentence.py
+++ b/NLP/dummysentence.py
@@ -1,66 +1,33 @@
-import os
-from sentence_transformers import SentenceTransformer, util
-
-MODEL_NAME = 'all-MiniLM-L6-v2'
-MODEL_FOLDER = 'model'
-
-def load_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return [line.strip() for line in file if line.strip()]
-
-def load_or_download_model():
-    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
-    if os.path.exists(model_path):
-        print(f"Loading model from {model_path}")
-        return SentenceTransformer(model_path)
-    else:
-        print(f"Downloading model {MODEL_NAME}")
-        model = SentenceTransformer(MODEL_NAME)
-        os.makedirs(MODEL_FOLDER, exist_ok=True)
-        model.save(model_path)
-        print(f"Model saved to {model_path}")
-        return model
-
-def find_similar_sentences(query, file_path, top_n=5):
-    # Load the pre-trained model
-    model = load_or_download_model()
-
-    # Load and encode the sentences from the file
-    sentences = load_file(file_path)
-    sentence_embeddings = model.encode(sentences)
-
-    # Encode the query
-    query_embedding = model.encode([query])
-
-    # Calculate cosine similarities
-    cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
-
-    # Get top N results
-    top_results = sorted(zip(sentences, cosine_scores), key=lambda x: x[1], reverse=True)[:top_n]
-
-    return top_results
-
-def main():
-    print("Welcome to the Sentence Similarity Search Tool!")
-
-    # Get user input for query
-    query = input("Enter your query: ")
+import ollama as client
+
+# Function to get response from Ollama API with system prompt
+def get_ollama_response(sentence_number):
+    system_prompt = "You are a bot and speak in one line. Keep your responses short and to the point."
+    stream = client.chat(
+        model="llama3.2",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Generate a unique sentence randomly for sentence number {sentence_number}."}
+        ],
+        stream=True
+    )
 
-    # Get user input for file path
-    file_name = input("Enter the name of your text file (without .txt extension): ")
-    file_path = f"{file_name}.txt"
-
-    try:
-        results = find_similar_sentences(query, file_path)
-
-        print(f"\nTop 5 similar sentences for query: '{query}'\n")
-        for sentence, score in results:
-            print(f"Similarity: {score:.4f}")
-            print(f"Sentence: {sentence}\n")
-    except FileNotFoundError:
-        print(f"Error: The file '{file_path}' was not found. Please check the file name and try again.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+    response = ''
+    for chunk in stream:
+        response += chunk['message']['content']
+    return response.strip()  # Strip any leading/trailing spaces
+
+# Open the file in write mode
+with open("generated_sentences.txt", "w") as file:
+    # Loop to generate 100 sentences one by one
+    for i in range(100):
+        # Get the sentence using the function
+        sentence = get_ollama_response(i + 1)
+
+        # Write the sentence to the file on a new line
+        file.write(sentence + "\n")
+
+        # Print the sentence to the console
+        print(f"Sentence {i+1}: {sentence}")
+
+print("File 'generated_sentences.txt' created with 100 sentences, each on a new line.")
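Not part of the patch series: the prompt in dummysentence.py asks the model for a "unique" sentence on every call, but nothing in the script enforces uniqueness across the 100 responses. A minimal post-processing sketch, assuming the generated_sentences.txt produced above, that keeps only the first occurrence of each sentence:

    # hypothetical helper, not included in the patch
    seen = set()
    unique_sentences = []
    with open("generated_sentences.txt", "r") as src:
        for line in src:
            sentence = line.strip()
            # keep the first occurrence of each sentence, ignoring case
            if sentence and sentence.lower() not in seen:
                seen.add(sentence.lower())
                unique_sentences.append(sentence)

    with open("generated_sentences_unique.txt", "w") as dst:
        dst.write("\n".join(unique_sentences) + "\n")

    print(f"Kept {len(unique_sentences)} unique sentences.")
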
From d047097050d6e4c53f2f1d781d155d217c21e989 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:41:40 +0530
Subject: [PATCH 2/4] it can find the keywords and generate a summary #1334

---
 NLP/textsummary.py | 110 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 NLP/textsummary.py

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
new file mode 100644
index 000000000..97d873a61
--- /dev/null
+++ b/NLP/textsummary.py
@@ -0,0 +1,110 @@
+import os
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+from nltk.tokenize import sent_tokenize
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from collections import Counter
+
+MODEL_NAME = 'all-MiniLM-L6-v2'
+MODEL_FOLDER = 'model'
+
+def load_or_download_model():
+    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
+    if os.path.exists(model_path):
+        print(f"Loading model from {model_path}")
+        return SentenceTransformer(model_path)
+    else:
+        print(f"Downloading model {MODEL_NAME}")
+        model = SentenceTransformer(MODEL_NAME)
+        os.makedirs(MODEL_FOLDER, exist_ok=True)
+        model.save(model_path)
+        print(f"Model saved to {model_path}")
+        return model
+
+def download_nltk_resources():
+    resources = ['punkt', 'stopwords']
+    for resource in resources:
+        try:
+            nltk.data.find(f'tokenizers/{resource}')
+        except LookupError:
+            print(f"Downloading {resource}...")
+            nltk.download(resource, quiet=True)
+
+def extract_keywords(text, model, top_n=10):
+    # Tokenize the text
+    words = word_tokenize(text.lower())
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
+
+    # Get word embeddings
+    word_embeddings = model.encode(filtered_words)
+
+    # Calculate importance scores (you can use different methods here)
+    importance_scores = np.mean(word_embeddings, axis=1)
+
+    # Get top N words based on importance scores
+    top_indices = np.argsort(importance_scores)[-top_n:]
+    keywords = [filtered_words[i] for i in top_indices[::-1]]
+
+    return keywords
+
+def summarize_text(text, model, num_sentences=3):
+    # Split the text into sentences
+    sentences = sent_tokenize(text)
+
+    # Encode sentences
+    sentence_embeddings = model.encode(sentences)
+
+    # Calculate similarity matrix
+    similarity_matrix = cosine_similarity(sentence_embeddings)
+
+    # Calculate sentence scores
+    sentence_scores = np.sum(similarity_matrix, axis=1)
+
+    # Get top sentences
+    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
+    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]
+
+    return ' '.join(top_sentences)
+
+def main():
+    # Ensure NLTK resources are downloaded
+    download_nltk_resources()
+
+    # Load or download the model
+    model = load_or_download_model()
+
+    # Read input file
+    input_file = 'input.txt'
+    if not os.path.exists(input_file):
+        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
+        return
+
+    try:
+        with open(input_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+    except Exception as e:
+        print(f"Error reading {input_file}: {str(e)}")
+        return
+
+    # Extract keywords
+    keywords = extract_keywords(text, model)
+
+    # Generate summary
+    summary = summarize_text(text, model)
+
+    # Print results
+    print("Keywords:")
+    for i, word in enumerate(keywords, 1):
+        print(f"{i}. {word}")
+
+    print("\nSummary:")
+    print(summary)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
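For reference, a minimal usage sketch (not in the patches) that reuses the helpers from NLP/textsummary.py on an in-memory string instead of input.txt; it assumes the module is importable from the working directory and that all-MiniLM-L6-v2 is downloadable or already cached under model/:

    from textsummary import (download_nltk_resources, load_or_download_model,
                             extract_keywords, summarize_text)

    download_nltk_resources()           # fetch punkt/stopwords if missing
    model = load_or_download_model()    # cached under model/ after the first run

    text = (
        "Sentence transformers encode sentences into dense vectors. "
        "The vectors can be compared with cosine similarity. "
        "Sentences similar to many other sentences are treated as central, "
        "so they are picked for the summary."
    )
    print("Keywords:", extract_keywords(text, model, top_n=5))
    print("Summary:", summarize_text(text, model, num_sentences=2))
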
From b7d5feaf81f9c1f31672d03e583da85cdcbea1e9 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:42:55 +0530
Subject: [PATCH 3/4] improved the keyword extraction #1334

---
 NLP/textsummary.py | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
index 97d873a61..71d566b28 100644
--- a/NLP/textsummary.py
+++ b/NLP/textsummary.py
@@ -2,10 +2,9 @@
 from sentence_transformers import SentenceTransformer
 from sklearn.metrics.pairwise import cosine_similarity
 import numpy as np
-from nltk.tokenize import sent_tokenize
+from nltk.tokenize import sent_tokenize, word_tokenize
 import nltk
 from nltk.corpus import stopwords
-from nltk.tokenize import word_tokenize
 from collections import Counter
 
 MODEL_NAME = 'all-MiniLM-L6-v2'
 MODEL_FOLDER = 'model'
@@ -37,21 +36,29 @@ def extract_keywords(text, model, top_n=10):
     # Tokenize the text
     words = word_tokenize(text.lower())
 
-    # Remove stopwords
+    # Remove stopwords and non-alphanumeric tokens
     stop_words = set(stopwords.words('english'))
     filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
 
+    # Count word frequencies
+    word_freq = Counter(filtered_words)
+
+    # Get unique words
+    unique_words = list(set(filtered_words))
+
     # Get word embeddings
-    word_embeddings = model.encode(filtered_words)
+    word_embeddings = model.encode(unique_words)
 
-    # Calculate importance scores (you can use different methods here)
+    # Calculate importance scores
    importance_scores = np.mean(word_embeddings, axis=1)
 
-    # Get top N words based on importance scores
-    top_indices = np.argsort(importance_scores)[-top_n:]
-    keywords = [filtered_words[i] for i in top_indices[::-1]]
+    # Combine frequency and importance
+    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]
+
+    # Sort by combined score and get top N
+    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]
 
-    return keywords
+    return [word for word, _ in top_keywords]
 
 def summarize_text(text, model, num_sentences=3):
     # Split the text into sentences
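Illustrative only (not in the patches): a toy sketch of the scoring this commit introduces, where each unique word's score is its frequency multiplied by the mean of its embedding dimensions; random vectors stand in for model.encode(unique_words):

    import numpy as np
    from collections import Counter

    words = ["model", "data", "model", "training", "data", "model"]
    word_freq = Counter(words)
    unique_words = list(set(words))

    rng = np.random.default_rng(0)
    # stand-in for model.encode(unique_words); all-MiniLM-L6-v2 vectors are 384-dimensional
    word_embeddings = rng.random((len(unique_words), 384))
    importance_scores = np.mean(word_embeddings, axis=1)

    combined_scores = [(word, word_freq[word] * importance_scores[i])
                       for i, word in enumerate(unique_words)]
    print(sorted(combined_scores, key=lambda x: x[1], reverse=True))
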
From 20b63385bbce255c7a0123f55612d4bfe679e475 Mon Sep 17 00:00:00 2001
From: A-Akhil
Date: Sun, 20 Oct 2024 18:51:51 +0530
Subject: [PATCH 4/4] Can fetch from local dir #1334

---
 NLP/textsummary.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/NLP/textsummary.py b/NLP/textsummary.py
index 71d566b28..b11d3f872 100644
--- a/NLP/textsummary.py
+++ b/NLP/textsummary.py
@@ -1,14 +1,15 @@
 import os
-from sentence_transformers import SentenceTransformer
-from sklearn.metrics.pairwise import cosine_similarity
-import numpy as np
-from nltk.tokenize import sent_tokenize, word_tokenize
 import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
 from nltk.corpus import stopwords
 from collections import Counter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
 
 MODEL_NAME = 'all-MiniLM-L6-v2'
 MODEL_FOLDER = 'model'
+NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')
 
 def load_or_download_model():
     model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
@@ -24,13 +25,17 @@ def load_or_download_model():
         return model
 
 def download_nltk_resources():
-    resources = ['punkt', 'stopwords']
-    for resource in resources:
+    nltk.data.path.append(NLTK_DATA_FOLDER)
+    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
+
+    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
+    for resource, folder in resources:
         try:
-            nltk.data.find(f'tokenizers/{resource}')
+            nltk.data.find(f'{folder}/{resource}')
+            print(f"{resource} is being Loaded.")
         except LookupError:
             print(f"Downloading {resource}...")
-            nltk.download(resource, quiet=True)
+            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)
 
 def extract_keywords(text, model, top_n=10):
     # Tokenize the text
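A small verification sketch (not in the patches), assuming the same folder layout the commit sets up: after the local directory is appended to nltk.data.path and the resources are fetched, both lookups used by textsummary.py should resolve from model/nltk_data on a machine with no prior NLTK downloads:

    import os
    import nltk

    NLTK_DATA_FOLDER = os.path.join('model', 'nltk_data')
    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
    nltk.data.path.append(NLTK_DATA_FOLDER)

    nltk.download('punkt', download_dir=NLTK_DATA_FOLDER, quiet=True)
    nltk.download('stopwords', download_dir=NLTK_DATA_FOLDER, quiet=True)

    # prints the resolved on-disk locations of the two resources
    print(nltk.data.find('tokenizers/punkt'))
    print(nltk.data.find('corpora/stopwords'))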