import os
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

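# Assumed setup (not specified in the original commit): the third-party imports
# above come from these pip packages.
#   pip install nltk sentence-transformers scikit-learn numpy
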
MODEL_NAME = 'all-MiniLM-L6-v2'
MODEL_FOLDER = 'model'
NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')

def load_or_download_model():
    """Load the sentence-transformer from the local model folder,
    downloading and saving it there on first use."""
    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
    if os.path.exists(model_path):
        print(f"Loading model from {model_path}")
        return SentenceTransformer(model_path)
    else:
        print(f"Downloading model {MODEL_NAME}")
        model = SentenceTransformer(MODEL_NAME)
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        model.save(model_path)
        print(f"Model saved to {model_path}")
        return model

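# Illustrative behaviour (assumes network access on the very first call):
#   model = load_or_download_model()  # first run: downloads, saves to model/all-MiniLM-L6-v2
#   model = load_or_download_model()  # later runs: loads the saved copy, no network needed
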
def download_nltk_resources():
    """Ensure the NLTK tokenizer and stopword data are available locally."""
    nltk.data.path.append(NLTK_DATA_FOLDER)
    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)

    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
    for resource, folder in resources:
        try:
            nltk.data.find(f'{folder}/{resource}')
            print(f"{resource} is already available.")
        except LookupError:
            print(f"Downloading {resource}...")
            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)

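# Note (version-dependent, worth checking against your NLTK install): newer NLTK
# releases ship the Punkt sentence-tokenizer data as a separate 'punkt_tab'
# resource. If sent_tokenize raises a LookupError even though 'punkt' is present,
# adding ('punkt_tab', 'tokenizers') to the resources list above should fix it.
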
def extract_keywords(text, model, top_n=10):
    """Return the top_n keywords, ranked by frequency weighted by a rough
    embedding-based importance score."""
    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords and non-alphanumeric tokens
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]

    # Count word frequencies
    word_freq = Counter(filtered_words)

    # Get unique words
    unique_words = list(set(filtered_words))

    # Get word embeddings
    word_embeddings = model.encode(unique_words)

    # Calculate importance scores (a rough heuristic: the mean value of each
    # word's embedding vector)
    importance_scores = np.mean(word_embeddings, axis=1)

    # Combine frequency and importance
    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]

    # Sort by combined score and get top N
    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]

    return [word for word, _ in top_keywords]

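# Illustrative call (hypothetical text; the actual ranking depends on the
# embedding model):
#   extract_keywords("Cats purr. Cats nap. Dogs bark loudly.", model, top_n=3)
# might return something like ['cats', 'dogs', 'purr'].
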
def summarize_text(text, model, num_sentences=3):
    """Extractive summary: pick the num_sentences sentences most similar to
    the rest of the text, and return them in their original order."""
    # Split the text into sentences
    sentences = sent_tokenize(text)

    # Encode sentences
    sentence_embeddings = model.encode(sentences)

    # Calculate similarity matrix
    similarity_matrix = cosine_similarity(sentence_embeddings)

    # Calculate sentence scores: each row sum measures how central a sentence
    # is to the whole text
    sentence_scores = np.sum(similarity_matrix, axis=1)

    # Get top sentences, restoring document order
    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]

    return ' '.join(top_sentences)

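# The row-sum scoring above is a simple degree-centrality heuristic (a
# non-iterative relative of TextRank): a sentence similar to many other
# sentences is treated as representative of the text. Illustrative call:
#   summarize_text(text, model, num_sentences=2)
# returns the two most central sentences, joined in document order.
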
def main():
    # Ensure NLTK resources are downloaded
    download_nltk_resources()

    # Load or download the model
    model = load_or_download_model()

    # Read input file
    input_file = 'input.txt'
    if not os.path.exists(input_file):
        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
        return

    try:
        with open(input_file, 'r', encoding='utf-8') as file:
            text = file.read()
    except Exception as e:
        print(f"Error reading {input_file}: {e}")
        return

    # Extract keywords
    keywords = extract_keywords(text, model)

    # Generate summary
    summary = summarize_text(text, model)

    # Print results
    print("Keywords:")
    for i, word in enumerate(keywords, 1):
        print(f"{i}. {word}")

    print("\nSummary:")
    print(summary)

if __name__ == "__main__":
    main()
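
# Example run (the script's filename isn't given in the commit; assuming it is
# saved as, say, keyword_summarizer.py with an input.txt alongside it):
#   $ python keyword_summarizer.py
# prints a numbered keyword list followed by a three-sentence summary.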