diff --git a/NLP/dummysentence.py b/NLP/dummysentence.py
index b07d7b46b..530387c61 100644
--- a/NLP/dummysentence.py
+++ b/NLP/dummysentence.py
@@ -1,66 +1,33 @@
-import os
-from sentence_transformers import SentenceTransformer, util
-
-MODEL_NAME = 'all-MiniLM-L6-v2'
-MODEL_FOLDER = 'model'
-
-def load_file(file_path):
-    with open(file_path, 'r', encoding='utf-8') as file:
-        return [line.strip() for line in file if line.strip()]
-
-def load_or_download_model():
-    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
-    if os.path.exists(model_path):
-        print(f"Loading model from {model_path}")
-        return SentenceTransformer(model_path)
-    else:
-        print(f"Downloading model {MODEL_NAME}")
-        model = SentenceTransformer(MODEL_NAME)
-        os.makedirs(MODEL_FOLDER, exist_ok=True)
-        model.save(model_path)
-        print(f"Model saved to {model_path}")
-        return model
-
-def find_similar_sentences(query, file_path, top_n=5):
-    # Load the pre-trained model
-    model = load_or_download_model()
-
-    # Load and encode the sentences from the file
-    sentences = load_file(file_path)
-    sentence_embeddings = model.encode(sentences)
-
-    # Encode the query
-    query_embedding = model.encode([query])
-
-    # Calculate cosine similarities
-    cosine_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
-
-    # Get top N results
-    top_results = sorted(zip(sentences, cosine_scores), key=lambda x: x[1], reverse=True)[:top_n]
-
-    return top_results
-
-def main():
-    print("Welcome to the Sentence Similarity Search Tool!")
-
-    # Get user input for query
-    query = input("Enter your query: ")
+import ollama as client
+
+# Function to get response from Ollama API with system prompt
+def get_ollama_response(sentence_number):
+    system_prompt = "You are a bot and speak in one line. Keep your responses short and to the point."
+    stream = client.chat(
+        model="llama3.2",
+        messages=[
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": f"Generate a unique sentence randomly for sentence number {sentence_number}."}
+        ],
+        stream=True
+    )
 
-    # Get user input for file path
-    file_name = input("Enter the name of your text file (without .txt extension): ")
-    file_path = f"{file_name}.txt"
-
-    try:
-        results = find_similar_sentences(query, file_path)
-
-        print(f"\nTop 5 similar sentences for query: '{query}'\n")
-        for sentence, score in results:
-            print(f"Similarity: {score:.4f}")
-            print(f"Sentence: {sentence}\n")
-    except FileNotFoundError:
-        print(f"Error: The file '{file_path}' was not found. Please check the file name and try again.")
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+    response = ''
+    for chunk in stream:
+        response += chunk['message']['content']
+    return response.strip()  # Strip any leading/trailing whitespace
+
+# Open the file in write mode
+with open("generated_sentences.txt", "w") as file:
+    # Loop to generate 100 sentences one by one
+    for i in range(100):
+        # Get the sentence using the function
+        sentence = get_ollama_response(i + 1)
+
+        # Write the sentence to the file on a new line
+        file.write(sentence + "\n")
+
+        # Print the sentence to the console
+        print(f"Sentence {i+1}: {sentence}")
+
+print("File 'generated_sentences.txt' created with 100 sentences, each on a new line.")
diff --git a/NLP/textsummary.py b/NLP/textsummary.py
new file mode 100644
index 000000000..b11d3f872
--- /dev/null
+++ b/NLP/textsummary.py
@@ -0,0 +1,122 @@
+import os
+import nltk
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from collections import Counter
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+
+MODEL_NAME = 'all-MiniLM-L6-v2'
+MODEL_FOLDER = 'model'
+NLTK_DATA_FOLDER = os.path.join(MODEL_FOLDER, 'nltk_data')
+
+def load_or_download_model():
+    model_path = os.path.join(MODEL_FOLDER, MODEL_NAME)
+    if os.path.exists(model_path):
+        print(f"Loading model from {model_path}")
+        return SentenceTransformer(model_path)
+    else:
+        print(f"Downloading model {MODEL_NAME}")
+        model = SentenceTransformer(MODEL_NAME)
+        os.makedirs(MODEL_FOLDER, exist_ok=True)
+        model.save(model_path)
+        print(f"Model saved to {model_path}")
+        return model
+
+def download_nltk_resources():
+    nltk.data.path.append(NLTK_DATA_FOLDER)
+    os.makedirs(NLTK_DATA_FOLDER, exist_ok=True)
+
+    resources = [('punkt', 'tokenizers'), ('stopwords', 'corpora')]
+    for resource, folder in resources:
+        try:
+            nltk.data.find(f'{folder}/{resource}')
+            print(f"{resource} is already downloaded.")
+        except LookupError:
+            print(f"Downloading {resource}...")
+            nltk.download(resource, download_dir=NLTK_DATA_FOLDER, quiet=True)
+
+def extract_keywords(text, model, top_n=10):
+    # Tokenize the text
+    words = word_tokenize(text.lower())
+
+    # Remove stopwords and non-alphanumeric tokens
+    stop_words = set(stopwords.words('english'))
+    filtered_words = [word for word in words if word.isalnum() and word not in stop_words]
+
+    # Count word frequencies
+    word_freq = Counter(filtered_words)
+
+    # Get unique words
+    unique_words = list(set(filtered_words))
+
+    # Get word embeddings
+    word_embeddings = model.encode(unique_words)
+
+    # Calculate importance scores
+    importance_scores = np.mean(word_embeddings, axis=1)
+
+    # Combine frequency and importance
+    combined_scores = [(word, word_freq[word] * importance_scores[i]) for i, word in enumerate(unique_words)]
+
+    # Sort by combined score and get top N
+    top_keywords = sorted(combined_scores, key=lambda x: x[1], reverse=True)[:top_n]
+
+    return [word for word, _ in top_keywords]
+
+def summarize_text(text, model, num_sentences=3):
+    # Split the text into sentences
+    sentences = sent_tokenize(text)
+
+    # Encode sentences
+    sentence_embeddings = model.encode(sentences)
+
+    # Calculate similarity matrix
+    similarity_matrix = cosine_similarity(sentence_embeddings)
+
+    # Calculate sentence scores
+    sentence_scores = np.sum(similarity_matrix, axis=1)
+
+    # Get top sentences
+    top_sentence_indices = np.argsort(sentence_scores)[-num_sentences:]
+    top_sentences = [sentences[i] for i in sorted(top_sentence_indices)]
+
+    return ' '.join(top_sentences)
+
+def main():
+    # Ensure NLTK resources are downloaded
+    download_nltk_resources()
+
+    # Load or download the model
+    model = load_or_download_model()
+
+    # Read input file
+    input_file = 'input.txt'
+    if not os.path.exists(input_file):
+        print(f"Error: {input_file} not found. Please ensure the file exists in the current directory.")
+        return
+
+    try:
+        with open(input_file, 'r', encoding='utf-8') as file:
+            text = file.read()
+    except Exception as e:
+        print(f"Error reading {input_file}: {str(e)}")
+        return
+
+    # Extract keywords
+    keywords = extract_keywords(text, model)
+
+    # Generate summary
+    summary = summarize_text(text, model)
+
+    # Print results
+    print("Keywords:")
+    for i, word in enumerate(keywords, 1):
+        print(f"{i}. {word}")
+
+    print("\nSummary:")
+    print(summary)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file