-import tensorflow_hub as hub
-from sklearn.cluster import KMeans
-import numpy as np
-from tqdm import tqdm
-
-# Preprocessing functions
-def preprocess_text(text):
-    sentences = text.split('\n')  # Split text into sentences
-    return sentences
-
-def extract_chunks(text):
-    # Preprocess the input text
-    sentences = preprocess_text(text)
-
-    # Show progress bar while loading the model
-    with tqdm(total=1, desc="Loading model") as pbar:
-        embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
-        pbar.update(1)
-
-    # Generate sentence embeddings
-    sentence_embeddings = embed(sentences)
-
-    # Determine the optimal number of clusters using the Elbow Method
-    distortions = []
-    K = range(1, 10)  # Set the range of possible clusters
-    for k in K:
-        kmeans = KMeans(n_clusters=k)
-        kmeans.fit(sentence_embeddings)
-        distortions.append(kmeans.inertia_)
-
-    # Find the "elbow" point in the distortion plot
-    elbow_index = np.argmin(np.diff(distortions)) + 1
-    num_clusters = K[elbow_index]
-
-    # Perform clustering with the determined number of clusters
-    kmeans = KMeans(n_clusters=num_clusters)
-    kmeans.fit(sentence_embeddings)
-
-    # Retrieve topic-wise chunks with subsections
-    chunks = []
-    for cluster_index in range(num_clusters):
-        chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
-        chunks.append({"topic": f"Topic {cluster_index+1}", "subsections": chunk_sentences})
-
-    return chunks
-
-# Example usage
-text = "This is an example text. It contains multiple sentences.\nEach sentence represents a subsection."
-result = extract_chunks(text)
-print(result)
+from fastapi import APIRouter
+import boto3
+import openai
+import time
+
+s3_access_key = "AKIAZTHHIOR4JJ5HLTUB"
+s3_secret_access_key = "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"
+s3_bucket_name = "learnmateai"
+
+s3 = boto3.client("s3", aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_access_key)
+
+# Set up OpenAI API credentials
+openai.api_key = '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'
+
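+# Helper: split the model's "TOPIC:"-delimited output into one cleaned chunk per topic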
+def batch_text(input_text, delimiter="TOPIC:"):
+    batches = input_text.split(delimiter)
+    cleaned_batches = [batch.strip() for batch in batches if batch.strip()]
+    return cleaned_batches
+
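+# Helper: upload a text object to s3://<bucket_name>/<folder_name>/<file_name>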
+def upload_to_s3(bucket_name, folder_name, file_name, content):
+    # Reuse the module-level client, which already carries the credentials configured above
+    key = folder_name + '/' + file_name
+    s3.put_object(Body=content, Bucket=bucket_name, Key=key)
+
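+# FastAPI router exposing the note-analysis endpoint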
+app = APIRouter()
+
+@app.get("/process_files")
+def process_files():
+    # Read one file from S3, summarise it topic-wise, and upload the results
+    def process_file(file_name):
+        # Read file from S3
+        response = s3.get_object(Bucket='learnmateai', Key='notes_txt/' + file_name)
+        file_content = response['Body'].read().decode('utf-8')
+
+        # Split file content into batches (adjust batch size as needed)
+        batch_size = 3000
+        batches = [file_content[i:i+batch_size] for i in range(0, len(file_content), batch_size)]
+
+        # Process batches, collecting the model's topic-wise notes for every batch
+        important_topics_parts = []
+        for batch in batches:
+            # Send batch to OpenAI API
+            response = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": f"Divide the text topic-wise (it should look like TOPIC: notes). The notes should be very brief and written so that the full text can be recreated from them:\n\n{batch}\n\n"
+                    }
+                ]
+            )
+
+            important_topics_parts.append(response.choices[0].message.content)
+            # Add a delay of 20 seconds to stay under the API rate limit
+            time.sleep(20)
+
+        # Combine the per-batch responses and split them into topic chunks
+        important_topics = "\n".join(important_topics_parts)
+        text_batches = batch_text(important_topics)
+
+        bucket_name = 'learnmateai'
+        file = file_name.split(".")[0]
+        folder_name = f'Analysed_Notes/{file}'
+
+        # Upload each topic chunk as its own text file, named after the chunk's first line
+        for i, batch in enumerate(text_batches):
+            lines = batch.split('\n')
+            file_name1 = lines[0].strip().replace(" ", "_") + '.txt'
+            content = '\n'.join(lines[1:]).strip()
+            upload_to_s3(bucket_name, folder_name, file_name1, content)
+
+            # Print uploaded file information
+            print(f"File '{file_name1}' uploaded to '{bucket_name}/{folder_name}'")
+
+    # Get the list of files in the "notes_txt" folder
+    response = s3.list_objects_v2(Bucket='learnmateai', Prefix='notes_txt/')
+
+    # Process each file
+    for file in response.get('Contents', []):
+        file_name = file['Key'].split('/')[-1]
+        if not file_name:
+            # Skip the folder placeholder object returned for the prefix itself
+            continue
+        process_file(file_name)
+
+    return {"message": "File processing completed."}
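
The new module only defines an APIRouter; the commit does not show how it is mounted. As a rough sketch (assuming a main application module and the module name `process_notes`, neither of which appears in this diff), the router would be wired into a FastAPI app roughly like this:

    # main.py (hypothetical) - include the router and serve it
    from fastapi import FastAPI
    import process_notes  # assumed module name for the file added above

    app = FastAPI()
    app.include_router(process_notes.app)  # the module exposes its APIRouter as `app`

    # A GET request to /process_files then triggers the S3 + OpenAI pipeline defined above.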