Commit a9a0492

chunker over
1 parent 2e3dc06 commit a9a0492

File tree

4 files changed: +84 -51 lines changed


Backend/NotesChunker.py

Lines changed: 83 additions & 50 deletions
@@ -1,50 +1,83 @@
-import tensorflow_hub as hub
-from sklearn.cluster import KMeans
-import numpy as np
-from tqdm import tqdm
-
-# Preprocessing functions
-def preprocess_text(text):
-    sentences = text.split('\n') # Split text into sentences
-    return sentences
-
-def extract_chunks(text):
-    # Preprocess the input text
-    sentences = preprocess_text(text)
-
-    # Show progress bar while loading the model
-    with tqdm(total=1, desc="Loading model") as pbar:
-        embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")
-        pbar.update(1)
-
-    # Generate sentence embeddings
-    sentence_embeddings = embed(sentences)
-
-    # Determine the optimal number of clusters using the Elbow Method
-    distortions = []
-    K = range(1, 10) # Set the range of possible clusters
-    for k in K:
-        kmeans = KMeans(n_clusters=k)
-        kmeans.fit(sentence_embeddings)
-        distortions.append(kmeans.inertia_)
-
-    # Find the "elbow" point in the distortion plot
-    elbow_index = np.argmin(np.diff(distortions)) + 1
-    num_clusters = K[elbow_index]
-
-    # Perform clustering with the determined number of clusters
-    kmeans = KMeans(n_clusters=num_clusters)
-    kmeans.fit(sentence_embeddings)
-
-    # Retrieve topic-wise chunks with subsections
-    chunks = []
-    for cluster_index in range(num_clusters):
-        chunk_sentences = [sentences[i] for i in range(len(sentences)) if kmeans.labels_[i] == cluster_index]
-        chunks.append({"topic": f"Topic {cluster_index+1}", "subsections": chunk_sentences})
-
-    return chunks
-
-# Example usage
-text = "This is an example text. It contains multiple sentences.\nEach sentence represents a subsection."
-result = extract_chunks(text)
-print(result)
+from fastapi import APIRouter
+import boto3
+import openai
+import time
+
+s3_access_key = "AKIAZTHHIOR4JJ5HLTUB"
+s3_secret_access_key = "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"
+s3_bucket_name = "learnmateai"
+
+s3 = boto3.client("s3", aws_access_key_id=s3_access_key, aws_secret_access_key=s3_secret_access_key)
+
+# Set up OpenAI API credentials
+openai.api_key = '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'
+
+def batch_text(input_text, delimiter="TOPIC:"):
+    batches = input_text.split(delimiter)
+    cleaned_batches = [batch.strip() for batch in batches if batch.strip()]
+    return cleaned_batches
+
+def upload_to_s3(bucket_name, folder_name, file_name, content):
+    s3 = boto3.client('s3')
+    key = folder_name + '/' + file_name
+    s3.put_object(Body=content, Bucket=bucket_name, Key=key)
+
+app = APIRouter()
+
+@app.get("/process_files")
+def process_files():
+    # Function to read and process a file
+    def process_file(file_name):
+        # Read file from S3
+        response = s3.get_object(Bucket='learnmateai', Key='notes_txt/' + file_name)
+        file_content = response['Body'].read().decode('utf-8')
+
+        # Split file content into batches (adjust batch size as needed)
+        batch_size = 3000
+        batches = [file_content[i:i+batch_size] for i in range(0, len(file_content), batch_size)]
+
+        # Process batches
+        for batch in batches:
+            # Send batch to OpenAI API
+
+
+            response = openai.ChatCompletion.create(
+                model="gpt-3.5-turbo",
+                messages=[
+                    {
+                        "role": "user",
+                        "content": f"divide the text topic wise (it should look like TOPIC:notes) notes should very breif and be created in a way so that you will be able to recreate the full txt :\n\n{batch}\n\n"
+                    }
+                ]
+            )
+
+            important_topics = response.choices[0].message.content
+            #print(important_topics)
+            #return important_topics
+            # Add a delay of 20 seconds to handle rate limit
+            time.sleep(20)
+
+            text_batches = batch_text(important_topics)
+
+            bucket_name = 'learnmateai'
+            file=file_name.split(".")[0]
+            folder_name = f'Analysed_Notes/{file}'
+
+            for i, batch in enumerate(text_batches):
+                lines = batch.split('\n')
+                file_name1 = lines[0].strip().replace(" ", "_") + '.txt'
+                content = '\n'.join(lines[1:]).strip()
+                upload_to_s3(bucket_name, folder_name, file_name1, content)
+
+                # Print uploaded file information
+                print(f"File '{file_name1}' uploaded to '{bucket_name}/{folder_name}'")
+
+    # Get the list of files in the "notes_txt" folder
+    response = s3.list_objects_v2(Bucket='learnmateai', Prefix='notes_txt/')
+
+    # Process each file
+    for file in response['Contents']:
+        file_name = file['Key'].split('/')[-1]
+        process_file(file_name)
+
+    return {"message": "File processing completed."}
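For reference, the new chunker depends on the model returning notes delimited by the literal string "TOPIC:": batch_text splits the completion on that delimiter, and the upload loop turns the first line of each chunk into an S3 file name. Below is a minimal sketch of that path, assuming the repo root is on the import path; the sample completion text and topic names are made up for illustration and are not part of this commit.

# Illustrative sketch only -- the completion string below is fabricated to mimic
# the "TOPIC:notes" format the prompt asks gpt-3.5-turbo to produce.
from Backend.NotesChunker import batch_text  # assumes the repo root is on sys.path

sample_completion = (
    "TOPIC: Photosynthesis\nPlants convert light into chemical energy.\n"
    "TOPIC: Cell Respiration\nCells break glucose down to release that energy.\n"
)

for chunk in batch_text(sample_completion):                    # split on "TOPIC:", drop empties
    lines = chunk.split('\n')
    file_name = lines[0].strip().replace(" ", "_") + '.txt'    # e.g. "Photosynthesis.txt"
    body = '\n'.join(lines[1:]).strip()
    print(file_name, '->', body)                               # upload_to_s3 would be called here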
1.37 KB
Binary file not shown.

__pycache__/app.cpython-310.pyc

-8 Bytes
Binary file not shown.

app.py

Lines changed: 1 addition & 1 deletion
@@ -7,7 +7,7 @@
 
 #from Backend.Notes_Analyser import router as api4_router
 #from Backend.Narrator import router as api5_router
-from Backend.NotesChunker import router as chunker
+from Backend.NotesChunker import app as chunker
 #from Backend.NotesToText import router as notestotxt
 
 # import other API routers as needed
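Since NotesChunker.py now exposes an APIRouter named app (imported here as chunker), app.py presumably mounts it on the main FastAPI instance. A minimal sketch of that wiring, assuming the FastAPI object in app.py is also named app; the commit itself only shows the changed import line.

# Sketch of the expected wiring (only the import rename is visible in this diff).
from fastapi import FastAPI
from Backend.NotesChunker import app as chunker

app = FastAPI()
app.include_router(chunker)  # makes GET /process_files available on the main app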

0 commit comments
