Commit 0728586

1 parent bffb434

604 files changed: +170, -15922 lines changed. (Large commit; some content hidden by default.)

Backend/NotesToText.py

Lines changed: 62 additions & 52 deletions

@@ -6,6 +6,8 @@
 import boto3
 from botocore.exceptions import NoCredentialsError
 from io import BytesIO
+import tempfile
+
 
 s3_access_key = "AKIAZTHHIOR4CN6UXO6N"
 s3_secret_access_key = "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"
@@ -34,91 +36,99 @@ def download_files_from_s3(bucket_name, prefix, local_directory):
 
 
 
-def pdf_to_images(pdf_path, output_folder):
+def pdf_to_images_from_bytes(pdf_content, output_folder, file_name):
+    s3_bucket_name = 'learnmateai'
+
+    # Save PDF content to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_filename = temp_file.name
+        temp_file.write(pdf_content)
 
     # Convert PDF pages to images
-    images = convert_from_path(pdf_path)
-
-    # Create the output folder if it doesn't exist
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
+    images = convert_from_path(temp_filename)
+
+    # Remove the temporary file
+    os.remove(temp_filename)
 
-    # Save each image in the specified output folder
+    # Save each image to S3
     image_paths = []
     for i, image in enumerate(images):
-        image_path = os.path.join(output_folder, f'page_{i+1}.jpeg')
-        image.save(image_path, 'JPEG')
-        image_paths.append(image_path)
+        image_bytes = BytesIO()
+        image.save(image_bytes, 'JPEG')
+        image_bytes.seek(0)
+
+        image_key = f'{output_folder}/page_{i+1}.jpeg'
+        s3.put_object(Body=image_bytes, Bucket=s3_bucket_name, Key=image_key)
+
+        image_paths.append(image_key)
         noImg = i+1
-    return image_paths,noImg
+    return image_paths, noImg
 
 @router.get("/notestotext")
 def NotesToText_handler():
     substring_to_remove = "Scanned by CamScanner"
+    s3_bucket_name = 'learnmateai'
+
     prefix = 'notes_pdf/'
-    local_directory = 'Local_Storage/notes_pdf'
-
-    # Create the local directory if it doesn't exist
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Download files from S3
-    download_files_from_s3(s3_bucket_name, prefix, local_directory)
 
-    folder_path = "Local_Storage/notes_pdf"
-
-    # Get all files in the folder
-    mod_files = os.listdir(folder_path)
-
-    # Print the file names
-    for file_name in mod_files:
-        file_name=file_name.split(".")[0]
-
+    # List files in the S3 bucket with the specified prefix
+    response = s3.list_objects_v2(Bucket=s3_bucket_name, Prefix=prefix)
+
+    # Extract the file names from the response
+    files = [obj['Key'] for obj in response.get('Contents', [])]
+
+    # Process each file
+    for file_name in files:
+        file_name = os.path.splitext(os.path.basename(file_name))[0]
+
         print(f"converting {file_name}....")
-        pdf_path = f'Local_Storage/notes_pdf/{file_name}.pdf'
+
+        # Download the PDF file from S3
+        pdf_object = s3.get_object(Bucket=s3_bucket_name, Key=f'{prefix}{file_name}.pdf')
+        pdf_content = pdf_object['Body'].read()
+
+        # Create the output folder in S3
         output_folder = f'images/Notes_images/{file_name}'
+        s3.put_object(Body='', Bucket=s3_bucket_name, Key=f'{output_folder}/')
 
-        # Convert the PDF to images and save them in the output folder
-        image_paths, noImg = pdf_to_images(pdf_path, output_folder)
+        # Convert the PDF to images and save them in the output folder in S3
+        image_paths, noImg = pdf_to_images_from_bytes(pdf_content, output_folder, file_name)
         print(noImg)
 
         os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'Files/client_file_vision.json'
         client = vision.ImageAnnotatorClient()
-
+
         # [START vision_python_migration_text_detection]
         image_contents = " "
 
        for j in range(noImg):
-            image_path = f'images/Notes_images/{file_name}/page_{j+1}.jpeg'
-            with open(image_path, 'rb') as image_file:
-                content = image_file.read()
-            image = vision.Image(content=content)
-            response = client.text_detection(image=image)
-            texts = response.text_annotations[0]
-            text = str(texts.description)
-            image_contents += text.replace(substring_to_remove, "")
-
-
-        output_file = f"Local_Storage/notes_txt/{file_name}.txt"
-        # Write the text content to the output file
-        with open(output_file, "w",encoding="utf-8") as file:
-            file.write(image_contents)
-        print(f"{file_name} completed")
+            image_path = f'{output_folder}/page_{j+1}.jpeg'
+
+            # Download the image from S3
+            image_object = s3.get_object(Bucket=s3_bucket_name, Key=image_path)
+            image_content = image_object['Body'].read()
+
+            content = vision.Image(content=image_content)
+            response = client.text_detection(image=content)
+            texts = response.text_annotations[0]
+            text = str(texts.description)
+            image_contents += text.replace(substring_to_remove, "")
 
         s3_key = f'notes_txt/{file_name}.txt'
-
-        # Write the text content to the output file
+
+        # Upload the text content to S3
         s3.put_object(
             Body=image_contents,
             Bucket=s3_bucket_name,
             Key=s3_key
         )
-
+
        if response.error.message:
            raise Exception(
-                '{}\nFor more info on error messages, check: '
-                'https://cloud.google.com/apis/design/errors'.format(
-                    response.error.message))
+                '{}\nFor more info on error messages, check: '
+                'https://cloud.google.com/apis/design/errors'.format(
+                    response.error.message))
 
 
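A side note on the temp-file round-trip above: pdf2image (the library behind convert_from_path) also exposes convert_from_bytes, which renders pages straight from the downloaded bytes and would make the NamedTemporaryFile/os.remove pair unnecessary. A minimal sketch of that alternative, assuming pdf2image is installed and Poppler is on the PATH (this is a possible simplification, not what the commit does):

    from pdf2image import convert_from_bytes

    def pdf_pages_from_bytes(pdf_content):
        # Renders every page of the PDF to a PIL Image directly from raw bytes,
        # skipping the temporary-file round-trip used in pdf_to_images_from_bytes
        return convert_from_bytes(pdf_content)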


Backend/new_sorter.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+from fastapi import APIRouter
+import io
+import re
+import chardet
+import numpy as np
+import tensorflow as tf
+import tensorflow_hub as hub
+from sklearnex import patch_sklearn
+patch_sklearn()
+from sklearn.cluster import KMeans
+import boto3
+
+# Create an instance of APIRouter
+test = APIRouter()
+
+def extract_questions_from_file(file_content):
+    pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
+    matches = re.findall(pattern, file_content)
+    questions = [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
+    return questions
+
+def extract_questions_from_s3(bucket, key):
+    s3 = boto3.client('s3')
+    response = s3.get_object(Bucket=bucket, Key=key)
+    content = response['Body'].read().decode('utf-8')
+    return extract_questions_from_file(content)
+
+def cluster_questions(questions, num_clusters):
+    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
+    embed = hub.load(module_url)
+    embeddings = embed(questions).numpy()
+    kmeans = KMeans(n_clusters=num_clusters)
+    kmeans.fit(embeddings)
+    y_kmeans = kmeans.predict(embeddings)
+
+    # Find repeated questions
+    repeated_indices = []
+    for i in range(len(questions)):
+        if questions[i] in questions[:i]:
+            repeated_indices.append(i)
+
+    return y_kmeans, repeated_indices
+
+@test.get("/api1")
+def api1_handler():
+    s3_bucket = 'learnmateai'
+    s3_key_prefix = 'pyqs_text/'
+    num_clusters = 4
+
+    s3 = boto3.client('s3')
+    questions = []
+    response = s3.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key_prefix)
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.txt'):
+            response = s3.get_object(Bucket=s3_bucket, Key=key)
+            content = response['Body'].read().decode('utf-8')
+            questions += extract_questions_from_file(content)
+
+    labels, repeated_indices = cluster_questions(questions, num_clusters)
+
+    print("Clustering questions")
+    for i in range(num_clusters):
+        cluster_questions = np.array(questions)[np.where(labels == i)[0]]
+        print(f"Module {i+1}:")
+        for question in cluster_questions:
+            print(f" - {question}")
+
+    # Print repeated questions separately
+    if repeated_indices:
+        print("Repeated Questions:")
+        for index in repeated_indices:
+            print(f" - {questions[index]}")
+
+    # Save the results to S3
+    output_key = 'Generated_Files/cluster_questions.txt'
+    output_content = ""
+    for i in range(num_clusters):
+        cluster_questions = np.array(questions)[np.where(labels == i)[0]]
+        output_content += f"Module {i+1}:\n"
+        for question in cluster_questions:
+            output_content += f" - {question}\n"
+        output_content += "\n"
+
+    if repeated_indices:
+        output_content += "Repeated Questions:\n"
+        for index in repeated_indices:
+            output_content += f" - {questions[index]}\n"
+
+    s3.put_object(Body=output_content.encode('utf-8'), Bucket=s3_bucket, Key=output_key)
+
+    return {"message": "Previous Year question papers sorted to modules"}
+
+@test.post("/api1")
+def api1_post_handler():
+    # Add your logic here
+    return {"message": "POST request received on API 1"}

Backend/updated_sorter.py

Lines changed: 3 additions & 3 deletions

@@ -36,7 +36,7 @@ def extract_questions_from_directory(directory):
         questions += extract_questions_from_file(filepath)
     return questions
 
-def cluster_questions_1(questions, num_clusters, syllabus_file):
+def cluster_questions_1(questions, num_clusters):
     module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
 
     embed = hub.load(module_url)
@@ -58,8 +58,8 @@ def cluster_questions_1(questions, num_clusters, syllabus_file):
 def api1_handler():
     questions = extract_questions_from_directory('Local_Storage/pyqs_text')
     num_clusters = 4
-    syllabus_file = 'Local_Storage/syllabus.txt'
-    labels, repeated_indices = cluster_questions_1(questions, num_clusters, syllabus_file)
+
+    labels, repeated_indices = cluster_questions_1(questions, num_clusters)
 
     print("Clustering questions")
     for i in range(num_clusters):
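One behavioral note on the clustering that both sorters share: KMeans seeds its centroids randomly, so the module assignments can differ between runs. A small sketch of a reproducible configuration, offered as an assumption on top of this commit rather than part of it:

    from sklearn.cluster import KMeans

    # Fixing random_state makes the module assignment deterministic across runs;
    # n_init controls how many seedings are tried before keeping the best one
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)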

Dockerfile

Lines changed: 8 additions & 0 deletions

@@ -1,7 +1,10 @@
 FROM amazon/aws-lambda-python:3.10
 
+# Set the user as root
+USER root
 # Upgrade pip
 RUN python3 -m pip install --upgrade pip
+RUN yum install -y poppler-utils
 # Install the function's dependencies using file requirements.txt
 # from your project folder.
 COPY Backend ./Backend
@@ -13,6 +16,11 @@ COPY Local_Storage ./Local_Storage
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt --target "${LAMBDA_TASK_ROOT}"
 
+RUN chmod a+rwx Local_Storage
+RUN chmod a+rwx images
+
+
+
 # Copy function code
 COPY app.py ${LAMBDA_TASK_ROOT}
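The yum install of poppler-utils supplies the pdftoppm binary that pdf2image shells out to, which the Lambda base image does not include. The amazon/aws-lambda-python base images also bundle the Runtime Interface Emulator, so the container can be exercised locally before deploying; a quick check along these lines (the image tag learnmateai is hypothetical):

    import requests

    # After: docker build -t learnmateai . && docker run -p 9000:8080 learnmateai
    # the emulator accepts invocations at this fixed path
    resp = requests.post(
        "http://localhost:9000/2015-03-31/functions/function/invocations",
        json={},
    )
    print(resp.status_code, resp.text)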


Local_Storage/notes_pdf/SEM3.pdf

-198 KB (binary file, not shown)
