Commit 0728586

1 parent bffb434

604 files changed: +170, -15922 lines changed. (Large commit; some content hidden by default.)

Backend/NotesToText.py

Lines changed: 62 additions & 52 deletions

@@ -6,6 +6,8 @@
 import boto3
 from botocore.exceptions import NoCredentialsError
 from io import BytesIO
+import tempfile
+
 
 s3_access_key = "AKIAZTHHIOR4CN6UXO6N"
 s3_secret_access_key = "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"
@@ -34,91 +36,99 @@ def download_files_from_s3(bucket_name, prefix, local_directory):
 
 
 
-def pdf_to_images(pdf_path, output_folder):
+def pdf_to_images_from_bytes(pdf_content, output_folder, file_name):
+    s3_bucket_name = 'learnmateai'
+
+    # Save PDF content to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        temp_filename = temp_file.name
+        temp_file.write(pdf_content)
 
     # Convert PDF pages to images
-    images = convert_from_path(pdf_path)
-
-    # Create the output folder if it doesn't exist
-    if not os.path.exists(output_folder):
-        os.makedirs(output_folder)
+    images = convert_from_path(temp_filename)
+
+    # Remove the temporary file
+    os.remove(temp_filename)
 
-    # Save each image in the specified output folder
+    # Save each image to S3
     image_paths = []
     for i, image in enumerate(images):
-        image_path = os.path.join(output_folder, f'page_{i+1}.jpeg')
-        image.save(image_path, 'JPEG')
-        image_paths.append(image_path)
+        image_bytes = BytesIO()
+        image.save(image_bytes, 'JPEG')
+        image_bytes.seek(0)
+
+        image_key = f'{output_folder}/page_{i+1}.jpeg'
+        s3.put_object(Body=image_bytes, Bucket=s3_bucket_name, Key=image_key)
+
+        image_paths.append(image_key)
         noImg = i+1
-    return image_paths,noImg
+    return image_paths, noImg
 
 @router.get("/notestotext")
 def NotesToText_handler():
     substring_to_remove = "Scanned by CamScanner"
+    s3_bucket_name = 'learnmateai'
+
     prefix = 'notes_pdf/'
-    local_directory = 'Local_Storage/notes_pdf'
-
-    # Create the local directory if it doesn't exist
-    os.makedirs(local_directory, exist_ok=True)
-
-    # Download files from S3
-    download_files_from_s3(s3_bucket_name, prefix, local_directory)
 
-    folder_path = "Local_Storage/notes_pdf"
-
-    # Get all files in the folder
-    mod_files = os.listdir(folder_path)
-
-    # Print the file names
-    for file_name in mod_files:
-        file_name=file_name.split(".")[0]
-
+    # List files in the S3 bucket with the specified prefix
+    response = s3.list_objects_v2(Bucket=s3_bucket_name, Prefix=prefix)
+
+    # Extract the file names from the response
+    files = [obj['Key'] for obj in response.get('Contents', [])]
+
+    # Process each file
+    for file_name in files:
+        file_name = os.path.splitext(os.path.basename(file_name))[0]
+
         print(f"converting {file_name}....")
-        pdf_path = f'Local_Storage/notes_pdf/{file_name}.pdf'
+
+        # Download the PDF file from S3
+        pdf_object = s3.get_object(Bucket=s3_bucket_name, Key=f'{prefix}{file_name}.pdf')
+        pdf_content = pdf_object['Body'].read()
+
+        # Create the output folder in S3
         output_folder = f'images/Notes_images/{file_name}'
+        s3.put_object(Body='', Bucket=s3_bucket_name, Key=f'{output_folder}/')
 
-        # Convert the PDF to images and save them in the output folder
-        image_paths, noImg = pdf_to_images(pdf_path, output_folder)
+        # Convert the PDF to images and save them in the output folder in S3
+        image_paths, noImg = pdf_to_images_from_bytes(pdf_content, output_folder, file_name)
         print(noImg)
 
         os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'Files/client_file_vision.json'
         client = vision.ImageAnnotatorClient()
-
+
         # [START vision_python_migration_text_detection]
         image_contents = " "
 
        for j in range(noImg):
-            image_path = f'images/Notes_images/{file_name}/page_{j+1}.jpeg'
-            with open(image_path, 'rb') as image_file:
-                content = image_file.read()
-            image = vision.Image(content=content)
-            response = client.text_detection(image=image)
-            texts = response.text_annotations[0]
-            text = str(texts.description)
-            image_contents += text.replace(substring_to_remove, "")
-
-
-        output_file = f"Local_Storage/notes_txt/{file_name}.txt"
-        # Write the text content to the output file
-        with open(output_file, "w",encoding="utf-8") as file:
-            file.write(image_contents)
-        print(f"{file_name} completed")
+            image_path = f'{output_folder}/page_{j+1}.jpeg'
+
+            # Download the image from S3
+            image_object = s3.get_object(Bucket=s3_bucket_name, Key=image_path)
+            image_content = image_object['Body'].read()
+
+            content = vision.Image(content=image_content)
+            response = client.text_detection(image=content)
+            texts = response.text_annotations[0]
+            text = str(texts.description)
+            image_contents += text.replace(substring_to_remove, "")
 
         s3_key = f'notes_txt/{file_name}.txt'
-
-        # Write the text content to the output file
+
+        # Upload the text content to S3
         s3.put_object(
             Body=image_contents,
             Bucket=s3_bucket_name,
             Key=s3_key
         )
-
+
        if response.error.message:
            raise Exception(
-                '{}\nFor more info on error messages, check: '
-                'https://cloud.google.com/apis/design/errors'.format(
-                    response.error.message))
+                '{}\nFor more info on error messages, check: '
+                'https://cloud.google.com/apis/design/errors'.format(
+                    response.error.message))
 
 
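A side note on the temp-file round-trip above: pdf2image (the library behind convert_from_path) also exposes convert_from_bytes, which renders pages straight from the downloaded bytes and would make the NamedTemporaryFile/os.remove pair unnecessary. A minimal sketch of that alternative, assuming pdf2image is installed and Poppler is on the PATH (this is a possible simplification, not what the commit does):

    from pdf2image import convert_from_bytes

    def pdf_pages_from_bytes(pdf_content):
        # Renders every page of the PDF to a PIL Image directly from raw bytes,
        # skipping the temporary-file round-trip used in pdf_to_images_from_bytes
        return convert_from_bytes(pdf_content)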


Backend/new_sorter.py

Lines changed: 97 additions & 0 deletions

@@ -0,0 +1,97 @@
+from fastapi import APIRouter
+import io
+import re
+import chardet
+import numpy as np
+import tensorflow as tf
+import tensorflow_hub as hub
+from sklearnex import patch_sklearn
+patch_sklearn()
+from sklearn.cluster import KMeans
+import boto3
+
+# Create an instance of APIRouter
+test = APIRouter()
+
+def extract_questions_from_file(file_content):
+    pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
+    matches = re.findall(pattern, file_content)
+    questions = [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
+    return questions
+
+def extract_questions_from_s3(bucket, key):
+    s3 = boto3.client('s3')
+    response = s3.get_object(Bucket=bucket, Key=key)
+    content = response['Body'].read().decode('utf-8')
+    return extract_questions_from_file(content)
+
+def cluster_questions(questions, num_clusters):
+    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
+    embed = hub.load(module_url)
+    embeddings = embed(questions).numpy()
+    kmeans = KMeans(n_clusters=num_clusters)
+    kmeans.fit(embeddings)
+    y_kmeans = kmeans.predict(embeddings)
+
+    # Find repeated questions
+    repeated_indices = []
+    for i in range(len(questions)):
+        if questions[i] in questions[:i]:
+            repeated_indices.append(i)
+
+    return y_kmeans, repeated_indices
+
+@test.get("/api1")
+def api1_handler():
+    s3_bucket = 'learnmateai'
+    s3_key_prefix = 'pyqs_text/'
+    num_clusters = 4
+
+    s3 = boto3.client('s3')
+    questions = []
+    response = s3.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key_prefix)
+    for obj in response['Contents']:
+        key = obj['Key']
+        if key.endswith('.txt'):
+            response = s3.get_object(Bucket=s3_bucket, Key=key)
+            content = response['Body'].read().decode('utf-8')
+            questions += extract_questions_from_file(content)
+
+    labels, repeated_indices = cluster_questions(questions, num_clusters)
+
+    print("Clustering questions")
+    for i in range(num_clusters):
+        cluster_questions = np.array(questions)[np.where(labels == i)[0]]
+        print(f"Module {i+1}:")
+        for question in cluster_questions:
+            print(f" - {question}")
+
+    # Print repeated questions separately
+    if repeated_indices:
+        print("Repeated Questions:")
+        for index in repeated_indices:
+            print(f" - {questions[index]}")
+
+    # Save the results to S3
+    output_key = 'Generated_Files/cluster_questions.txt'
+    output_content = ""
+    for i in range(num_clusters):
+        cluster_questions = np.array(questions)[np.where(labels == i)[0]]
+        output_content += f"Module {i+1}:\n"
+        for question in cluster_questions:
+            output_content += f" - {question}\n"
+        output_content += "\n"
+
+    if repeated_indices:
+        output_content += "Repeated Questions:\n"
+        for index in repeated_indices:
+            output_content += f" - {questions[index]}\n"
+
+    s3.put_object(Body=output_content.encode('utf-8'), Bucket=s3_bucket, Key=output_key)
+
+    return {"message": "Previous Year question papers sorted to modules"}
+
+@test.post("/api1")
+def api1_post_handler():
+    # Add your logic here
+    return {"message": "POST request received on API 1"}

Backend/updated_sorter.py

Lines changed: 3 additions & 3 deletions

@@ -36,7 +36,7 @@ def extract_questions_from_directory(directory):
         questions += extract_questions_from_file(filepath)
     return questions
 
-def cluster_questions_1(questions, num_clusters, syllabus_file):
+def cluster_questions_1(questions, num_clusters):
     module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
 
     embed = hub.load(module_url)
@@ -58,8 +58,8 @@ def cluster_questions_1(questions, num_clusters, syllabus_file):
 def api1_handler():
     questions = extract_questions_from_directory('Local_Storage/pyqs_text')
     num_clusters = 4
-    syllabus_file = 'Local_Storage/syllabus.txt'
-    labels, repeated_indices = cluster_questions_1(questions, num_clusters, syllabus_file)
+
+    labels, repeated_indices = cluster_questions_1(questions, num_clusters)
 
     print("Clustering questions")
     for i in range(num_clusters):
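One behavioral note on the clustering that both sorters share: KMeans seeds its centroids randomly, so the module assignments can differ between runs. A small sketch of a reproducible configuration, offered as an assumption on top of this commit rather than part of it:

    from sklearn.cluster import KMeans

    # Fixing random_state makes the module assignment deterministic across runs;
    # n_init controls how many seedings are tried before keeping the best one
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)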

Dockerfile

Lines changed: 8 additions & 0 deletions

@@ -1,7 +1,10 @@
 FROM amazon/aws-lambda-python:3.10
 
+# Set the user as root
+USER root
 # Upgrade pip
 RUN python3 -m pip install --upgrade pip
+RUN yum install -y poppler-utils
 # Install the function's dependencies using file requirements.txt
 # from your project folder.
 COPY Backend ./Backend
@@ -13,6 +16,11 @@ COPY Local_Storage ./Local_Storage
 COPY requirements.txt .
 RUN pip3 install -r requirements.txt --target "${LAMBDA_TASK_ROOT}"
 
+RUN chmod a+rwx Local_Storage
+RUN chmod a+rwx images
+
+
+
 # Copy function code
 COPY app.py ${LAMBDA_TASK_ROOT}
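The yum install of poppler-utils supplies the pdftoppm binary that pdf2image shells out to, which the Lambda base image does not include. The amazon/aws-lambda-python base images also bundle the Runtime Interface Emulator, so the container can be exercised locally before deploying; a quick check along these lines (the image tag learnmateai is hypothetical):

    import requests

    # After: docker build -t learnmateai . && docker run -p 9000:8080 learnmateai
    # the emulator accepts invocations at this fixed path
    resp = requests.post(
        "http://localhost:9000/2015-03-31/functions/function/invocations",
        json={},
    )
    print(resp.status_code, resp.text)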


Local_Storage/notes_pdf/SEM3.pdf

-198 KB (binary file, not shown)
