-# pyqsorter, sorts a set of pyqs into modules
from fastapi import APIRouter
import os
import re
import numpy as np
import chardet
import tensorflow as tf
import tensorflow_hub as hub
from sklearnex import patch_sklearn
+import boto3
+from botocore.exceptions import NoCredentialsError
patch_sklearn()
from sklearn.cluster import KMeans
-
-
-
+import tempfile
+from io import BytesIO

# Create an instance of APIRouter
test = APIRouter()

+# AWS S3 configuration
+AWS_ACCESS_KEY_ID = 'AKIAZTHHIOR4CN6UXO6N'
+AWS_SECRET_ACCESS_KEY = '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'
+AWS_BUCKET_NAME = 'learnmateai'
+AWS_BUCKET_FOLDER = 'pyqs_txt'
+
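+# Single shared S3 client, reused both to download the PYQ files and to
+# upload the generated report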
+s3_client = boto3.client(
+    's3',
+    aws_access_key_id=AWS_ACCESS_KEY_ID,
+    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
+)
+
def extract_questions_from_file(filepath):
-    with open(filepath, 'rb') as f:
-        result = chardet.detect(f.read())
-    encoding = result['encoding']
-    with open(filepath, encoding=encoding) as f:
-        content = f.read()
-    pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
-    matches = re.findall(pattern, content)
-    questions = [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
+    questions = []
+    with open(filepath, 'rb') as file:
+        content = file.read()
+    encoding = chardet.detect(content)['encoding']
+    decoded_content = content.decode(encoding, errors='ignore')
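+    # Heuristic question detector: capture any span that starts at an
+    # interrogative word and runs to a terminating ?, ! or . on the same line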
+    questions = re.findall(r'\b(?:what|where|why|how|when|which|who|whom|whose)\b.*[?!.]', decoded_content, re.IGNORECASE)
    return questions

-
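# Downloads every object under the given S3 prefix to temp1/pyqs_txt,
# then extracts questions from the local copies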
def extract_questions_from_directory(directory):
+    paginator = s3_client.get_paginator('list_objects_v2')
+    operation_parameters = {'Bucket': AWS_BUCKET_NAME, 'Prefix': directory}
+
+    page_iterator = paginator.paginate(**operation_parameters)
+
+    try:
+        if not os.path.exists("temp1/pyqs_txt"):
+            os.makedirs("temp1/pyqs_txt")  # Create directory if it doesn't exist
+        for page in page_iterator:
+            if 'Contents' in page:
+                for item in page['Contents']:
+                    key = item['Key']
+                    local_file_path = os.path.join("temp1/pyqs_txt", os.path.basename(key))  # Use basename of key as local file name
+                    try:
+                        s3_client.download_file(AWS_BUCKET_NAME, key, local_file_path)
+                        print(f"Downloaded {key} to {local_file_path}")
+                    except Exception as e:
+                        print(f"Failed to download {key}: {str(e)}")
+    except Exception as e:
+        print(f"An error occurred during pagination: {str(e)}")
+        return []
+
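+    # Extract questions from each downloaded file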
    questions = []
-    for filename in os.listdir(directory):
-        filepath = os.path.join(directory, filename)
+    for filename in os.listdir("temp1/pyqs_txt"):
+        filepath = os.path.join("temp1/pyqs_txt", filename)
        if os.path.isfile(filepath):
            questions += extract_questions_from_file(filepath)
+
    return questions

-def cluster_questions_1(questions, num_clusters):
+def cluster_questions(questions, num_clusters):
+    if len(questions) == 0:
+        return None, []
+
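+    # Universal Sentence Encoder used to embed the questions before KMeans clustering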
    module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"

    embed = hub.load(module_url)
@@ -53,17 +88,17 @@ def cluster_questions_1(questions, num_clusters):

    return y_kmeans, repeated_indices

-
@test.get("/api1")
def api1_handler():
-    questions = extract_questions_from_directory('Local_Storage/pyqs_text')
+    questions = extract_questions_from_directory(AWS_BUCKET_FOLDER)
    num_clusters = 4
-
-    labels, repeated_indices = cluster_questions_1(questions, num_clusters)
+
+    labels, repeated_indices = cluster_questions(questions, num_clusters)

    print("Clustering questions")
    for i in range(num_clusters):
-        cluster_questions = np.array(questions)[np.where(labels == i)[0]]
+        cluster_indices = np.where(labels == i)[0]
+        cluster_questions = np.array(questions)[cluster_indices]
        print(f"Module {i + 1}:")
        for question in cluster_questions:
            print(f" - {question}")
@@ -74,21 +109,33 @@ def api1_handler():
    for index in repeated_indices:
        print(f" - {questions[index]}")

-    with open('Local_Storage/Generated_Files/cluster_questions.txt', 'w') as f:
+    try:
+        # Write cluster questions to S3
+        cluster_questions_content = ""
        for i in range(num_clusters):
-            cluster_questions = np.array(questions)[np.where(labels == i)[0]]
-            f.write(f"Module {i + 1}:\n")
+            cluster_indices = np.where(labels == i)[0]
+            cluster_questions = np.array(questions)[cluster_indices]
+            cluster_questions_content += f"Module {i + 1}:\n"
            for question in cluster_questions:
-                f.write(f" - {question}\n")
-            f.write("\n")
+                cluster_questions_content += f" - {question}\n"
+            cluster_questions_content += "\n"

-        # Write repeated questions to file
+        # Write repeated questions to S3
        if repeated_indices:
-            f.write("Repeated Questions:\n")
+            cluster_questions_content += "Repeated Questions:\n"
            for index in repeated_indices:
-                f.write(f" - {questions[index]}\n")
+                cluster_questions_content += f" - {questions[index]}\n"
+
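+        # Upload the assembled report to the bucket as a single object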
+        s3_client.put_object(
+            Body=cluster_questions_content.encode(),
+            Bucket=AWS_BUCKET_NAME,
+            Key='Generated_Files/cluster_questions.txt'
+        )
+
+        return {"message": "Previous Year question papers sorted into modules"}
+    except NoCredentialsError:
+        return {"message": "Failed to write to S3. Credentials not available."}

-    return {"message": "Previous Year question papers sorted to modules"}

@test.post("/api1")
def api1_post_handler():