1
+ # pyqsorter: sorts a set of previous-year questions (PYQs) into modules
1
2
from fastapi import APIRouter
2
- import io
3
+ import os
3
4
import re
4
5
import chardet
5
6
import numpy as np
8
9
from sklearnex import patch_sklearn
9
10
patch_sklearn ()
10
11
from sklearn .cluster import KMeans
11
- import boto3
12
+
13
+
14
+
12
15
13
16
# Create an instance of APIRouter
14
17
test = APIRouter ()
15
18
16
def extract_questions_from_file(filepath):
    """Extract exam questions from a single text file.

    The file's bytes are read once, their encoding is sniffed with chardet,
    and question blocks (Roman-numeral "I. ..." items or "(a). ..." items,
    together with their indented sub-part lines) are pulled out with a regex.

    Args:
        filepath: Path to a text file containing past-year questions.

    Returns:
        list[str]: One flattened string per question, with sub-part lines
        joined onto the main line by single spaces.
    """
    # Read the bytes once and reuse them for both detection and decoding,
    # instead of opening and fully reading the same file twice.
    with open(filepath, 'rb') as f:
        raw = f.read()
    # chardet may return None when it cannot identify the encoding;
    # fall back to utf-8 explicitly rather than the platform default.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'
    content = raw.decode(encoding)
    # Match "I. ..." / "(a). ..." question lines plus any indented
    # "(x). ..." continuation lines that follow them.
    pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
    matches = re.findall(pattern, content)
    # Collapse each question's sub-part lines into a single string.
    return [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
21
29
22
- def extract_questions_from_s3 (bucket , key ):
23
- s3 = boto3 .client ('s3' )
24
- response = s3 .get_object (Bucket = bucket , Key = key )
25
- content = response ['Body' ].read ().decode ('utf-8' )
26
- return extract_questions_from_file (content )
27
30
28
- def cluster_questions (questions , num_clusters ):
31
def extract_questions_from_directory(directory):
    """Collect questions from every regular file directly inside *directory*.

    Args:
        directory: Path whose immediate entries are scanned; subdirectories
        and other non-file entries are skipped.

    Returns:
        list[str]: Questions from all files, concatenated in listing order.
    """
    collected = []
    for entry in os.listdir(directory):
        path = os.path.join(directory, entry)
        if not os.path.isfile(path):
            continue  # skip subdirectories and other non-file entries
        collected.extend(extract_questions_from_file(path))
    return collected
38
+
39
+ def cluster_questions_1 (questions , num_clusters ):
29
40
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
41
+
30
42
embed = hub .load (module_url )
31
43
embeddings = embed (questions ).numpy ()
32
44
kmeans = KMeans (n_clusters = num_clusters )
@@ -41,23 +53,13 @@ def cluster_questions(questions, num_clusters):
41
53
42
54
return y_kmeans , repeated_indices
43
55
56
+
44
57
@test .get ("/api1" )
45
58
def api1_handler ():
46
- s3_bucket = 'learnmateai'
47
- s3_key_prefix = 'pyqs_text/'
59
+ questions = extract_questions_from_directory ('Local_Storage/pyqs_text' )
48
60
num_clusters = 4
49
61
50
- s3 = boto3 .client ('s3' )
51
- questions = []
52
- response = s3 .list_objects_v2 (Bucket = s3_bucket , Prefix = s3_key_prefix )
53
- for obj in response ['Contents' ]:
54
- key = obj ['Key' ]
55
- if key .endswith ('.txt' ):
56
- response = s3 .get_object (Bucket = s3_bucket , Key = key )
57
- content = response ['Body' ].read ().decode ('utf-8' )
58
- questions += extract_questions_from_file (content )
59
-
60
- labels , repeated_indices = cluster_questions (questions , num_clusters )
62
+ labels , repeated_indices = cluster_questions_1 (questions , num_clusters )
61
63
62
64
print ("Clustering questions" )
63
65
for i in range (num_clusters ):
@@ -72,22 +74,19 @@ def api1_handler():
72
74
for index in repeated_indices :
73
75
print (f" - { questions [index ]} " )
74
76
75
- # Save the results to S3
76
- output_key = 'Generated_Files/cluster_questions.txt'
77
- output_content = ""
78
- for i in range (num_clusters ):
79
- cluster_questions = np .array (questions )[np .where (labels == i )[0 ]]
80
- output_content += f"Module { i + 1 } :\n "
81
- for question in cluster_questions :
82
- output_content += f" - { question } \n "
83
- output_content += "\n "
84
-
85
- if repeated_indices :
86
- output_content += "Repeated Questions:\n "
87
- for index in repeated_indices :
88
- output_content += f" - { questions [index ]} \n "
77
+ with open ('Local_Storage/Generated_Files/cluster_questions.txt' , 'w' ) as f :
78
+ for i in range (num_clusters ):
79
+ cluster_questions = np .array (questions )[np .where (labels == i )[0 ]]
80
+ f .write (f"Module { i + 1 } :\n " )
81
+ for question in cluster_questions :
82
+ f .write (f" - { question } \n " )
83
+ f .write ("\n " )
89
84
90
- s3 .put_object (Body = output_content .encode ('utf-8' ), Bucket = s3_bucket , Key = output_key )
85
+ # Write repeated questions to file
86
+ if repeated_indices :
87
+ f .write ("Repeated Questions:\n " )
88
+ for index in repeated_indices :
89
+ f .write (f" - { questions [index ]} \n " )
91
90
92
91
return {"message" : "Previous Year question papers sorted to modules" }
93
92
0 commit comments