Skip to content

Commit c8a7ff1

Browse files
committed
sorter
1 parent 0728586 commit c8a7ff1

File tree

5 files changed

+44
-43
lines changed

5 files changed

+44
-43
lines changed
3.15 KB
Binary file not shown.
-23 Bytes
Binary file not shown.

Backend/new_sorter.py

Lines changed: 39 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
1+
# pyqsorter: sorts a set of previous-year questions (PYQs) into modules
12
from fastapi import APIRouter
2-
import io
3+
import os
34
import re
45
import chardet
56
import numpy as np
@@ -8,25 +9,36 @@
89
from sklearnex import patch_sklearn
910
patch_sklearn()
1011
from sklearn.cluster import KMeans
11-
import boto3
12+
13+
14+
1215

1316
# Create an instance of APIRouter
1417
test = APIRouter()
1518

16-
def extract_questions_from_file(file_content):
17-
pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
18-
matches = re.findall(pattern, file_content)
19-
questions = [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
19+
def extract_questions_from_file(filepath):
    """Extract the questions found in a single text file.

    The file is read once as raw bytes, its encoding is guessed with
    ``chardet``, and the decoded text is scanned for entries that start with
    a Roman numeral (``I.``, ``IV.``, ...) or a parenthesised lowercase
    letter (``(a).``), optionally followed by indented ``(x). `` sub-parts
    on the following lines.  Sub-parts are folded into the parent question,
    separated by a single space.

    Args:
        filepath: Path of the text file to parse.

    Returns:
        A list of question strings in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(filepath, 'rb') as f:
        raw = f.read()

    # chardet reports None for empty or undetectable input; fall back to
    # UTF-8 explicitly instead of letting the platform locale decide.
    encoding = chardet.detect(raw)['encoding'] or 'utf-8'

    # Normalise line endings the way text-mode open() does, so the
    # "\n"-based patterns below behave the same for CRLF / CR files.
    content = raw.decode(encoding).replace('\r\n', '\n').replace('\r', '\n')

    pattern = r'((?:[IVX]+|\([a-z]\))\. .*(?:\n\s+\(\w\)\. .*)*)'
    matches = re.findall(pattern, content)
    questions = [re.sub(r'\n\s+\(\w\)\. ', ' ', match.strip()) for match in matches]
    return questions
2129

22-
def extract_questions_from_s3(bucket, key):
23-
s3 = boto3.client('s3')
24-
response = s3.get_object(Bucket=bucket, Key=key)
25-
content = response['Body'].read().decode('utf-8')
26-
return extract_questions_from_file(content)
2730

28-
def cluster_questions(questions, num_clusters):
31+
def extract_questions_from_directory(directory):
    """Collect the questions from every regular file directly in *directory*.

    Entries are visited in sorted name order: ``os.listdir`` returns names
    in arbitrary order, and a stable order keeps the resulting question list
    (and anything indexed by position in it) reproducible between runs.
    Subdirectories are skipped; the walk is not recursive.

    Args:
        directory: Path of the directory holding the question files.

    Returns:
        A single list with the questions of all files, file by file.

    Raises:
        OSError: If *directory* cannot be listed.
    """
    questions = []
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):
            questions.extend(extract_questions_from_file(filepath))
    return questions
38+
39+
def cluster_questions_1(questions, num_clusters):
2940
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
41+
3042
embed = hub.load(module_url)
3143
embeddings = embed(questions).numpy()
3244
kmeans = KMeans(n_clusters=num_clusters)
@@ -41,23 +53,13 @@ def cluster_questions(questions, num_clusters):
4153

4254
return y_kmeans, repeated_indices
4355

56+
4457
@test.get("/api1")
4558
def api1_handler():
46-
s3_bucket = 'learnmateai'
47-
s3_key_prefix = 'pyqs_text/'
59+
questions = extract_questions_from_directory('Local_Storage/pyqs_text')
4860
num_clusters = 4
4961

50-
s3 = boto3.client('s3')
51-
questions = []
52-
response = s3.list_objects_v2(Bucket=s3_bucket, Prefix=s3_key_prefix)
53-
for obj in response['Contents']:
54-
key = obj['Key']
55-
if key.endswith('.txt'):
56-
response = s3.get_object(Bucket=s3_bucket, Key=key)
57-
content = response['Body'].read().decode('utf-8')
58-
questions += extract_questions_from_file(content)
59-
60-
labels, repeated_indices = cluster_questions(questions, num_clusters)
62+
labels, repeated_indices = cluster_questions_1(questions, num_clusters)
6163

6264
print("Clustering questions")
6365
for i in range(num_clusters):
@@ -72,22 +74,19 @@ def api1_handler():
7274
for index in repeated_indices:
7375
print(f" - {questions[index]}")
7476

75-
# Save the results to S3
76-
output_key = 'Generated_Files/cluster_questions.txt'
77-
output_content = ""
78-
for i in range(num_clusters):
79-
cluster_questions = np.array(questions)[np.where(labels == i)[0]]
80-
output_content += f"Module {i+1}:\n"
81-
for question in cluster_questions:
82-
output_content += f" - {question}\n"
83-
output_content += "\n"
84-
85-
if repeated_indices:
86-
output_content += "Repeated Questions:\n"
87-
for index in repeated_indices:
88-
output_content += f" - {questions[index]}\n"
77+
with open('Local_Storage/Generated_Files/cluster_questions.txt', 'w') as f:
78+
for i in range(num_clusters):
79+
cluster_questions = np.array(questions)[np.where(labels == i)[0]]
80+
f.write(f"Module {i+1}:\n")
81+
for question in cluster_questions:
82+
f.write(f" - {question}\n")
83+
f.write("\n")
8984

90-
s3.put_object(Body=output_content.encode('utf-8'), Bucket=s3_bucket, Key=output_key)
85+
# Write repeated questions to file
86+
if repeated_indices:
87+
f.write("Repeated Questions:\n")
88+
for index in repeated_indices:
89+
f.write(f" - {questions[index]}\n")
9190

9291
return {"message": "Previous Year question papers sorted to modules"}
9392

__pycache__/app.cpython-310.pyc

69 Bytes
Binary file not shown.

app.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from fastapi.middleware.cors import CORSMiddleware
44
from mangum import Mangum
55
#from Backend.pyqsorter import router as api1_router
6-
from Backend.summariser import router_summariser as summariser
6+
#from Backend.summariser import router_summariser as summariser
7+
from Backend.new_sorter import test as sorter
78
#from Backend.Notes_Analyser import router as api4_router
89
#from Backend.Narrator import router as api5_router
9-
#from Backend.NotesToText import router as notestotext
10+
from Backend.NotesToText import router as notestotext
1011
# import other API routers as needed
1112

1213
origins = ["*"]
@@ -26,7 +27,8 @@
2627

2728
# Mount the API routers
2829
#app.include_router(api1_router)
29-
app.include_router(summariser)
30+
app.include_router(notestotext)
31+
app.include_router(sorter)
3032
#app.include_router(api4_router)
3133
#app.include_router(api6_router)
3234
#app.include_router(notestotext)

0 commit comments

Comments
 (0)