
Commit d721856

Add syllabus extraction feature, protect backend APIs, UI tweaks
1 parent fc4bac9 commit d721856

19 files changed: +1804 additions, -353 deletions


backend/giraph/syllabus_service.py

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@
"""
Syllabus Import Service
Extracts course contents and course outcomes from syllabus PDFs
using PyMuPDF4LLM for PDF parsing and an OpenAI-compatible LLM API for extraction.
"""

import json
import os
import re
import tempfile
from typing import Any, Dict

# PDF parsing
import pymupdf4llm
from openai import OpenAI

# LLM API configuration - can be overridden via environment variables
LLM_API_URL = os.environ.get("LLM_API_URL", "http://localhost:11434/v1")
LLM_MODEL = os.environ.get("LLM_MODEL", "mlx-community/Qwen3-4B-4bit")
LLM_API_KEY = os.environ.get("LLM_API_KEY", "ollama")

# Initialize OpenAI client
_client = None


def get_client() -> OpenAI:
    """Get or initialize the OpenAI client (singleton pattern)."""
    global _client
    if _client is None:
        _client = OpenAI(
            base_url=LLM_API_URL,
            api_key=LLM_API_KEY,
        )
    return _client


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file using PyMuPDF4LLM."""
    md_text = pymupdf4llm.to_markdown(pdf_path)
    return md_text


def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract text from PDF bytes using PyMuPDF4LLM."""
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_bytes)
        tmp_path = tmp.name

    try:
        md_text = pymupdf4llm.to_markdown(tmp_path)
        return md_text
    finally:
        os.unlink(tmp_path)


def build_extraction_prompt(syllabus_text: str) -> str:
    """Build the prompt for the LLM to extract course structure."""
    # Truncate text if too long to fit in context
    max_chars = 6000  # Leave room for prompt and response
    if len(syllabus_text) > max_chars:
        syllabus_text = syllabus_text[:max_chars] + "\n...[truncated]..."

    prompt = f"""Analyze the following course syllabus and extract structured information.

Extract these two things:

1. Course Contents (CC): These are ONLY the evaluation/assessment methods used to grade students.
Examples: Midterm Exam, Final Exam, Project, Homework, Quiz, Presentation, Lab Report, Attendance, Participation
DO NOT include weekly topics, lecture subjects, or chapter names as course contents.

2. Course Outcomes (CO): These are the learning outcomes, learning objectives, or competencies.
Look for sections titled: "Learning Outcomes", "Course Outcomes", "Learning Objectives", "Course Objectives", "Competencies"
Extract each item from the list exactly as written.

IMPORTANT RULES:
- Extract text EXACTLY as written in the syllabus, do not modify or rephrase

Output ONLY valid JSON in this exact format:
{{
"course_contents": [
"Exact name from syllabus"
],
"course_outcomes": [
"Exact outcome text from syllabus"
]
}}

SYLLABUS:
{syllabus_text}

Respond with ONLY the JSON object, no explanation or markdown:"""
    return prompt


def parse_llm_response(response_text: str) -> Dict[str, Any]:
    """Parse the LLM response to extract JSON."""
    # Try to find JSON in the response
    response_text = response_text.strip()

    # Remove thinking tags if present (Qwen3 may use these)
    response_text = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL)
    response_text = response_text.strip()

    # Try to extract JSON from the response
    json_match = re.search(r'\{[\s\S]*\}', response_text)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            pass

    # If no valid JSON found, return empty structure
    return {
        "course_contents": [],
        "course_outcomes": [],
        "parse_error": "Could not parse LLM response"
    }


def extract_syllabus_structure(syllabus_text: str) -> Dict[str, Any]:
    """
    Use the LLM API to extract course structure from syllabus text.
    Returns a dict with course_contents and course_outcomes.
    """
    prompt = build_extraction_prompt(syllabus_text)

    # Call LLM API using OpenAI client
    try:
        client = get_client()
        response = client.chat.completions.create(
            model=LLM_MODEL,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts structured data from academic syllabi. Always respond with valid JSON only."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=2048,
        )

        response_text = response.choices[0].message.content or ""

    except Exception as e:
        error_msg = str(e)
        if "Connection" in error_msg or "connect" in error_msg.lower():
            return {
                "course_contents": [],
                "course_outcomes": [],
                "parse_error": "The AI service is currently unavailable. Please try again later."
            }
        return {
            "course_contents": [],
            "course_outcomes": [],
            "parse_error": "Failed to analyze the syllabus. Please try again or contact support."
        }

    result = parse_llm_response(response_text)

    # Validate and clean up the result
    result = validate_and_clean_result(result)

    return result


def validate_and_clean_result(result: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and clean the extracted result."""
    # Ensure required keys exist
    if "course_contents" not in result:
        result["course_contents"] = []
    if "course_outcomes" not in result:
        result["course_outcomes"] = []

    # Clean course contents - handle both string arrays and object arrays
    cleaned_contents = []
    for i, cc in enumerate(result["course_contents"]):
        if isinstance(cc, str):
            # New format: just strings
            name = cc.strip()
            if name:
                cleaned_contents.append({
                    "name": name[:255],
                    "index": i,
                })
        elif isinstance(cc, dict) and "name" in cc:
            # Old format: objects with "name" key
            cleaned_contents.append({
                "name": str(cc["name"])[:255],
                "index": i,
            })
    result["course_contents"] = cleaned_contents

    # Clean course outcomes - handle both string arrays and object arrays
    cleaned_outcomes = []
    for i, co in enumerate(result["course_outcomes"]):
        if isinstance(co, str):
            # New format: just strings
            name = co.strip()
            if name:
                cleaned_outcomes.append({
                    "name": name[:255],
                    "index": i,
                })
        elif isinstance(co, dict) and "name" in co:
            # Old format: objects with "name" key
            cleaned_outcomes.append({
                "name": str(co["name"])[:255],
                "index": i,
            })
    result["course_outcomes"] = cleaned_outcomes

    return result


def process_syllabus(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Main entry point: process a PDF syllabus and extract structure.

    Args:
        pdf_bytes: Raw PDF file bytes

    Returns:
        Dict containing:
        - course_contents: List of {name, index}
        - course_outcomes: List of {name, index}
        - parse_error: Error message, present only when extraction failed
    """
    # Extract text from PDF
    syllabus_text = extract_text_from_pdf_bytes(pdf_bytes)

    # Extract structure using LLM
    result = extract_syllabus_structure(syllabus_text)

    return result
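
A minimal usage sketch for the new service module, not part of the commit. It assumes an OpenAI-compatible server is reachable at LLM_API_URL (for example, Ollama at http://localhost:11434/v1) and that the module is importable as giraph.syllabus_service, which is inferred from the file path rather than shown here.

# Hedged usage sketch: exercise process_syllabus locally on a PDF file.
# The import path and the presence of a running LLM endpoint are assumptions.
from giraph.syllabus_service import process_syllabus

with open("syllabus.pdf", "rb") as f:
    result = process_syllabus(f.read())

if result.get("parse_error"):
    print("Extraction failed:", result["parse_error"])
else:
    for cc in result["course_contents"]:
        print(f'CC {cc["index"]}: {cc["name"]}')
    for co in result["course_outcomes"]:
        print(f'CO {co["index"]}: {co["name"]}')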

backend/giraph/urls.py

Lines changed: 4 additions & 0 deletions
@@ -19,4 +19,8 @@
     path("create_program_outcome/", views.CreateProgramOutcome.as_view()),
     path("update_program_outcome/", views.UpdateProgramOutcome.as_view()),
     path("delete_program_outcome/", views.DeleteProgramOutcome.as_view()),
+    # Syllabus import endpoints
+    path("parse_syllabus/", views.ParseSyllabus.as_view()),
+    path("apply_syllabus_import/", views.ApplySyllabusImport.as_view()),
+    path("clear_course_nodes/", views.ClearCourseNodes.as_view()),
 ]
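
For reference, a hedged client-side sketch of calling the new parse_syllabus/ endpoint. The URL prefix these routes are mounted under, the authentication scheme (the commit message says the backend APIs are protected), and the multipart field name "file" are not shown in this diff and are assumptions here.

# Hypothetical client call, not part of the commit.
import requests

with open("syllabus.pdf", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/parse_syllabus/",
        files={"file": ("syllabus.pdf", f, "application/pdf")},
        headers={"Authorization": "Token <your-token>"},  # assumed auth scheme
    )

resp.raise_for_status()
print(resp.json())  # expected shape: {"course_contents": [...], "course_outcomes": [...]}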
