|
| 1 | +""" |
| 2 | +Syllabus Import Service |
| 3 | +Extracts course contents, course outcomes from syllabus PDFs |
| 4 | +using PyMuPDF4LLM for PDF parsing and an OpenAI-compatible LLM API for extraction. |
| 5 | +""" |
| 6 | + |
| 7 | +import json |
| 8 | +import os |
| 9 | +import re |
| 10 | +import tempfile |
| 11 | +from typing import Any, Dict |
| 12 | + |
| 13 | +# PDF parsing |
| 14 | +import pymupdf4llm |
| 15 | +from openai import OpenAI |
| 16 | + |
# Connection settings for the OpenAI-compatible LLM endpoint. Each value is
# read from the environment variable of the same name and falls back to the
# literal default shown here when that variable is not set.
LLM_API_URL = os.getenv("LLM_API_URL", "http://localhost:11434/v1")
LLM_MODEL = os.getenv("LLM_MODEL", "mlx-community/Qwen3-4B-4bit")
LLM_API_KEY = os.getenv("LLM_API_KEY", "ollama")

# Shared client instance; stays None until get_client() creates it.
_client = None
| 24 | + |
def get_client() -> OpenAI:
    """Return the module-wide OpenAI client, creating it on first use.

    The client is built from LLM_API_URL and LLM_API_KEY and cached in the
    module-level ``_client`` so later calls reuse the same instance.
    """
    global _client
    if _client is not None:
        return _client

    _client = OpenAI(base_url=LLM_API_URL, api_key=LLM_API_KEY)
    return _client
| 34 | + |
| 35 | + |
def extract_text_from_pdf(pdf_path: str) -> str:
    """Convert the PDF at *pdf_path* to Markdown text via PyMuPDF4LLM."""
    return pymupdf4llm.to_markdown(pdf_path)
| 40 | + |
| 41 | + |
def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Convert in-memory PDF bytes to Markdown text via PyMuPDF4LLM.

    The bytes are spilled to a temporary ``.pdf`` file because the parser is
    handed a filesystem path. The file is always removed before returning,
    including when writing or parsing raises.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The Markdown text produced by ``pymupdf4llm.to_markdown``.
    """
    # delete=False so the file survives being closed; it is closed before the
    # parser opens it by path and removed explicitly in the finally clause.
    tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    try:
        # The write sits inside the try so a failed write (e.g. disk full)
        # no longer leaves the temporary file behind.
        with tmp:
            tmp.write(pdf_bytes)
        return pymupdf4llm.to_markdown(tmp.name)
    finally:
        os.unlink(tmp.name)
| 53 | + |
| 54 | + |
def build_extraction_prompt(syllabus_text: str, max_chars: int = 6000) -> str:
    """Build the LLM prompt that asks for the course structure as JSON.

    Args:
        syllabus_text: Text extracted from the syllabus.
        max_chars: Maximum number of syllabus characters embedded in the
            prompt; longer text is cut and marked as truncated. The default
            leaves room in the model context for the instructions and the
            response.

    Returns:
        The complete prompt string.
    """
    if len(syllabus_text) > max_chars:
        syllabus_text = syllabus_text[:max_chars] + "\n...[truncated]..."

    # The JSON example must itself be valid JSON (no trailing commas): the
    # model is told to follow it exactly and the reply is parsed with json.
    prompt = f"""Analyze the following course syllabus and extract structured information.

Extract these two things:

1. Course Contents (CC): These are ONLY the evaluation/assessment methods used to grade students.
   Examples: Midterm Exam, Final Exam, Project, Homework, Quiz, Presentation, Lab Report, Attendance, Participation
   DO NOT include weekly topics, lecture subjects, or chapter names as course contents.

2. Course Outcomes (CO): These are the learning outcomes, learning objectives, or competencies.
   Look for sections titled: "Learning Outcomes", "Course Outcomes", "Learning Objectives", "Course Objectives", "Competencies"
   Extract each item from the list exactly as written.

IMPORTANT RULES:
- Extract text EXACTLY as written in the syllabus, do not modify or rephrase

Output ONLY valid JSON in this exact format:
{{
  "course_contents": [
    "Exact name from syllabus"
  ],
  "course_outcomes": [
    "Exact outcome text from syllabus"
  ]
}}

SYLLABUS:
{syllabus_text}

Respond with ONLY the JSON object, no explanation or markdown:"""
    return prompt
| 92 | + |
| 93 | + |
def parse_llm_response(response_text: str) -> Dict[str, Any]:
    """Extract the first JSON object found in an LLM reply.

    ``<think>...</think>`` sections are removed first. The remaining text is
    then scanned for the first position at which a complete JSON object can
    be decoded, so Markdown fences, explanations or stray braces before or
    after the object do not prevent parsing.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON object, or a dict with empty ``course_contents`` and
        ``course_outcomes`` lists plus a ``parse_error`` message when no
        JSON object could be decoded.
    """
    # Remove thinking tags if present (Qwen3 may use these)
    cleaned = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL).strip()

    decoder = json.JSONDecoder()
    start = cleaned.find("{")
    while start != -1:
        try:
            # raw_decode parses one value starting at `start` and ignores
            # whatever follows it, unlike json.loads on a greedy regex match.
            parsed, _ = decoder.raw_decode(cleaned, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(parsed, dict):
                return parsed
        start = cleaned.find("{", start + 1)

    # If no valid JSON found, return empty structure
    return {
        "course_contents": [],
        "course_outcomes": [],
        "parse_error": "Could not parse LLM response",
    }
| 117 | + |
| 118 | + |
def extract_syllabus_structure(syllabus_text: str) -> Dict[str, Any]:
    """
    Ask the LLM API for the course structure contained in syllabus text.

    The reply is passed through parse_llm_response and then
    validate_and_clean_result. If the API call raises, a dict with empty
    course_contents / course_outcomes lists and a user-facing parse_error
    message is returned instead.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that extracts structured data from academic syllabi. Always respond with valid JSON only.",
    }
    user_message = {"role": "user", "content": build_extraction_prompt(syllabus_text)}

    try:
        completion = get_client().chat.completions.create(
            model=LLM_MODEL,
            messages=[system_message, user_message],
            max_tokens=2048,
        )
        raw_reply = completion.choices[0].message.content or ""
    except Exception as exc:
        detail = str(exc)
        # Connection-type failures get a "service unavailable" message;
        # everything else gets the generic failure message.
        if "Connection" in detail or "connect" in detail.lower():
            failure = "The AI service is currently unavailable. Please try again later."
        else:
            failure = "Failed to analyze the syllabus. Please try again or contact support."
        return {
            "course_contents": [],
            "course_outcomes": [],
            "parse_error": failure,
        }

    return validate_and_clean_result(parse_llm_response(raw_reply))
| 160 | + |
| 161 | + |
def _clean_named_items(items: Any) -> list:
    """Turn a raw list of names into a list of ``{"name", "index"}`` dicts."""
    if not isinstance(items, list):
        # The model may emit null, a string or an object instead of a list.
        return []

    cleaned = []
    for position, item in enumerate(items):
        if isinstance(item, str):
            raw_name = item
        elif isinstance(item, dict) and item.get("name") is not None:
            # Object format: {"name": ...}
            raw_name = str(item["name"])
        else:
            continue
        name = raw_name.strip()
        if name:
            # "index" is the position in the raw list, so skipped entries
            # leave gaps in the numbering.
            cleaned.append({"name": name[:255], "index": position})
    return cleaned


def validate_and_clean_result(result: Dict[str, Any]) -> Dict[str, Any]:
    """Validate and clean the extracted result.

    Both ``course_contents`` and ``course_outcomes`` are normalized the same
    way: each entry may be a plain string or an object with a ``"name"``
    key; names are stripped, blank or unusable entries are dropped, and
    names are cut to 255 characters. A missing or non-list value becomes an
    empty list.

    Args:
        result: Parsed LLM output. It is updated in place; other keys such
            as ``parse_error`` are left untouched.

    Returns:
        The same dict, with both lists replaced by lists of
        ``{"name": str, "index": int}`` entries.
    """
    result["course_contents"] = _clean_named_items(result.get("course_contents"))
    result["course_outcomes"] = _clean_named_items(result.get("course_outcomes"))
    return result
| 201 | + |
| 202 | + |
def process_syllabus(pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Main entry point: process a PDF syllabus and extract structure.

    Args:
        pdf_bytes: Raw PDF file bytes

    Returns:
        Dict containing:
        - course_contents: List of {name, index}
        - course_outcomes: List of {name, index}
        - parse_error: Message string, present only when the LLM call
          failed or its reply could not be parsed

        The extracted PDF text itself is not included in the result.
    """
    # Extract text from PDF
    syllabus_text = extract_text_from_pdf_bytes(pdf_bytes)

    # Extract structure using LLM
    return extract_syllabus_structure(syllabus_text)
0 commit comments