@@ -251,14 +251,70 @@ def _load_dictionary(dictionary_path: Path) -> list[str]:
251251 """Load and flatten the keyword dictionary."""
252252 with open (dictionary_path , 'r' , encoding = 'utf-8' ) as f :
253253 data = json .load (f )
254-
254+
255255 # Flatten all categories into a single list
256256 all_keywords = []
257257 for category , keywords in data .get ("keywords" , {}).items ():
258258 all_keywords .extend (keywords )
259-
259+
260260 return all_keywords
261-
261+
262+ @staticmethod
263+ def _clean_json_response (response_text : str ) -> str :
264+ """Clean up LLM response to extract valid JSON."""
265+ # Remove markdown code blocks
266+ if response_text .startswith ("```json" ):
267+ response_text = response_text [7 :]
268+ if response_text .startswith ("```" ):
269+ response_text = response_text [3 :]
270+ if response_text .endswith ("```" ):
271+ response_text = response_text [:- 3 ]
272+
273+ # Remove any leading/trailing whitespace
274+ response_text = response_text .strip ()
275+
276+ # Try to find JSON object boundaries
277+ start_idx = response_text .find ('{' )
278+ if start_idx > 0 :
279+ response_text = response_text [start_idx :]
280+
281+ return response_text
282+
283+ @staticmethod
284+ def _repair_truncated_json (json_str : str ) -> str | None :
285+ """
286+ Attempt to repair truncated JSON by closing open brackets.
287+
288+ Returns repaired JSON string or None if repair failed.
289+ """
290+ try :
291+ # Count open brackets
292+ open_braces = json_str .count ('{' ) - json_str .count ('}' )
293+ open_brackets = json_str .count ('[' ) - json_str .count (']' )
294+
295+ # If severely unbalanced, try to find last complete entry
296+ if open_braces > 2 or open_brackets > 2 :
297+ # Find the last complete keyword entry (ends with })
298+ last_complete = json_str .rfind ('},' )
299+ if last_complete > 0 :
300+ json_str = json_str [:last_complete + 1 ]
301+ # Recount after truncation
302+ open_braces = json_str .count ('{' ) - json_str .count ('}' )
303+ open_brackets = json_str .count ('[' ) - json_str .count (']' )
304+
305+ # Remove trailing comma if present
306+ json_str = json_str .rstrip ().rstrip (',' )
307+
308+ # Close brackets
309+ json_str += ']' * open_brackets
310+ json_str += '}' * open_braces
311+
312+ # Validate by parsing
313+ json .loads (json_str )
314+ return json_str
315+ except Exception :
316+ return None
317+
262318 @staticmethod
263319 def _prepare_text_for_matching (text : str ) -> str :
264320 """
@@ -418,7 +474,7 @@ def _extract_with_llm(self, resume_text: str) -> list[dict[str, Any]]:
418474 "mode" : "chat" ,
419475 "completion_params" : {
420476 "temperature" : 0.3 , # Lower temperature for more precise extraction
421- "max_tokens" : 2000
477+ "max_tokens" : 4000 # Increased to avoid truncation
422478 }
423479 }
424480
@@ -450,17 +506,20 @@ def _extract_with_llm(self, resume_text: str) -> list[dict[str, Any]]:
450506 print (f"❌ LLM returned empty response after { max_retries } attempts" )
451507 return []
452508
453- # Remove markdown code blocks if present
454- if response_text .startswith ("```json" ):
455- response_text = response_text [7 :]
456- if response_text .startswith ("```" ):
457- response_text = response_text [3 :]
458- if response_text .endswith ("```" ):
459- response_text = response_text [:- 3 ]
460- response_text = response_text .strip ()
461-
462- # Parse JSON
463- llm_data = json .loads (response_text )
509+ # Clean up response text
510+ response_text = self ._clean_json_response (response_text )
511+
512+ # Parse JSON with repair attempt
513+ try :
514+ llm_data = json .loads (response_text )
515+ except json .JSONDecodeError :
516+ # Try to repair truncated JSON
517+ repaired = self ._repair_truncated_json (response_text )
518+ if repaired :
519+ llm_data = json .loads (repaired )
520+ else :
521+ raise
522+
464523 keywords = llm_data .get ("keywords" , [])
465524
466525 if keywords :
0 commit comments