Skip to content

Commit 46f183f

Browse files
authored
feat(dingo): v0.4.9 - Fix JSON parsing and improve LLM error handling (#2164)
1 parent ce98548 commit 46f183f

File tree

4 files changed

+305
-638
lines changed

4 files changed

+305
-638
lines changed

tools/dingo/manifest.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: 0.4.8
1+
version: 0.4.9
22
type: plugin
33
author: langgenius
44
name: dingo

tools/dingo/tools/keyword_extraction.py

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -251,14 +251,70 @@ def _load_dictionary(dictionary_path: Path) -> list[str]:
251251
"""Load and flatten the keyword dictionary."""
252252
with open(dictionary_path, 'r', encoding='utf-8') as f:
253253
data = json.load(f)
254-
254+
255255
# Flatten all categories into a single list
256256
all_keywords = []
257257
for category, keywords in data.get("keywords", {}).items():
258258
all_keywords.extend(keywords)
259-
259+
260260
return all_keywords
261-
261+
262+
@staticmethod
263+
def _clean_json_response(response_text: str) -> str:
264+
"""Clean up LLM response to extract valid JSON."""
265+
# Remove markdown code blocks
266+
if response_text.startswith("```json"):
267+
response_text = response_text[7:]
268+
if response_text.startswith("```"):
269+
response_text = response_text[3:]
270+
if response_text.endswith("```"):
271+
response_text = response_text[:-3]
272+
273+
# Remove any leading/trailing whitespace
274+
response_text = response_text.strip()
275+
276+
# Try to find JSON object boundaries
277+
start_idx = response_text.find('{')
278+
if start_idx > 0:
279+
response_text = response_text[start_idx:]
280+
281+
return response_text
282+
283+
@staticmethod
284+
def _repair_truncated_json(json_str: str) -> str | None:
285+
"""
286+
Attempt to repair truncated JSON by closing open brackets.
287+
288+
Returns repaired JSON string or None if repair failed.
289+
"""
290+
try:
291+
# Count open brackets
292+
open_braces = json_str.count('{') - json_str.count('}')
293+
open_brackets = json_str.count('[') - json_str.count(']')
294+
295+
# If severely unbalanced, try to find last complete entry
296+
if open_braces > 2 or open_brackets > 2:
297+
# Find the last complete keyword entry (ends with })
298+
last_complete = json_str.rfind('},')
299+
if last_complete > 0:
300+
json_str = json_str[:last_complete + 1]
301+
# Recount after truncation
302+
open_braces = json_str.count('{') - json_str.count('}')
303+
open_brackets = json_str.count('[') - json_str.count(']')
304+
305+
# Remove trailing comma if present
306+
json_str = json_str.rstrip().rstrip(',')
307+
308+
# Close brackets
309+
json_str += ']' * open_brackets
310+
json_str += '}' * open_braces
311+
312+
# Validate by parsing
313+
json.loads(json_str)
314+
return json_str
315+
except Exception:
316+
return None
317+
262318
@staticmethod
263319
def _prepare_text_for_matching(text: str) -> str:
264320
"""
@@ -418,7 +474,7 @@ def _extract_with_llm(self, resume_text: str) -> list[dict[str, Any]]:
418474
"mode": "chat",
419475
"completion_params": {
420476
"temperature": 0.3, # Lower temperature for more precise extraction
421-
"max_tokens": 2000
477+
"max_tokens": 4000 # Increased to avoid truncation
422478
}
423479
}
424480

@@ -450,17 +506,20 @@ def _extract_with_llm(self, resume_text: str) -> list[dict[str, Any]]:
450506
print(f"❌ LLM returned empty response after {max_retries} attempts")
451507
return []
452508

453-
# Remove markdown code blocks if present
454-
if response_text.startswith("```json"):
455-
response_text = response_text[7:]
456-
if response_text.startswith("```"):
457-
response_text = response_text[3:]
458-
if response_text.endswith("```"):
459-
response_text = response_text[:-3]
460-
response_text = response_text.strip()
461-
462-
# Parse JSON
463-
llm_data = json.loads(response_text)
509+
# Clean up response text
510+
response_text = self._clean_json_response(response_text)
511+
512+
# Parse JSON with repair attempt
513+
try:
514+
llm_data = json.loads(response_text)
515+
except json.JSONDecodeError:
516+
# Try to repair truncated JSON
517+
repaired = self._repair_truncated_json(response_text)
518+
if repaired:
519+
llm_data = json.loads(repaired)
520+
else:
521+
raise
522+
464523
keywords = llm_data.get("keywords", [])
465524

466525
if keywords:

0 commit comments

Comments
 (0)