Skip to content

Commit 6cf0986

Browse files
Add error handling for LLM timeouts and rate limits
- Add try/except with logging to evaluation, assessment, and feedback agents
- Map timeouts to HTTP 504, rate limits to HTTP 429 in API endpoints
- Add error_type field to streaming SSE error events
- Sanitize error messages to avoid leaking internal details
- Add debug log for silent ACCEPT fallback in evaluation parsing
1 parent c89dcac commit 6cf0986

File tree

4 files changed

+107
-9
lines changed

4 files changed

+107
-9
lines changed

src/agents/assessment_agent.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,16 @@
44
elements or dimensions in the HED annotation.
55
"""
66

7+
import logging
78
from pathlib import Path
89

910
from langchain_core.language_models import BaseChatModel
1011
from langchain_core.messages import HumanMessage, SystemMessage
1112

1213
from src.agents.state import HedAnnotationState
1314

15+
logger = logging.getLogger(__name__)
16+
1417

1518
class AssessmentAgent:
1619
"""Agent that performs final assessment of HED annotations.
@@ -104,7 +107,11 @@ async def assess(self, state: HedAnnotationState) -> dict:
104107
HumanMessage(content=user_prompt),
105108
]
106109

107-
response = await self.llm.ainvoke(messages)
110+
try:
111+
response = await self.llm.ainvoke(messages)
112+
except Exception as e:
113+
logger.error("Assessment LLM invocation failed: %s", e, exc_info=True)
114+
raise
108115
content = response.content
109116
feedback = content.strip() if isinstance(content, str) else str(content)
110117

src/agents/evaluation_agent.py

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
the original natural language event description.
55
"""
66

7+
import logging
78
import re
89
from pathlib import Path
910

@@ -13,6 +14,8 @@
1314
from src.agents.state import HedAnnotationState
1415
from src.utils.json_schema_loader import HedJsonSchemaLoader, load_latest_schema
1516

17+
logger = logging.getLogger(__name__)
18+
1619

1720
class EvaluationAgent:
1821
"""Agent that evaluates the faithfulness of HED annotations.
@@ -164,7 +167,11 @@ async def evaluate(self, state: HedAnnotationState) -> dict:
164167
HumanMessage(content=user_prompt),
165168
]
166169

167-
response = await self.llm.ainvoke(messages)
170+
try:
171+
response = await self.llm.ainvoke(messages)
172+
except Exception as e:
173+
logger.error("Evaluation LLM invocation failed: %s", e, exc_info=True)
174+
raise
168175
content = response.content
169176
feedback = content.strip() if isinstance(content, str) else str(content)
170177

@@ -206,6 +213,9 @@ def _parse_decision(self, feedback: str) -> bool:
206213
return False
207214

208215
# Default to accept if ambiguous -- avoid unnecessary refinement loops
216+
logger.debug(
217+
"Evaluation parsing: no explicit DECISION/FAITHFUL/refine indicator found; defaulting to ACCEPT"
218+
)
209219
return True
210220

211221
def _check_tags_and_suggest(self, annotation: str) -> str:

src/agents/feedback_summarizer.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,11 +4,15 @@
44
into concise, actionable points for the annotation agent.
55
"""
66

7+
import logging
8+
79
from langchain_core.language_models import BaseChatModel
810
from langchain_core.messages import HumanMessage, SystemMessage
911

1012
from src.agents.state import HedAnnotationState
1113

14+
logger = logging.getLogger(__name__)
15+
1216

1317
class FeedbackSummarizer:
1418
"""Agent that summarizes validation errors and feedback.
@@ -112,7 +116,11 @@ async def summarize(self, state: HedAnnotationState) -> dict:
112116
HumanMessage(content=user_prompt),
113117
]
114118

115-
response = await self.llm.ainvoke(messages)
119+
try:
120+
response = await self.llm.ainvoke(messages)
121+
except Exception as e:
122+
logger.error("Feedback summarization LLM invocation failed: %s", e, exc_info=True)
123+
raise
116124
content = response.content
117125
summarized_feedback = content.strip() if isinstance(content, str) else str(content)
118126

src/api/main.py

Lines changed: 79 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
from fastapi.middleware.cors import CORSMiddleware
1919
from fastapi.responses import StreamingResponse
2020
from langchain_community.chat_models import ChatOllama
21+
from openai import APITimeoutError, RateLimitError
2122

2223
from src import __version__
2324
from src.agents.vision_agent import VisionAgent
@@ -699,10 +700,21 @@ async def annotate(
699700
status=status,
700701
)
701702

703+
except APITimeoutError as e:
704+
raise HTTPException(
705+
status_code=504,
706+
detail="LLM request timed out. Try again or use a faster model/provider.",
707+
) from e
708+
except RateLimitError as e:
709+
raise HTTPException(
710+
status_code=429,
711+
detail="LLM rate limit exceeded. Please wait and try again.",
712+
) from e
702713
except Exception as e:
714+
logging.exception("Annotation workflow failed")
703715
raise HTTPException(
704716
status_code=500,
705-
detail=f"Annotation workflow failed: {str(e)}",
717+
detail="An error occurred during annotation processing.",
706718
) from e
707719

708720

@@ -895,10 +907,21 @@ async def annotate_from_image(
895907
image_metadata=image_metadata,
896908
)
897909

910+
except APITimeoutError as e:
911+
raise HTTPException(
912+
status_code=504,
913+
detail="LLM request timed out. Try again or use a faster model/provider.",
914+
) from e
915+
except RateLimitError as e:
916+
raise HTTPException(
917+
status_code=429,
918+
detail="LLM rate limit exceeded. Please wait and try again.",
919+
) from e
898920
except Exception as e:
921+
logging.exception("Image annotation workflow failed")
899922
raise HTTPException(
900923
status_code=500,
901-
detail=f"Image annotation workflow failed: {str(e)}",
924+
detail="An error occurred during image annotation processing.",
902925
) from e
903926

904927

@@ -1110,10 +1133,35 @@ def send_event(event_type: str, data: dict) -> str:
11101133

11111134
except asyncio.CancelledError:
11121135
raise
1136+
except APITimeoutError:
1137+
logging.exception("Streaming workflow timeout")
1138+
yield send_event(
1139+
"error",
1140+
{
1141+
"message": "LLM request timed out. Try again or use a faster model/provider.",
1142+
"error_type": "timeout",
1143+
},
1144+
)
1145+
yield send_event("done", {"message": "Workflow ended with error"})
1146+
except RateLimitError:
1147+
logging.exception("Streaming workflow rate limit")
1148+
yield send_event(
1149+
"error",
1150+
{
1151+
"message": "LLM rate limit exceeded. Please wait and try again.",
1152+
"error_type": "rate_limit",
1153+
},
1154+
)
1155+
yield send_event("done", {"message": "Workflow ended with error"})
11131156
except Exception:
1114-
# Log the actual error for debugging, but return a generic message
11151157
logging.exception("Streaming workflow error")
1116-
yield send_event("error", {"message": "An error occurred during annotation processing"})
1158+
yield send_event(
1159+
"error",
1160+
{
1161+
"message": "An error occurred during annotation processing.",
1162+
"error_type": "internal",
1163+
},
1164+
)
11171165
yield send_event("done", {"message": "Workflow ended with error"})
11181166

11191167
return StreamingResponse(
@@ -1381,10 +1429,35 @@ def send_event(event_type: str, data: dict) -> str:
13811429

13821430
except asyncio.CancelledError:
13831431
raise
1432+
except APITimeoutError:
1433+
logging.exception("Streaming image workflow timeout")
1434+
yield send_event(
1435+
"error",
1436+
{
1437+
"message": "LLM request timed out. Try again or use a faster model/provider.",
1438+
"error_type": "timeout",
1439+
},
1440+
)
1441+
yield send_event("done", {"message": "Workflow ended with error"})
1442+
except RateLimitError:
1443+
logging.exception("Streaming image workflow rate limit")
1444+
yield send_event(
1445+
"error",
1446+
{
1447+
"message": "LLM rate limit exceeded. Please wait and try again.",
1448+
"error_type": "rate_limit",
1449+
},
1450+
)
1451+
yield send_event("done", {"message": "Workflow ended with error"})
13841452
except Exception:
1385-
# Log the actual error for debugging, but return a generic message
13861453
logging.exception("Streaming image annotation workflow error")
1387-
yield send_event("error", {"message": "An error occurred during image annotation"})
1454+
yield send_event(
1455+
"error",
1456+
{
1457+
"message": "An error occurred during image annotation processing.",
1458+
"error_type": "internal",
1459+
},
1460+
)
13881461
yield send_event("done", {"message": "Workflow ended with error"})
13891462

13901463
return StreamingResponse(

0 commit comments

Comments (0)