
Commit d62845e

amostt and Aygentic committed
feat(ocr): implement error handling and retry logic with rate limiting
Added comprehensive error classification for OCR API calls with automatic retry logic using modern Celery patterns (autoretry_for, retry_backoff, retry_jitter).

Implements smart error handling:
- RetryableError (500, 502, 503, 504, 408): retried with exponential backoff
- NonRetryableError (400, 401, 403, 404): fails immediately
- RateLimitError (429): respects the Retry-After header

Enhanced structured logging with full context (ingestion_id, error_type, status_code, retry_count) for debugging and monitoring.

All tests passing (101 tests, 86% coverage). No PaddleOCR fallback implemented, per project decision.

Fixes CUR-48

🤖 Generated by Aygentic

Co-Authored-By: Aygentic <[email protected]>
1 parent cbade4e commit d62845e

File tree

4 files changed: +528 -37 lines changed


backend/app/services/ocr.py

Lines changed: 70 additions & 9 deletions
@@ -13,11 +13,45 @@
 
 
 class OCRProviderError(Exception):
-    """Exception raised when OCR provider encounters an error."""
+    """Base exception for OCR provider errors."""
 
     pass
 
 
+class RetryableError(OCRProviderError):
+    """Error that should trigger a retry (500, 502, 503, 504, 408).
+
+    These are transient errors that may resolve with retry.
+    """
+
+    def __init__(self, message: str, status_code: int | None = None):
+        super().__init__(message)
+        self.status_code = status_code
+
+
+class NonRetryableError(OCRProviderError):
+    """Error that should NOT be retried (400, 401, 403, 404).
+
+    These are permanent errors that won't resolve with retry.
+    """
+
+    def __init__(self, message: str, status_code: int | None = None):
+        super().__init__(message)
+        self.status_code = status_code
+
+
+class RateLimitError(RetryableError):
+    """429 Rate Limit error with optional Retry-After header.
+
+    Indicates the API rate limit has been exceeded.
+    The retry_after attribute contains seconds to wait (from Retry-After header).
+    """
+
+    def __init__(self, message: str, retry_after: int | None = None):
+        super().__init__(message, status_code=429)
+        self.retry_after = retry_after  # Seconds to wait before retry
+
+
 class BoundingBox(BaseModel):
     """Bounding box coordinates for content elements."""

@@ -196,14 +230,41 @@ async def extract_text(self, pdf_bytes: bytes) -> OCRResult:
                 },
             )
 
-            if response.status_code != 200:
-                error_msg = f"Mistral API error (status {response.status_code})"
-                try:
-                    error_data = response.json()
-                    error_msg += f": {error_data.get('error', 'Unknown error')}"
-                except Exception:
-                    pass
-                raise OCRProviderError(error_msg)
+            # Error classification based on HTTP status code
+            if response.status_code == 429:
+                # Rate limit - extract Retry-After header
+                retry_after_header = response.headers.get("retry-after")
+                retry_seconds = int(retry_after_header) if retry_after_header else None
+                raise RateLimitError(
+                    "Mistral API rate limit exceeded", retry_after=retry_seconds
+                )
+
+            elif response.status_code == 401:
+                raise NonRetryableError(
+                    "Mistral API authentication failed - check API key",
+                    status_code=401,
+                )
+
+            elif response.status_code in (400, 403, 404):
+                # Client errors - don't retry
+                raise NonRetryableError(
+                    f"Mistral API error: {response.status_code}",
+                    status_code=response.status_code,
+                )
+
+            elif response.status_code in (500, 502, 503, 504, 408):
+                # Server errors - retry
+                raise RetryableError(
+                    f"Mistral API server error: {response.status_code}",
+                    status_code=response.status_code,
+                )
+
+            elif response.status_code != 200:
+                # Unknown error - default to retryable for safety
+                raise RetryableError(
+                    f"Mistral API error: {response.status_code}",
+                    status_code=response.status_code,
+                )
 
             api_response = response.json()
             processing_time = time.time() - start_time
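
Review note (not part of the commit): because RateLimitError subclasses RetryableError, which in turn subclasses OCRProviderError, callers can handle errors as narrowly or broadly as they need. A minimal sketch, assuming the backend package is importable; the describe() helper is illustrative only:

# Illustration only - not part of this commit. Assumes backend/ is on PYTHONPATH.
from app.services.ocr import OCRProviderError, RateLimitError, RetryableError

def describe(exc: OCRProviderError) -> str:
    """Map a provider error to the handling the task layer applies."""
    if isinstance(exc, RateLimitError):      # 429: most specific, checked first
        wait = f"{exc.retry_after}s" if exc.retry_after is not None else "exponential backoff"
        return f"rate limited, retry after {wait}"
    if isinstance(exc, RetryableError):      # 500/502/503/504/408: transient
        return f"retry with backoff (status {exc.status_code})"
    return "fail immediately"                # NonRetryableError and other permanent errors

print(describe(RateLimitError("too many requests", retry_after=30)))
print(describe(RetryableError("bad gateway", status_code=502)))
print(describe(OCRProviderError("bad request")))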

backend/app/tasks/extraction.py

Lines changed: 86 additions & 20 deletions
@@ -12,7 +12,13 @@
 from app.core.config import settings
 from app.core.db import engine
 from app.models import ExtractionStatus, Ingestion
-from app.services.ocr import MistralOCRProvider, OCRProviderError
+from app.services.ocr import (
+    MistralOCRProvider,
+    NonRetryableError,
+    OCRProviderError,
+    RateLimitError,
+    RetryableError,
+)
 from app.services.storage import download_from_storage
 from app.worker import celery_app
 

@@ -29,13 +35,21 @@ def get_db_context() -> Generator[Session, None, None]:
 @celery_app.task(
     bind=True,
     name="app.tasks.extraction.process_ocr",
-    max_retries=3,
-    time_limit=600,  # 10 minutes
+    autoretry_for=(RetryableError,),  # Auto-retry on transient errors
+    retry_backoff=True,  # Exponential backoff: 1s, 2s, 4s, 8s...
+    retry_backoff_max=600,  # Cap backoff at 10 minutes
+    retry_jitter=True,  # Add randomness to prevent thundering herd
+    retry_kwargs={"max_retries": 3},
+    time_limit=600,  # 10 minutes max per attempt
 )  # type: ignore[misc]
 def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
     """
     Process a PDF ingestion through OCR using Mistral AI.
 
+    Automatically retries on RetryableError (500, 502, 503, 504, 408) with
+    exponential backoff. Does NOT retry on NonRetryableError (400, 401, 403, 404).
+    Special handling for RateLimitError (429) to respect Retry-After header.
+
     Args:
         ingestion_id: UUID of the ingestion record
 

@@ -44,10 +58,13 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
 
     Raises:
         ValueError: If ingestion not found or invalid ID format
-        OCRProviderError: If OCR processing fails
-        Retry: If task should be retried (transient errors)
+        NonRetryableError: Permanent errors (auth, bad request)
+        RetryableError: Transient errors (after max retries exhausted)
+        RateLimitError: Rate limit errors (after max retries exhausted)
     """
-    logger.info(f"Starting OCR processing for ingestion: {ingestion_id}")
+    logger.info(
+        f"Starting OCR for {ingestion_id} (attempt {self.request.retries + 1})"
+    )
 
     # Validate ingestion_id format
     try:

@@ -95,8 +112,14 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
         loop.close()
 
         logger.info(
-            f"[{ingestion_id}] OCR completed: {ocr_result.total_pages} pages, "
-            f"{ocr_result.processing_time_seconds:.2f}s"
+            f"[{ingestion_id}] OCR completed successfully",
+            extra={
+                "ingestion_id": ingestion_id,
+                "provider": ocr_result.ocr_provider,
+                "total_pages": ocr_result.total_pages,
+                "processing_time_seconds": ocr_result.processing_time_seconds,
+                "retry_count": self.request.retries,
+            },
         )
 
         # Update ingestion status to OCR_COMPLETE

@@ -115,8 +138,37 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
             "metadata": ocr_result.metadata,
         }
 
-    except OCRProviderError as e:
-        logger.error(f"[{ingestion_id}] OCR provider error: {str(e)}")
+    except RateLimitError as e:
+        # Special handling for 429 - respect Retry-After header
+        logger.warning(
+            f"[{ingestion_id}] Rate limited (429). "
+            f"Retry-After: {e.retry_after}s. Attempt {self.request.retries + 1}/3",
+            extra={
+                "ingestion_id": ingestion_id,
+                "error_type": "RateLimitError",
+                "status_code": 429,
+                "retry_after": e.retry_after,
+                "retry_count": self.request.retries,
+            },
+        )
+
+        if e.retry_after:
+            # Override default backoff with Retry-After value from API
+            raise self.retry(exc=e, countdown=e.retry_after)
+        else:
+            # Let autoretry_for handle it with exponential backoff
+            raise
+
+    except NonRetryableError as e:
+        # Don't retry - permanent errors (401, 400, 403, 404)
+        logger.error(
+            f"[{ingestion_id}] Non-retryable error: {e}",
+            extra={
+                "ingestion_id": ingestion_id,
+                "error_type": type(e).__name__,
+                "status_code": e.status_code,
+            },
+        )
 
         # Update status to FAILED
         with get_db_context() as db:

@@ -126,18 +178,32 @@ def process_ocr_task(self: Any, ingestion_id: str) -> dict[str, Any]:
             db.add(ingestion)
             db.commit()
 
-        # Retry on transient errors (rate limits, timeouts)
-        if "rate limit" in str(e).lower() or "timeout" in str(e).lower():
-            retry_countdown = 2**self.request.retries  # Exponential backoff
-            logger.warning(
-                f"[{ingestion_id}] Retrying after {retry_countdown}s (attempt {self.request.retries + 1}/3)"
-            )
-            raise self.retry(exc=e, countdown=retry_countdown, max_retries=3)
-        else:
-            raise
+        raise  # Don't retry, fail immediately
+
+    except RetryableError as e:
+        # Transient errors - will be caught by autoretry_for
+        logger.warning(
+            f"[{ingestion_id}] Retryable error: {e}. "
+            f"Attempt {self.request.retries + 1}/3",
+            extra={
+                "ingestion_id": ingestion_id,
+                "error_type": type(e).__name__,
+                "status_code": e.status_code,
+                "retry_count": self.request.retries,
+            },
+        )
+        raise  # Let autoretry_for handle it with exponential backoff
 
     except Exception as e:
-        logger.error(f"[{ingestion_id}] Unexpected error during OCR: {str(e)}")
+        # Unexpected error - log and fail
+        logger.error(
+            f"[{ingestion_id}] Unexpected error: {str(e)}",
+            exc_info=True,
+            extra={
+                "ingestion_id": ingestion_id,
+                "error_type": type(e).__name__,
+            },
+        )
 
         # Update status to FAILED
         try:
