44
55import asyncio
66import logging
7+ import traceback
78import uuid
89from collections import deque
910from collections .abc import Callable , Coroutine
10- from dataclasses import dataclass
11+ from dataclasses import dataclass , field
1112from datetime import datetime , timedelta
1213from enum import Enum
1314from typing import Any
1415
1516from ..observability .metrics import (
17+ dead_letter_active_gauge ,
18+ dead_letter_purged_total ,
19+ dead_letter_recorded_total ,
20+ dead_letter_requeued_total ,
1621 record_task_completion ,
1722 record_task_enqueued ,
1823 record_task_latency ,
2126from ..utils .datetime import utc_now
2227from .redis import (
2328 clear_dead_letter ,
29+ count_dead_letters ,
2430 enqueue_task as redis_enqueue_task ,
2531 fetch_dead_letters ,
2632 get_dead_letter ,
@@ -53,6 +59,7 @@ class Task:
5359 coro_fn : Callable [..., Coroutine [Any , Any , Any ]]
5460 args : tuple
5561 kwargs : dict
62+ metadata : dict [str , Any ] = field (default_factory = dict )
5663 status : TaskStatus = TaskStatus .PENDING
5764 created_at : datetime = None # type: ignore[assignment]
5865 started_at : datetime | None = None
@@ -104,6 +111,7 @@ async def enqueue(
104111 coro_fn : Callable [..., Coroutine [Any , Any , Any ]],
105112 * args : Any ,
106113 max_retries : int = 3 ,
114+ metadata : dict [str , Any ] | None = None ,
107115 ** kwargs : Any ,
108116 ) -> str :
109117 """Enqueue a task and return task ID."""
@@ -115,6 +123,7 @@ async def enqueue(
115123 args = args ,
116124 kwargs = kwargs ,
117125 max_retries = max_retries ,
126+ metadata = metadata or {},
118127 )
119128 async with self ._lock :
120129 self .queue .append (task )
@@ -199,8 +208,14 @@ async def _execute_task(self, task: Task):
199208 record_task_latency (self .queue_name , (task .completed_at - task .started_at ).total_seconds ())
200209 logger .error ("Task permanently failed: %s (id=%s)" , task .name , task .task_id )
201210
202- async def list_dead_letters (self , limit : int = 100 ) -> list [dict [str , Any ]]:
203- return []
211+ async def list_dead_letters (
212+ self ,
213+ limit : int = 100 ,
214+ offset : int = 0 ,
215+ workflow_id : str | None = None ,
216+ error_type : str | None = None ,
217+ ) -> tuple [list [dict [str , Any ]], int ]:
218+ return [], 0
204219
async def delete_dead_letter(self, task_id: str) -> None:
    """Discard the dead-letter entry for *task_id*.

    This implementation does nothing and returns ``None``; the method is
    present so callers can invoke it without checking the queue type.

    Args:
        task_id: Identifier of the dead-letter entry (unused here).
    """
    return None
@@ -254,6 +269,7 @@ async def enqueue( # type: ignore[override]
254269 coro_fn : Callable [..., Coroutine [Any , Any , Any ]],
255270 * args : Any ,
256271 max_retries : int = 3 ,
272+ metadata : dict [str , Any ] | None = None ,
257273 ** kwargs : Any ,
258274 ) -> str :
259275 self .register (name , coro_fn )
@@ -264,6 +280,7 @@ async def enqueue( # type: ignore[override]
264280 "kwargs" : kwargs ,
265281 "max_retries" : max_retries ,
266282 "enqueued_at" : utc_now ().isoformat (),
283+ "metadata" : metadata or {},
267284 }
268285 await redis_enqueue_task (self ._redis , name , payload )
269286 record_task_enqueued (self .queue_name )
@@ -289,10 +306,20 @@ async def _worker(self, worker_id: int):
289306 coro_fn = self ._registry .get (name )
290307 if coro_fn is None :
291308 logger .error ("No registered task callable for %s" , name )
292- await record_dead_letter (
293- self ._redis ,
294- {** payload , "error" : "missing_callable" , "worker_id" : worker_id },
295- )
309+ payload .setdefault ("metadata" , {})
310+ payload ["error_type" ] = "MissingCallable"
311+ payload ["error_message" ] = f"Task callable not registered: { name } "
312+ payload ["stack_trace" ] = None
313+ payload ["worker_id" ] = worker_id
314+ await record_dead_letter (self ._redis , payload )
315+ dead_letter_recorded_total .labels (
316+ queue = self .queue_name ,
317+ error_type = "TimeoutError" ,
318+ ).inc ()
319+ dead_letter_recorded_total .labels (
320+ queue = self .queue_name ,
321+ error_type = payload .get ("error_type" , payload .get ("last_error" , "unknown" )),
322+ ).inc ()
296323 record_task_completion (self .queue_name , TaskStatus .FAILED .value )
297324 continue
298325
@@ -305,21 +332,35 @@ async def _worker(self, worker_id: int):
305332 coro_fn (* payload .get ("args" , ()), ** payload .get ("kwargs" , {})),
306333 timeout = self ._task_timeout ,
307334 )
308- except asyncio .TimeoutError :
335+ except TimeoutError :
336+ payload .setdefault ("metadata" , {})
309337 payload ["retry_count" ] = retries + 1
310338 payload ["last_error" ] = "timeout"
339+ payload ["error_type" ] = "TimeoutError"
340+ payload ["error_message" ] = (
341+ f"Task execution exceeded timeout of { self ._task_timeout } seconds"
342+ )
343+ payload ["stack_trace" ] = None
311344 payload ["worker_id" ] = worker_id
312345 await record_dead_letter (self ._redis , payload )
313346 record_task_completion (self .queue_name , TaskStatus .FAILED .value )
314347 record_task_latency (self .queue_name , (utc_now () - start ).total_seconds ())
315348 continue
316349 except Exception as exc : # pylint: disable=broad-except
350+ payload .setdefault ("metadata" , {})
317351 retries += 1
318352 payload ["retry_count" ] = retries
319353 payload ["last_error" ] = str (exc )
320354 if retries >= max_retries :
355+ payload ["error_type" ] = exc .__class__ .__name__
356+ payload ["error_message" ] = str (exc )
357+ payload ["stack_trace" ] = traceback .format_exc ()
321358 payload ["worker_id" ] = worker_id
322359 await record_dead_letter (self ._redis , payload )
360+ dead_letter_recorded_total .labels (
361+ queue = self .queue_name ,
362+ error_type = payload .get ("error_type" , "unknown" ),
363+ ).inc ()
323364 record_task_completion (self .queue_name , TaskStatus .FAILED .value )
324365 record_task_latency (self .queue_name , (utc_now () - start ).total_seconds ())
325366 continue
@@ -343,8 +384,25 @@ async def _worker(self, worker_id: int):
343384
344385 logger .info ("Redis worker %d stopped" , worker_id )
345386
async def list_dead_letters(
    self,
    limit: int = 100,
    offset: int = 0,
    workflow_id: str | None = None,
    error_type: str | None = None,
) -> tuple[list[dict[str, Any]], int]:
    """Return one page of dead-letter entries plus the filtered total.

    All entries are fetched from Redis, filtered in memory, and then
    sliced to the requested window.

    Args:
        limit: Maximum number of entries to return; negative values are
            treated as zero.
        offset: Number of matching entries to skip; negative values are
            treated as zero.
        workflow_id: When truthy, keep only entries whose
            ``metadata["workflow_id"]`` equals this value.
        error_type: When truthy, keep only entries whose ``error_type``
            equals this value.

    Returns:
        ``(window, total_filtered)`` where ``window`` is the requested
        page and ``total_filtered`` counts every entry that matched the
        filters, before paging.
    """
    items, total_raw = await fetch_dead_letters(self._redis, limit=None, include_total=True)

    def _matches(item: dict[str, Any]) -> bool:
        # ``or {}`` also covers entries whose metadata was stored as None.
        if workflow_id and (item.get("metadata") or {}).get("workflow_id") != workflow_id:
            return False
        return not (error_type and item.get("error_type") != error_type)

    filtered = [item for item in items if _matches(item)]

    # Clamp so a negative offset/limit cannot wrap around the list.
    start = max(offset, 0)
    window = filtered[start : start + max(limit, 0)]

    # The gauge tracks the whole backlog, so it uses the unfiltered total
    # (assumed to be what ``include_total=True`` returns — confirm against
    # fetch_dead_letters) rather than the size of this filtered view.
    dead_letter_active_gauge.labels(queue=self.queue_name).set(total_raw)
    return window, len(filtered)
348406
349407 async def delete_dead_letter (self , task_id : str ) -> None :
350408 await clear_dead_letter (self ._redis , task_id )
@@ -363,6 +421,9 @@ async def requeue_dead_letter(self, task_id: str) -> dict[str, Any] | None:
363421 payload ["requeued_at" ] = utc_now ().isoformat ()
364422 await redis_enqueue_task (self ._redis , payload .get ("name" , "unknown" ), payload )
365423 record_task_enqueued (self .queue_name )
424+ dead_letter_requeued_total .labels (
425+ queue = self .queue_name , error_type = payload .get ("error_type" , "unknown" )
426+ ).inc ()
366427 logger .info ("Dead-letter task requeued: %s" , task_id )
367428 return payload
368429
@@ -371,9 +432,15 @@ async def get_dead_letter(self, task_id: str) -> dict[str, Any] | None:
371432
async def purge_dead_letters(self, *, older_than: timedelta | None = None) -> int:
    """Delete dead-letter entries and update the purge/active metrics.

    Within the queue class, the inner ``purge_dead_letters`` call refers
    to the module-level helper imported from ``.redis``, not this method.

    Args:
        older_than: When given, only entries older than
            ``utc_now() - older_than`` are purged; when ``None`` every
            entry is purged.

    Returns:
        The number of entries deleted, as reported by the Redis helper.
    """
    if older_than is None:
        mode = "all"
        deleted = await purge_dead_letters(self._redis)
    else:
        mode = "age_filter"
        cutoff = utc_now() - older_than
        deleted = await purge_dead_letters(self._redis, older_than=cutoff)

    dead_letter_purged_total.labels(queue=self.queue_name, mode=mode).inc(deleted)
    # Re-read the count after the purge so the gauge shows what is left.
    remaining = await count_dead_letters(self._redis)
    dead_letter_active_gauge.labels(queue=self.queue_name).set(remaining)
    return deleted
377444
378445 _task_queue = RedisTaskQueue (max_workers = settings .task_queue_workers )
379446 else :
0 commit comments