@@ -323,6 +323,7 @@ async def _worker(self, worker_id: int):
323323 recent_failures : dict [str , list [datetime ]] = {}
324324 auto_requeue_counts : dict [str , int ] = {}
325325 last_alert_sent : dict [str , datetime ] = {}
326+ failure_metrics : dict [str , list [bool ]] = {}
326327
327328 while self .running :
328329 auto_errors = set (settings .task_queue_auto_requeue_errors )
@@ -372,6 +373,23 @@ async def _send_alert(error_type: str, payload: dict[str, Any]) -> None:
372373 except Exception as exc : # pragma: no cover - logging
373374 logger .error ("Failed to send Slack alert: %s" , exc )
374375
async def _apply_adaptive_policy(task_name: str) -> None:
    """Loosen the stored retry policy for a task whose failure rate is high.

    Reads the success/failure samples recorded for ``task_name`` in the
    enclosing worker's ``failure_metrics`` mapping. Nothing happens until at
    least ``settings.task_queue_adaptive_min_samples`` samples exist and the
    observed failure rate reaches
    ``settings.task_queue_adaptive_failure_threshold``.

    When both conditions hold, the policy returned by ``get_retry_policy``
    (or an empty one) is filled in with defaults, ``max_retries`` is raised
    by one (clamped to ``settings.task_queue_max_auto_requeues``),
    ``timeout`` is raised by 5.0, the result is written back with
    ``set_retry_policy``, and the samples for the task are cleared.

    Args:
        task_name: Key used both for the sample list and the stored policy.

    Note:
        This is a closure: ``self``, ``failure_metrics`` and ``payload`` are
        read from the enclosing scope at call time, so it must only be
        called after ``payload`` has been assigned.
    """
    samples = failure_metrics.get(task_name, [])
    # With no samples there is no failure rate to compute; this also avoids
    # dividing by zero when the configured minimum sample count is 0.
    if not samples or len(samples) < settings.task_queue_adaptive_min_samples:
        return

    # Count failures directly. ``1 - successes / total`` accumulates float
    # rounding error (1 - 0.9 == 0.09999999999999998), which can push a rate
    # sitting exactly on the threshold just below it and skip the adjustment.
    failures = sum(1 for succeeded in samples if not succeeded)
    failure_rate = failures / len(samples)
    if failure_rate < settings.task_queue_adaptive_failure_threshold:
        return

    policy = await get_retry_policy(self._redis, task_name) or {}
    # Fill in anything the stored policy does not define before adjusting it.
    policy.setdefault("max_retries", payload.get("max_retries", 3))
    policy.setdefault("timeout", self._task_timeout)
    policy.setdefault("backoff_base", self._backoff_base)
    policy.setdefault("backoff_max", self._backoff_max)

    # NOTE(review): the clamp also lowers ``max_retries`` when the stored
    # value already exceeds task_queue_max_auto_requeues - confirm intended.
    policy["max_retries"] = min(
        int(policy["max_retries"]) + 1,
        settings.task_queue_max_auto_requeues,
    )
    # NOTE(review): no upper bound here; repeated adjustments keep adding 5.0.
    policy["timeout"] = float(policy["timeout"]) + 5.0
    await set_retry_policy(self._redis, task_name, policy)

    # Start a fresh observation window once the policy has been adjusted.
    # NOTE(review): samples are only cleared here, so the list for a task
    # that stays below the threshold keeps growing - consider a bounded window.
    failure_metrics[task_name] = []
392+
375393 payload = await self .pop ()
376394 if not payload :
377395 await asyncio .sleep (self ._poll_interval )
@@ -460,6 +478,9 @@ async def _send_alert(error_type: str, payload: dict[str, Any]) -> None:
460478 auto_requeue_counts [key ] = count + 1
461479 if auto_payload :
462480 payload = auto_payload
481+ metrics = failure_metrics .setdefault (name , [])
482+ metrics .append (False )
483+ await _apply_adaptive_policy (name )
463484 if _record_failure (error_type , identifier ):
464485 await _send_alert (error_type , payload )
465486 continue
@@ -469,9 +490,15 @@ async def _send_alert(error_type: str, payload: dict[str, Any]) -> None:
469490 float (policy .get ("backoff_max" , self ._backoff_max )),
470491 )
471492 await asyncio .sleep (backoff )
493+ metrics = failure_metrics .setdefault (name , [])
494+ metrics .append (False )
495+ await _apply_adaptive_policy (name )
472496 await redis_enqueue_task (self ._redis , name , payload )
473497 continue
474498
499+ metrics = failure_metrics .setdefault (name , [])
500+ metrics .append (True )
501+ await _apply_adaptive_policy (name )
475502 await set_task_result (self ._redis , task_id , {"status" : "completed" , "result" : result })
476503 record_task_completion (self .queue_name , TaskStatus .COMPLETED .value )
477504 record_task_latency (self .queue_name , (utc_now () - start ).total_seconds ())
0 commit comments