@@ -34,7 +34,8 @@ type podEventLoggerOptions struct {

 	logger slog.Logger
 	logDebounce time.Duration
-	maxRetries int
+	// maxRetries is the maximum number of retries for a log send failure.
+	maxRetries int

 	// The following fields are optional!
 	namespaces []string
@@ -414,7 +415,9 @@ type logQueuer struct {
 	loggers map[string]agentLoggerLifecycle
 	logCache logCache

-	retries map[string]*retryState
+	// retries maps agent tokens to their retry state for exponential backoff
+	retries map[string]*retryState
+	// maxRetries is the maximum number of retries for a log send failure.
 	maxRetries int
 }

@@ -436,7 +439,7 @@ func (l *logQueuer) work(ctx context.Context) {
 	}
 }

-func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []agentsdk.Log) (agentLoggerLifecycle, error) {
+func (l *logQueuer) newLogger(ctx context.Context, log agentLog) (agentLoggerLifecycle, error) {
 	client := agentsdk.New(l.coderURL)
 	client.SetSessionToken(log.agentToken)
 	logger := l.logger.With(slog.F("resource_name", log.resourceName))
@@ -448,10 +451,9 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
 		DisplayName: "Kubernetes",
 	})
 	if err != nil {
-		// This shouldn't fail sending the log, as it only affects how they
-		// appear.
+		// Posting the log source failed, which affects how logs appear.
+		// We'll retry to ensure the log source is properly registered.
 		logger.Error(ctx, "post log source", slog.Error(err))
-		l.scheduleRetry(ctx, log.agentToken)
 		return agentLoggerLifecycle{}, err
 	}

@@ -466,7 +468,6 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
 	if err != nil {
 		logger.Error(ctx, "drpc connect", slog.Error(err))
 		gracefulCancel()
-		l.scheduleRetry(ctx, log.agentToken)
 		return agentLoggerLifecycle{}, err
 	}
 	go func() {
@@ -485,6 +486,8 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
 	lifecycle := agentLoggerLifecycle{
 		scriptLogger: sl,
 		close: func() {
+			defer arpc.DRPCConn().Close()
+			defer client.SDK.HTTPClient.CloseIdleConnections()
 			// We could be stopping for reasons other than the timeout. If
 			// so, stop the timer.
 			closeTimer.Stop()
@@ -503,9 +506,6 @@ func (l *logQueuer) newLogger(ctx context.Context, log agentLog, queuedLogs []ag
 				// ctx err
 				logger.Warn(gracefulCtx, "timeout reached while waiting for log queue to empty")
 			}
-
-			_ = arpc.DRPCConn().Close()
-			client.SDK.HTTPClient.CloseIdleConnections()
 		},
 	}
 	lifecycle.closeTimer = closeTimer
@@ -533,7 +533,7 @@ func (l *logQueuer) processLog(ctx context.Context, log agentLog) {
 		}

 		var err error
-		lgr, err = l.newLogger(ctx, log, queuedLogs)
+		lgr, err = l.newLogger(ctx, log)
 		if err != nil {
 			l.scheduleRetry(ctx, log.agentToken)
 			return
@@ -549,7 +549,7 @@ func (l *logQueuer) processLog(ctx context.Context, log agentLog) {
 		l.scheduleRetry(ctx, log.agentToken)
 		return
 	}
-	l.clearRetry(log.agentToken)
+	l.clearRetryLocked(log.agentToken)
 	l.logCache.delete(log.agentToken)
 }

@@ -558,9 +558,8 @@ func (l *logQueuer) processDelete(log agentLog) {
 	lgr, ok := l.loggers[log.agentToken]
 	if ok {
 		delete(l.loggers, log.agentToken)
-
 	}
-	l.clearRetry(log.agentToken)
+	l.clearRetryLocked(log.agentToken)
 	l.logCache.delete(log.agentToken)
 	l.mu.Unlock()

@@ -598,6 +597,7 @@ type retryState struct {
 	delay time.Duration
 	timer *quartz.Timer
 	retryCount int
+	exhausted bool // prevent retry state recreation after max retries
 }

 func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
@@ -606,8 +606,13 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
 	}

 	rs := l.retries[token]
+
+	if rs != nil && rs.exhausted {
+		return
+	}
+
 	if rs == nil {
-		rs = &retryState{delay: time.Second}
+		rs = &retryState{delay: time.Second, retryCount: 0, exhausted: false}
 		l.retries[token] = rs
 	}

@@ -618,7 +623,11 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
 		l.logger.Error(ctx, "max retries exceeded",
 			slog.F("retryCount", rs.retryCount),
 			slog.F("maxRetries", l.maxRetries))
-		l.clearRetry(token)
+		rs.exhausted = true
+		if rs.timer != nil {
+			rs.timer.Stop()
+			rs.timer = nil
+		}
 		l.logCache.delete(token)
 		return
 	}
@@ -627,24 +636,18 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
 		return
 	}

-	if rs.delay < time.Second {
-		rs.delay = time.Second
-	} else if rs.delay > 30*time.Second {
-		rs.delay = 30 * time.Second
-	}
-
 	l.logger.Info(ctx, "scheduling retry",
 		slog.F("delay", rs.delay.String()),
 		slog.F("retryCount", rs.retryCount))

 	rs.timer = l.clock.AfterFunc(rs.delay, func() {
 		l.mu.Lock()
-		if cur := l.retries[token]; cur != nil {
+		defer l.mu.Unlock()
+
+		if cur := l.retries[token]; cur != nil && !cur.exhausted {
 			cur.timer = nil
+			l.q <- agentLog{op: opLog, agentToken: token}
 		}
-		l.mu.Unlock()
-
-		l.q <- agentLog{op: opLog, agentToken: token}
 	})

 	rs.delay *= 2
@@ -653,7 +656,9 @@ func (l *logQueuer) scheduleRetry(ctx context.Context, token string) {
 	}
 }

-func (l *logQueuer) clearRetry(token string) {
+// clearRetryLocked clears the retry state for the given token.
+// The caller must hold the mutex lock.
+func (l *logQueuer) clearRetryLocked(token string) {
 	if rs := l.retries[token]; rs != nil {
 		if rs.timer != nil {
 			rs.timer.Stop()