@@ -13,6 +13,8 @@ import (
 
 	dockerTypes "github.com/docker/docker/api/types"
 	dockerContainer "github.com/docker/docker/api/types/container"
+	dockerTypesEvents "github.com/docker/docker/api/types/events"
+	dockerFilter "github.com/docker/docker/api/types/filters"
 	"github.com/docker/docker/client"
 	"github.com/prometheus/client_golang/prometheus"
 	log "github.com/sirupsen/logrus"
@@ -53,7 +55,6 @@ type DockerSource struct {
 	runningContainerState map[string]*ContainerConfig
 	compiledContainerName []*regexp.Regexp
 	compiledContainerID   []*regexp.Regexp
-	CheckIntervalDuration time.Duration
 	logger                *log.Entry
 	Client                client.CommonAPIClient
 	t                     *tomb.Tomb
@@ -75,9 +76,8 @@ func (d *DockerSource) GetUuid() string {
 
 func (d *DockerSource) UnmarshalConfig(yamlConfig []byte) error {
 	d.Config = DockerConfiguration{
-		FollowStdout:  true, // default
-		FollowStdErr:  true, // default
-		CheckInterval: "1s", // default
+		FollowStdout: true, // default
+		FollowStdErr: true, // default
 	}
 
 	err := yaml.UnmarshalStrict(yamlConfig, &d.Config)
@@ -97,9 +97,8 @@ func (d *DockerSource) UnmarshalConfig(yamlConfig []byte) error {
 		return errors.New("use_container_labels and container_name, container_id, container_id_regexp, container_name_regexp are mutually exclusive")
 	}
 
-	d.CheckIntervalDuration, err = time.ParseDuration(d.Config.CheckInterval)
-	if err != nil {
-		return fmt.Errorf("parsing 'check_interval' parameters: %s", d.CheckIntervalDuration)
+	if d.Config.CheckInterval != "" {
+		d.logger.Warn("check_interval is deprecated, it will be removed in a future version")
 	}
 
 	if d.Config.Mode == "" {
@@ -495,63 +494,164 @@ func (d *DockerSource) EvalContainer(ctx context.Context, container dockerTypes.
 	return nil
 }
 
+func (d *DockerSource) checkContainers(ctx context.Context, monitChan chan *ContainerConfig, deleteChan chan *ContainerConfig) error {
+	// to track for garbage collection
+	runningContainersID := make(map[string]bool)
+
+	runningContainers, err := d.Client.ContainerList(ctx, dockerContainer.ListOptions{})
+	if err != nil {
+		if strings.Contains(strings.ToLower(err.Error()), "cannot connect to the docker daemon at") {
+			for idx, container := range d.runningContainerState {
+				if d.runningContainerState[idx].t.Alive() {
+					d.logger.Infof("killing tail for container %s", container.Name)
+					d.runningContainerState[idx].t.Kill(nil)
+
+					if err := d.runningContainerState[idx].t.Wait(); err != nil {
+						d.logger.Infof("error while waiting for death of %s : %s", container.Name, err)
+					}
+				}
+
+				delete(d.runningContainerState, idx)
+			}
+		} else {
+			log.Errorf("container list err: %s", err)
+		}
+
+		return err
+	}
+
+	for _, container := range runningContainers {
+		runningContainersID[container.ID] = true
+
+		// don't need to re eval an already monitored container
+		if _, ok := d.runningContainerState[container.ID]; ok {
+			continue
+		}
+
+		if containerConfig := d.EvalContainer(ctx, container); containerConfig != nil {
+			monitChan <- containerConfig
+		}
+	}
+
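+	// anything we are tailing that is no longer in the container list gets scheduled for cleanup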
+	for containerStateID, containerConfig := range d.runningContainerState {
+		if _, ok := runningContainersID[containerStateID]; !ok {
+			deleteChan <- containerConfig
+		}
+	}
+
+	d.logger.Tracef("Reading logs from %d containers", len(d.runningContainerState))
+	return nil
+}
+
+// subscribeEvents will loop until it can successfully call d.Client.Events()
+// without immediately receiving an error. It applies exponential backoff on failures.
+// Returns the new (eventsChan, errChan) pair or an error if context/tomb is done.
+func (d *DockerSource) subscribeEvents(ctx context.Context) (<-chan dockerTypesEvents.Message, <-chan error, error) {
+	const (
+		initialBackoff = 2 * time.Second
+		backoffFactor  = 2
+		maxBackoff     = 60 * time.Second
+	)
+
+	f := dockerFilter.NewArgs()
+	f.Add("type", "container")
+
+	options := dockerTypesEvents.ListOptions{
+		Filters: f,
+	}
+
+	backoff := initialBackoff
+	retries := 0
+
+	d.logger.Infof("Subscribing to Docker events")
+
+	for {
+		// bail out immediately if the context is canceled
+		select {
+		case <-ctx.Done():
+			return nil, nil, ctx.Err()
+		case <-d.t.Dying():
+			return nil, nil, errors.New("connection aborted, shutting down docker watcher")
+		default:
+		}
+
+		// Try to reconnect
+		eventsChan, errChan := d.Client.Events(ctx, options)
+
+		// Retry if the connection is immediately broken
+		select {
+		case err := <-errChan:
+			d.logger.Errorf("Connection to Docker failed (attempt %d): %v", retries+1, err)
+
+			retries++
+
+			d.logger.Infof("Sleeping %s before next retry", backoff)
+
+			// Wait for 'backoff', but still allow cancellation
+			select {
+			case <-time.After(backoff):
+				// Continue after backoff
+			case <-ctx.Done():
+				return nil, nil, ctx.Err()
+			case <-d.t.Dying():
+				return nil, nil, errors.New("connection aborted, shutting down docker watcher")
+			}
+
+			backoff = min(backoff*backoffFactor, maxBackoff)
+
+			continue
+		default:
+			// great success!
+			return eventsChan, errChan, nil
+		}
+	}
+}
+
 func (d *DockerSource) WatchContainer(ctx context.Context, monitChan chan *ContainerConfig, deleteChan chan *ContainerConfig) error {
-	ticker := time.NewTicker(d.CheckIntervalDuration)
-	d.logger.Infof("Container watcher started, interval: %s", d.CheckIntervalDuration.String())
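+	// do an initial sweep so containers that are already running are picked up before we subscribe to events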
+	err := d.checkContainers(ctx, monitChan, deleteChan)
+	if err != nil {
+		return err
+	}
+
+	eventsChan, errChan, err := d.subscribeEvents(ctx)
+	if err != nil {
+		return err
+	}
 
 	for {
 		select {
 		case <-d.t.Dying():
 			d.logger.Infof("stopping container watcher")
 			return nil
-		case <-ticker.C:
-			// to track for garbage collection
-			runningContainersID := make(map[string]bool)
 
-			runningContainers, err := d.Client.ContainerList(ctx, dockerContainer.ListOptions{})
-			if err != nil {
-				if strings.Contains(strings.ToLower(err.Error()), "cannot connect to the docker daemon at") {
-					for idx, container := range d.runningContainerState {
-						if d.runningContainerState[idx].t.Alive() {
-							d.logger.Infof("killing tail for container %s", container.Name)
-							d.runningContainerState[idx].t.Kill(nil)
-
-							if err := d.runningContainerState[idx].t.Wait(); err != nil {
-								d.logger.Infof("error while waiting for death of %s : %s", container.Name, err)
-							}
-						}
-
-						delete(d.runningContainerState, idx)
-					}
-				} else {
-					log.Errorf("container list err: %s", err)
+		case event := <-eventsChan:
+			if event.Action == dockerTypesEvents.ActionStart || event.Action == dockerTypesEvents.ActionDie {
+				if err := d.checkContainers(ctx, monitChan, deleteChan); err != nil {
+					d.logger.Warnf("Failed to check containers: %v", err)
 				}
+			}
 
+		case err := <-errChan:
+			if err == nil {
 				continue
 			}
 
-			for _, container := range runningContainers {
-				runningContainersID[container.ID] = true
+			d.logger.Errorf("Docker events error: %v", err)
 
-				// don't need to re eval an already monitored container
-				if _, ok := d.runningContainerState[container.ID]; ok {
-					continue
-				}
-
-				if containerConfig := d.EvalContainer(ctx, container); containerConfig != nil {
-					monitChan <- containerConfig
-				}
+			// try to reconnect, replacing our channels on success. They are never nil if err is nil.
+			newEvents, newErr, recErr := d.subscribeEvents(ctx)
+			if recErr != nil {
+				return recErr
 			}
 
-			for containerStateID, containerConfig := range d.runningContainerState {
-				if _, ok := runningContainersID[containerStateID]; !ok {
-					deleteChan <- containerConfig
-				}
-			}
+			eventsChan, errChan = newEvents, newErr
 
-			d.logger.Tracef("Reading logs from %d containers", len(d.runningContainerState))
-
-			ticker.Reset(d.CheckIntervalDuration)
+			d.logger.Info("Successfully reconnected to Docker events")
+			// We check containers after a reconnection because the docker daemon might have restarted
+			// and the container tombs may have self deleted
+			if err := d.checkContainers(ctx, monitChan, deleteChan); err != nil {
+				d.logger.Warnf("Failed to check containers: %v", err)
+			}
 		}
 	}
 }
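
For reference, a minimal standalone sketch of the Docker SDK event subscription this patch builds on. The SDK calls (`filters.NewArgs`, `Events` with `events.ListOptions`, the `ActionStart`/`ActionDie` constants) mirror the ones in the diff; the `main` function, the client options, and the print statements are purely illustrative and assume a reachable Docker daemon and a recent enough SDK.

```go
package main

import (
	"context"
	"fmt"

	"github.com/docker/docker/api/types/events"
	"github.com/docker/docker/api/types/filters"
	"github.com/docker/docker/client"
)

func main() {
	ctx := context.Background()

	// Honour DOCKER_HOST etc. and negotiate the API version with the daemon.
	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		panic(err)
	}

	// Only container events, as in the patch.
	f := filters.NewArgs()
	f.Add("type", "container")

	eventsChan, errChan := cli.Events(ctx, events.ListOptions{Filters: f})

	for {
		select {
		case ev := <-eventsChan:
			// The patch only reacts to start/die; everything else is ignored.
			if ev.Action == events.ActionStart || ev.Action == events.ActionDie {
				fmt.Printf("container %s: %s\n", ev.Actor.ID, ev.Action)
			}
		case err := <-errChan:
			// A non-nil error means the stream is broken and must be re-established.
			fmt.Println("event stream error:", err)
			return
		}
	}
}
```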