Commit e02e8eb
fix: container logging deadlocks (testcontainers#2791)
Refactor container log handling, simplifying the logic and fixing various issues with error handling and race conditions among the complex combination of multiple channels that had been causing random deadlocks in tests. The new version is a simple for loop with an inner call to ContainerLogs and stdcopy.StdCopy, leveraging an adapter between io.Writer and LogConsumer. This adapter could also be used to easily expose separate stdout and stderr handlers.
1 parent 738e8fc commit e02e8eb
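
The heart of the refactor is the adapter mentioned above: an io.Writer that fans log chunks out to LogConsumer callbacks, so stdcopy.StdCopy can demultiplex the Docker log stream directly into consumers. Below is a minimal, self-contained sketch of the pattern; the Log and LogConsumer types and the printConsumer are simplified stand-ins for the testcontainers-go interfaces, and the plain writes stand in for the stdcopy.StdCopy(stdout, stderr, rc) call made over the reader returned by ContainerLogs.

package main

import (
    "fmt"
    "io"
)

// Log mirrors a demultiplexed log message: the stream it came from and its content.
type Log struct {
    LogType string
    Content []byte
}

// LogConsumer is anything that wants to receive log messages.
type LogConsumer interface {
    Accept(Log)
}

// logConsumerWriter adapts a slice of LogConsumers to io.Writer so it can be
// handed to stdcopy.StdCopy as the stdout or stderr destination.
type logConsumerWriter struct {
    log       Log
    consumers []LogConsumer
}

func newLogConsumerWriter(logType string, consumers []LogConsumer) *logConsumerWriter {
    return &logConsumerWriter{log: Log{LogType: logType}, consumers: consumers}
}

// Write fans each chunk out to all consumers and never reports an error,
// so the copy loop keeps draining the stream.
func (lw *logConsumerWriter) Write(p []byte) (int, error) {
    lw.log.Content = p
    for _, consumer := range lw.consumers {
        consumer.Accept(lw.log)
    }
    return len(p), nil
}

// printConsumer is a trivial LogConsumer for demonstration.
type printConsumer struct{}

func (printConsumer) Accept(l Log) { fmt.Printf("[%s] %s", l.LogType, l.Content) }

func main() {
    consumers := []LogConsumer{printConsumer{}}
    stdout := newLogConsumerWriter("STDOUT", consumers)
    stderr := newLogConsumerWriter("STDERR", consumers)

    _, _ = io.WriteString(stdout, "hello from stdout\n")
    _, _ = io.WriteString(stderr, "hello from stderr\n")
}

Because the adapter never returns an error, stdcopy.StdCopy keeps draining the stream, and the multiplexed-stream framing that the old hand-rolled header parsing could desynchronize is handled entirely by the Docker client library.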

File tree

4 files changed: +172, -151 lines

docker.go

Lines changed: 137 additions & 124 deletions
@@ -5,7 +5,6 @@ import (
     "bufio"
     "context"
     "encoding/base64"
-    "encoding/binary"
     "encoding/json"
     "errors"
     "fmt"
@@ -17,7 +16,6 @@ import (
     "path/filepath"
     "regexp"
     "strings"
-    "sync"
     "time"
 
     "github.com/cenkalti/backoff/v4"
@@ -30,6 +28,7 @@ import (
     "github.com/docker/docker/client"
     "github.com/docker/docker/errdefs"
     "github.com/docker/docker/pkg/jsonmessage"
+    "github.com/docker/docker/pkg/stdcopy"
    "github.com/docker/go-connections/nat"
     "github.com/moby/term"
     specs "github.com/opencontainers/image-spec/specs-go/v1"
@@ -48,11 +47,21 @@ const (
     Podman        = "podman"
     ReaperDefault = "reaper_default" // Default network name when bridge is not available
     packagePath   = "github.com/testcontainers/testcontainers-go"
-
-    logStoppedForOutOfSyncMessage = "Stopping log consumer: Headers out of sync"
 )
 
-var createContainerFailDueToNameConflictRegex = regexp.MustCompile("Conflict. The container name .* is already in use by container .*")
+var (
+    // createContainerFailDueToNameConflictRegex is a regular expression that matches the container is already in use error.
+    createContainerFailDueToNameConflictRegex = regexp.MustCompile("Conflict. The container name .* is already in use by container .*")
+
+    // minLogProductionTimeout is the minimum log production timeout.
+    minLogProductionTimeout = time.Duration(5 * time.Second)
+
+    // maxLogProductionTimeout is the maximum log production timeout.
+    maxLogProductionTimeout = time.Duration(60 * time.Second)
+
+    // errLogProductionStop is the cause for stopping log production.
+    errLogProductionStop = errors.New("log production stopped")
+)
 
 // DockerContainer represents a container started using Docker
 type DockerContainer struct {
@@ -65,23 +74,19 @@ type DockerContainer struct {
     isRunning     bool
     imageWasBuilt bool
     // keepBuiltImage makes Terminate not remove the image if imageWasBuilt.
-    keepBuiltImage     bool
-    provider           *DockerProvider
-    sessionID          string
-    terminationSignal  chan bool
-    consumers          []LogConsumer
-    logProductionError chan error
+    keepBuiltImage    bool
+    provider          *DockerProvider
+    sessionID         string
+    terminationSignal chan bool
+    consumers         []LogConsumer
 
     // TODO: Remove locking and wait group once the deprecated StartLogProducer and
     // StopLogProducer have been removed and hence logging can only be started and
     // stopped once.
 
-    // logProductionWaitGroup is used to signal when the log production has stopped.
-    // This allows stopLogProduction to safely set logProductionStop to nil.
-    // See simplification in https://go.dev/play/p/x0pOElF2Vjf
-    logProductionWaitGroup sync.WaitGroup
-
-    logProductionStop chan struct{}
+    // logProductionCancel is used to signal the log production to stop.
+    logProductionCancel context.CancelCauseFunc
+    logProductionCtx    context.Context
 
     logProductionTimeout *time.Duration
     logger               Logging
@@ -263,7 +268,6 @@ func (c *DockerContainer) Stop(ctx context.Context, timeout *time.Duration) error {
     // without exposing the ability to fully initialize the container state.
     // See: https://github.com/testcontainers/testcontainers-go/issues/2667
     // TODO: Add a check for isRunning when the above issue is resolved.
-
     err := c.stoppingHook(ctx)
     if err != nil {
         return fmt.Errorf("stopping hook: %w", err)
@@ -310,7 +314,7 @@ func (c *DockerContainer) Terminate(ctx context.Context) error {
     }
 
     select {
-    // close reaper if it was created
+    // Close reaper connection if it was attached.
     case c.terminationSignal <- true:
     default:
     }
@@ -690,6 +694,29 @@ func (c *DockerContainer) copyToContainer(ctx context.Context, fileContent func(
     return nil
 }
 
+// logConsumerWriter is a writer that writes to a LogConsumer.
+type logConsumerWriter struct {
+    log       Log
+    consumers []LogConsumer
+}
+
+// newLogConsumerWriter creates a new logConsumerWriter for logType that sends messages to all consumers.
+func newLogConsumerWriter(logType string, consumers []LogConsumer) *logConsumerWriter {
+    return &logConsumerWriter{
+        log:       Log{LogType: logType},
+        consumers: consumers,
+    }
+}
+
+// Write writes the p content to all consumers.
+func (lw logConsumerWriter) Write(p []byte) (int, error) {
+    lw.log.Content = p
+    for _, consumer := range lw.consumers {
+        consumer.Accept(lw.log)
+    }
+    return len(p), nil
+}
+
 type LogProductionOption func(*DockerContainer)
 
 // WithLogProductionTimeout is a functional option that sets the timeout for the log production.
@@ -707,124 +734,94 @@ func (c *DockerContainer) StartLogProducer(ctx context.Context, opts ...LogProductionOption) error {
 
 // startLogProduction will start a concurrent process that will continuously read logs
 // from the container and will send them to each added LogConsumer.
+//
 // Default log production timeout is 5s. It is used to set the context timeout
-// which means that each log-reading loop will last at least the specified timeout
-// and that it cannot be cancelled earlier.
+// which means that each log-reading loop will last at up to the specified timeout.
+//
 // Use functional option WithLogProductionTimeout() to override default timeout. If it's
 // lower than 5s and greater than 60s it will be set to 5s or 60s respectively.
 func (c *DockerContainer) startLogProduction(ctx context.Context, opts ...LogProductionOption) error {
-    c.logProductionStop = make(chan struct{}, 1) // buffered channel to avoid blocking
-    c.logProductionWaitGroup.Add(1)
-
     for _, opt := range opts {
         opt(c)
     }
 
-    minLogProductionTimeout := time.Duration(5 * time.Second)
-    maxLogProductionTimeout := time.Duration(60 * time.Second)
-
-    if c.logProductionTimeout == nil {
+    // Validate the log production timeout.
+    switch {
+    case c.logProductionTimeout == nil:
         c.logProductionTimeout = &minLogProductionTimeout
-    }
-
-    if *c.logProductionTimeout < minLogProductionTimeout {
+    case *c.logProductionTimeout < minLogProductionTimeout:
         c.logProductionTimeout = &minLogProductionTimeout
-    }
-
-    if *c.logProductionTimeout > maxLogProductionTimeout {
+    case *c.logProductionTimeout > maxLogProductionTimeout:
         c.logProductionTimeout = &maxLogProductionTimeout
     }
 
-    c.logProductionError = make(chan error, 1)
+    // Setup the log writers.
+    stdout := newLogConsumerWriter(StdoutLog, c.consumers)
+    stderr := newLogConsumerWriter(StderrLog, c.consumers)
+
+    // Setup the log production context which will be used to stop the log production.
+    c.logProductionCtx, c.logProductionCancel = context.WithCancelCause(ctx)
 
     go func() {
-        defer func() {
-            close(c.logProductionError)
-            c.logProductionWaitGroup.Done()
-        }()
-
-        since := ""
-        // if the socket is closed we will make additional logs request with updated Since timestamp
-    BEGIN:
-        options := container.LogsOptions{
-            ShowStdout: true,
-            ShowStderr: true,
-            Follow:     true,
-            Since:      since,
-        }
+        err := c.logProducer(stdout, stderr)
+        // Set context cancel cause, if not already set.
+        c.logProductionCancel(err)
+    }()
 
-        ctx, cancel := context.WithTimeout(ctx, *c.logProductionTimeout)
+    return nil
+}
+
+// logProducer read logs from the container and writes them to stdout, stderr until either:
+// - logProductionCtx is done
+// - A fatal error occurs
+// - No more logs are available
+func (c *DockerContainer) logProducer(stdout, stderr io.Writer) error {
+    // Clean up idle client connections.
+    defer c.provider.Close()
+
+    // Setup the log options, start from the beginning.
+    options := container.LogsOptions{
+        ShowStdout: true,
+        ShowStderr: true,
+        Follow:     true,
+    }
+
+    for {
+        timeoutCtx, cancel := context.WithTimeout(c.logProductionCtx, *c.logProductionTimeout)
         defer cancel()
 
-        r, err := c.provider.client.ContainerLogs(ctx, c.GetContainerID(), options)
-        if err != nil {
-            c.logProductionError <- err
-            return
+        err := c.copyLogs(timeoutCtx, stdout, stderr, options)
+        switch {
+        case err == nil:
+            // No more logs available.
+            return nil
+        case c.logProductionCtx.Err() != nil:
+            // Log production was stopped or caller context is done.
+            return nil
+        case timeoutCtx.Err() != nil, errors.Is(err, net.ErrClosed):
+            // Timeout or client connection closed, retry.
+        default:
+            // Unexpected error, retry.
+            Logger.Printf("Unexpected error reading logs: %v", err)
         }
-        defer c.provider.Close()
 
-        for {
-            select {
-            case <-c.logProductionStop:
-                c.logProductionError <- r.Close()
-                return
-            default:
-            }
-            h := make([]byte, 8)
-            _, err := io.ReadFull(r, h)
-            if err != nil {
-                switch {
-                case err == io.EOF:
-                    // No more logs coming
-                case errors.Is(err, net.ErrClosed):
-                    now := time.Now()
-                    since = fmt.Sprintf("%d.%09d", now.Unix(), int64(now.Nanosecond()))
-                    goto BEGIN
-                case errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled):
-                    // Probably safe to continue here
-                    continue
-                default:
-                    _, _ = fmt.Fprintf(os.Stderr, "container log error: %+v. %s", err, logStoppedForOutOfSyncMessage)
-                    // if we would continue here, the next header-read will result into random data...
-                }
-                return
-            }
-
-            count := binary.BigEndian.Uint32(h[4:])
-            if count == 0 {
-                continue
-            }
-            logType := h[0]
-            if logType > 2 {
-                _, _ = fmt.Fprintf(os.Stderr, "received invalid log type: %d", logType)
-                // sometimes docker returns logType = 3 which is an undocumented log type, so treat it as stdout
-                logType = 1
-            }
+        // Retry from the last log received.
+        now := time.Now()
+        options.Since = fmt.Sprintf("%d.%09d", now.Unix(), int64(now.Nanosecond()))
+    }
+}
 
-            // a map of the log type --> int representation in the header, notice the first is blank, this is stdin, but the go docker client doesn't allow following that in logs
-            logTypes := []string{"", StdoutLog, StderrLog}
+// copyLogs copies logs from the container to stdout and stderr.
+func (c *DockerContainer) copyLogs(ctx context.Context, stdout, stderr io.Writer, options container.LogsOptions) error {
+    rc, err := c.provider.client.ContainerLogs(ctx, c.GetContainerID(), options)
+    if err != nil {
+        return fmt.Errorf("container logs: %w", err)
+    }
+    defer rc.Close()
 
-            b := make([]byte, count)
-            _, err = io.ReadFull(r, b)
-            if err != nil {
-                // TODO: add-logger: use logger to log out this error
-                _, _ = fmt.Fprintf(os.Stderr, "error occurred reading log with known length %s", err.Error())
-                if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
-                    // Probably safe to continue here
-                    continue
-                }
-                // we can not continue here as the next read most likely will not be the next header
-                _, _ = fmt.Fprintln(os.Stderr, logStoppedForOutOfSyncMessage)
-                return
-            }
-            for _, c := range c.consumers {
-                c.Accept(Log{
-                    LogType: logTypes[logType],
-                    Content: b,
-                })
-            }
-        }
-    }()
+    if _, err = stdcopy.StdCopy(stdout, stderr, rc); err != nil {
+        return fmt.Errorf("stdcopy: %w", err)
+    }
 
     return nil
 }
@@ -837,18 +834,25 @@ func (c *DockerContainer) StopLogProducer() error {
 // stopLogProduction will stop the concurrent process that is reading logs
 // and sending them to each added LogConsumer
 func (c *DockerContainer) stopLogProduction() error {
-    // signal the log production to stop
-    c.logProductionStop <- struct{}{}
+    if c.logProductionCancel == nil {
+        return nil
+    }
 
-    c.logProductionWaitGroup.Wait()
+    // Signal the log production to stop.
+    c.logProductionCancel(errLogProductionStop)
 
-    if err := <-c.logProductionError; err != nil {
-        if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
-            // Returning context errors is not useful for the consumer.
+    if err := context.Cause(c.logProductionCtx); err != nil {
+        switch {
+        case errors.Is(err, errLogProductionStop):
+            // Log production was stopped.
             return nil
+        case errors.Is(err, context.DeadlineExceeded),
+            errors.Is(err, context.Canceled):
+            // Parent context is done.
+            return nil
+        default:
+            return err
         }
-
-        return err
     }
 
     return nil
@@ -857,7 +861,16 @@ func (c *DockerContainer) stopLogProduction() error {
 // GetLogProductionErrorChannel exposes the only way for the consumer
 // to be able to listen to errors and react to them.
 func (c *DockerContainer) GetLogProductionErrorChannel() <-chan error {
-    return c.logProductionError
+    if c.logProductionCtx == nil {
+        return nil
+    }
+
+    errCh := make(chan error, 1)
+    go func() {
+        <-c.logProductionCtx.Done()
+        errCh <- context.Cause(c.logProductionCtx)
+    }()
+    return errCh
 }
 
 // DockerNetwork represents a network started using Docker
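
For readers unfamiliar with it, the stop-and-error plumbing above relies on Go 1.20's context.WithCancelCause and context.Cause in place of the old channel-plus-WaitGroup handshake. A minimal sketch of that pattern, reusing the errLogProductionStop sentinel from the diff (the sleep and printed messages are illustrative only):

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

var errLogProductionStop = errors.New("log production stopped")

func main() {
    // Producer and stopper share one cancellable context.
    ctx, cancel := context.WithCancelCause(context.Background())

    go func() {
        // Simulated producer: it records why it finished by cancelling
        // with its exit error (a nil cause maps to context.Canceled).
        time.Sleep(10 * time.Millisecond)
        cancel(nil)
    }()

    // Stopping is just another cancellation. Only the first cause is
    // recorded, so stop and producer exit can race without deadlocking.
    cancel(errLogProductionStop)

    <-ctx.Done()
    if err := context.Cause(ctx); errors.Is(err, errLogProductionStop) {
        fmt.Println("stopped cleanly")
    } else {
        fmt.Println("producer finished:", err)
    }
}

Since every cancellation after the first is a no-op, stopLogProduction, a parent-context cancellation, and the producer goroutine exiting can all race freely, and whichever comes first records the cause; that property is what removes the blocking channel sends behind the random deadlocks.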
