Skip to content

Commit 065bc90

Browse files
committed
fix(inputs.docker): use startup-error-behavior framework for connection failures
Implement the startup-error-behavior framework (TSD-006) to handle Docker daemon unavailability during startup. This allows users to configure the behavior on connection failure via the startup_error_behavior option (error, retry, ignore, probe), instead of the plugin merely logging a warning and deferring the connection to the first Gather.
1 parent ec111c6 commit 065bc90

File tree

3 files changed

+103
-54
lines changed

3 files changed

+103
-54
lines changed

plugins/inputs/docker/client.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,15 @@ import (
1212
"github.com/docker/docker/client"
1313
)
1414

15+
// IsErrConnectionFailed returns true if the error is caused by connection failure.
16+
// This is a passthrough to the docker client library function.
17+
var IsErrConnectionFailed = client.IsErrConnectionFailed
18+
1519
var (
1620
defaultHeaders = map[string]string{"User-Agent": "engine-api-cli-1.0"}
1721
)
1822

23+
//nolint:interfacebloat // wrapping upstream docker client which has many methods
1924
type dockerClient interface {
2025
// Info retrieves system-wide information about the Docker server.
2126
Info(ctx context.Context) (system.Info, error)
@@ -35,6 +40,8 @@ type dockerClient interface {
3540
DiskUsage(ctx context.Context, options types.DiskUsageOptions) (types.DiskUsage, error)
3641
// ClientVersion retrieves the version of the Docker client.
3742
ClientVersion() string
43+
// Ping pings the server and returns information about the server.
44+
Ping(ctx context.Context) (types.Ping, error)
3845
// Close releases any resources held by the client.
3946
Close() error
4047
}
@@ -114,6 +121,11 @@ func (c *socketClient) ClientVersion() string {
114121
return c.client.ClientVersion()
115122
}
116123

124+
// Ping pings the server and returns information about the server.
125+
func (c *socketClient) Ping(ctx context.Context) (types.Ping, error) {
126+
return c.client.Ping(ctx)
127+
}
128+
117129
// Close releases any resources held by the client.
118130
func (c *socketClient) Close() error {
119131
return c.client.Close()

plugins/inputs/docker/docker.go

Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/influxdata/telegraf"
2626
"github.com/influxdata/telegraf/config"
2727
"github.com/influxdata/telegraf/filter"
28+
"github.com/influxdata/telegraf/internal"
2829
"github.com/influxdata/telegraf/internal/choice"
2930
"github.com/influxdata/telegraf/internal/docker"
3031
docker_stats "github.com/influxdata/telegraf/plugins/common/docker"
@@ -143,25 +144,29 @@ func (d *Docker) Init() error {
143144
}
144145

145146
func (d *Docker) Start(telegraf.Accumulator) error {
146-
// Attempt initial connection but don't fail if Docker is unavailable.
147-
// This preserves backwards compatibility where Telegraf starts even when
148-
// Docker daemon is not running.
149-
if err := d.initClient(); err != nil {
150-
d.Log.Warnf("Failed to connect to Docker daemon during startup: %v. Will retry on first gather.", err)
151-
}
152-
return nil
153-
}
154-
155-
// initClient initializes the Docker client and performs Podman detection.
156-
// Returns an error if the connection fails, but does not prevent Telegraf from starting.
157-
func (d *Docker) initClient() error {
158147
// Get client
159148
c, err := d.getNewClient()
160149
if err != nil {
161-
return err
150+
return &internal.StartupError{
151+
Err: fmt.Errorf("failed to create Docker client: %w", err),
152+
Retry: IsErrConnectionFailed(err),
153+
}
162154
}
163155
d.client = c
164156

157+
// Use Ping to check connectivity; it is a lightweight check.
158+
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(d.Timeout))
159+
_, err = d.client.Ping(ctx)
160+
cancel()
161+
if err != nil {
162+
d.client.Close()
163+
d.client = nil
164+
return &internal.StartupError{
165+
Err: fmt.Errorf("failed to ping Docker daemon: %w", err),
166+
Retry: IsErrConnectionFailed(err),
167+
}
168+
}
169+
165170
// Check API version compatibility
166171
version, err := semver.NewVersion(d.client.ClientVersion())
167172
if err != nil {
@@ -179,14 +184,17 @@ func (d *Docker) initClient() error {
179184
}
180185

181186
// Get info from docker daemon for Podman detection
182-
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(d.Timeout))
187+
ctx, cancel = context.WithTimeout(context.Background(), time.Duration(d.Timeout))
183188
defer cancel()
184189

185190
info, err := d.client.Info(ctx)
186191
if err != nil {
187192
d.client.Close()
188193
d.client = nil
189-
return fmt.Errorf("failed to get Docker info: %w", err)
194+
return &internal.StartupError{
195+
Err: fmt.Errorf("failed to get Docker info: %w", err),
196+
Retry: IsErrConnectionFailed(err),
197+
}
190198
}
191199

192200
d.engineHost = info.Name
@@ -213,11 +221,8 @@ func (d *Docker) Stop() {
213221
}
214222

215223
func (d *Docker) Gather(acc telegraf.Accumulator) error {
216-
// If client is not initialized, try to connect now
217224
if d.client == nil {
218-
if err := d.initClient(); err != nil {
219-
return fmt.Errorf("failed to connect to Docker daemon: %w", err)
220-
}
225+
return errors.New("docker client not initialized")
221226
}
222227

223228
// Create label filters if not already created

plugins/inputs/docker/docker_test.go

Lines changed: 67 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919

2020
"github.com/influxdata/telegraf"
2121
"github.com/influxdata/telegraf/config"
22+
"github.com/influxdata/telegraf/internal"
2223
"github.com/influxdata/telegraf/internal/choice"
2324
"github.com/influxdata/telegraf/testutil"
2425
)
@@ -33,6 +34,7 @@ type mockClient struct {
3334
NodeListF func() ([]swarm.Node, error)
3435
DiskUsageF func() (types.DiskUsage, error)
3536
ClientVersionF func() string
37+
PingF func() (types.Ping, error)
3638
CloseF func() error
3739
}
3840

@@ -72,6 +74,10 @@ func (c *mockClient) ClientVersion() string {
7274
return c.ClientVersionF()
7375
}
7476

77+
func (c *mockClient) Ping(context.Context) (types.Ping, error) {
78+
return c.PingF()
79+
}
80+
7581
func (c *mockClient) Close() error {
7682
return c.CloseF()
7783
}
@@ -104,6 +110,9 @@ var baseClient = mockClient{
104110
ClientVersionF: func() string {
105111
return version
106112
},
113+
PingF: func() (types.Ping, error) {
114+
return types.Ping{}, nil
115+
},
107116
CloseF: func() error {
108117
return nil
109118
},
@@ -422,7 +431,8 @@ func TestDocker_WindowsMemoryContainerStats(t *testing.T) {
422431
var acc testutil.Accumulator
423432

424433
d := Docker{
425-
Log: testutil.Logger{},
434+
Log: testutil.Logger{},
435+
Timeout: config.Duration(5 * time.Second),
426436
newClient: func(string, *tls.Config) (dockerClient, error) {
427437
return &mockClient{
428438
InfoF: func() (system.Info, error) {
@@ -452,6 +462,9 @@ func TestDocker_WindowsMemoryContainerStats(t *testing.T) {
452462
ClientVersionF: func() string {
453463
return version
454464
},
465+
PingF: func() (types.Ping, error) {
466+
return types.Ping{}, nil
467+
},
455468
CloseF: func() error {
456469
return nil
457470
},
@@ -1694,6 +1707,7 @@ func TestPodmanDetection(t *testing.T) {
16941707
var acc testutil.Accumulator
16951708
d := Docker{
16961709
Endpoint: tt.endpoint,
1710+
Timeout: config.Duration(5 * time.Second),
16971711
newClient: func(string, *tls.Config) (dockerClient, error) {
16981712
return &mockClient{
16991713
InfoF: func() (system.Info, error) {
@@ -1712,6 +1726,9 @@ func TestPodmanDetection(t *testing.T) {
17121726
ClientVersionF: func() string {
17131727
return "1.24.0"
17141728
},
1729+
PingF: func() (types.Ping, error) {
1730+
return types.Ping{}, nil
1731+
},
17151732
CloseF: func() error {
17161733
return nil
17171734
},
@@ -1777,8 +1794,7 @@ func TestPodmanStatsCache(t *testing.T) {
17771794
}
17781795

17791796
func TestStartWithUnavailableDocker(t *testing.T) {
1780-
// Test that Telegraf starts successfully even when Docker is unavailable
1781-
// This is a regression test for https://github.com/influxdata/telegraf/issues/18089
1797+
// Test that Start returns a StartupError when the Docker client cannot be created
17821798
var acc testutil.Accumulator
17831799
d := Docker{
17841800
Log: testutil.Logger{},
@@ -1791,42 +1807,70 @@ func TestStartWithUnavailableDocker(t *testing.T) {
17911807
}
17921808

17931809
require.NoError(t, d.Init())
1794-
// Start should NOT return an error even when Docker is unavailable
1795-
require.NoError(t, d.Start(&acc))
1796-
// Client should be nil since connection failed
1797-
require.Nil(t, d.client)
17981810

1799-
// Gather should return an error since Docker is still unavailable
1800-
err := d.Gather(&acc)
1811+
// Start should return a StartupError when Docker is unavailable
1812+
err := d.Start(&acc)
18011813
require.Error(t, err)
1802-
require.Contains(t, err.Error(), "failed to connect to Docker daemon")
1814+
1815+
var startupErr *internal.StartupError
1816+
require.ErrorAs(t, err, &startupErr)
1817+
require.Contains(t, startupErr.Error(), "failed to create Docker client")
18031818
}
18041819

1805-
func TestLazyClientInitialization(t *testing.T) {
1806-
// Test that client is initialized lazily on first Gather if Start failed to connect
1820+
func TestStartWithPingFailure(t *testing.T) {
1821+
// Test that Start returns a StartupError and releases the client when Ping fails
18071822
var acc testutil.Accumulator
18081823

1809-
// Track connection attempts
1810-
connectionAttempts := 0
1824+
// Create a mock client that succeeds on creation but fails on Ping
1825+
d := Docker{
1826+
Log: testutil.Logger{},
1827+
Timeout: config.Duration(5 * time.Second),
1828+
newClient: func(string, *tls.Config) (dockerClient, error) {
1829+
return &mockClient{
1830+
PingF: func() (types.Ping, error) {
1831+
return types.Ping{}, errors.New("connection refused")
1832+
},
1833+
CloseF: func() error {
1834+
return nil
1835+
},
1836+
}, nil
1837+
},
1838+
newEnvClient: func() (dockerClient, error) {
1839+
return nil, errors.New("not using env client")
1840+
},
1841+
}
1842+
1843+
require.NoError(t, d.Init())
1844+
1845+
// Start should return a StartupError when Ping fails
1846+
err := d.Start(&acc)
1847+
require.Error(t, err)
1848+
1849+
var startupErr *internal.StartupError
1850+
require.ErrorAs(t, err, &startupErr)
1851+
require.Contains(t, startupErr.Error(), "failed to ping Docker daemon")
1852+
// Client should be nil since we clean up on failure
1853+
require.Nil(t, d.client)
1854+
}
1855+
1856+
func TestStartSuccess(t *testing.T) {
1857+
// Test that Start succeeds when Docker is available
1858+
var acc testutil.Accumulator
18111859

18121860
d := Docker{
1813-
Log: testutil.Logger{},
1861+
Log: testutil.Logger{},
1862+
Timeout: config.Duration(5 * time.Second),
18141863
newClient: func(string, *tls.Config) (dockerClient, error) {
1815-
connectionAttempts++
1816-
// First attempt fails, subsequent attempts succeed
1817-
if connectionAttempts == 1 {
1818-
return nil, errors.New("docker daemon not ready")
1819-
}
18201864
return &mockClient{
1865+
PingF: func() (types.Ping, error) {
1866+
return types.Ping{}, nil
1867+
},
18211868
InfoF: func() (system.Info, error) {
18221869
return system.Info{
18231870
Name: "docker-desktop",
18241871
ServerVersion: "20.10.0",
18251872
}, nil
18261873
},
1827-
ContainerListF: func(container.ListOptions) ([]container.Summary, error) {
1828-
return nil, nil
1829-
},
18301874
ClientVersionF: func() string {
18311875
return "1.24.0"
18321876
},
@@ -1841,18 +1885,6 @@ func TestLazyClientInitialization(t *testing.T) {
18411885
}
18421886

18431887
require.NoError(t, d.Init())
1844-
// Start should succeed even though connection fails
18451888
require.NoError(t, d.Start(&acc))
1846-
require.Equal(t, 1, connectionAttempts)
1847-
require.Nil(t, d.client)
1848-
1849-
// First Gather fails because Docker is still unavailable (same mock returns error on attempt 1)
1850-
// Reset connection attempts to simulate Docker becoming available
1851-
connectionAttempts = 1 // Set to 1 so next attempt (2) will succeed
1852-
1853-
// Second Gather should succeed after lazy initialization
1854-
err := d.Gather(&acc)
1855-
require.NoError(t, err)
1856-
require.Equal(t, 2, connectionAttempts)
18571889
require.NotNil(t, d.client)
18581890
}

0 commit comments

Comments
 (0)