Skip to content

Commit ec111c6

Browse files
committed
fix(inputs.docker): Allow Telegraf to start when Docker daemon is unavailable
This fixes a regression introduced in v1.36.3 where Telegraf would fail to start if the Docker/Podman socket was unavailable. The Start() method now logs a warning instead of returning a fatal error, and the client connection is retried lazily on each Gather() cycle.
1 parent c987240 commit ec111c6

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed

plugins/inputs/docker/docker.go

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,18 @@ func (d *Docker) Init() error {
143143
}
144144

145145
func (d *Docker) Start(telegraf.Accumulator) error {
146+
// Attempt initial connection but don't fail if Docker is unavailable.
147+
// This preserves backwards compatibility where Telegraf starts even when
148+
// Docker daemon is not running.
149+
if err := d.initClient(); err != nil {
150+
d.Log.Warnf("Failed to connect to Docker daemon during startup: %v. Will retry on first gather.", err)
151+
}
152+
return nil
153+
}
154+
155+
// initClient initializes the Docker client and performs Podman detection.
156+
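// Returns an error if the connection fails; when the version check or the Info call fails, the client is closed and reset to nil so that a later Gather can retry. Whether the error is fatal is up to the caller (Start only logs a warning, so Telegraf still starts).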
157+
func (d *Docker) initClient() error {
146158
// Get client
147159
c, err := d.getNewClient()
148160
if err != nil {
@@ -153,6 +165,8 @@ func (d *Docker) Start(telegraf.Accumulator) error {
153165
// Check API version compatibility
154166
version, err := semver.NewVersion(d.client.ClientVersion())
155167
if err != nil {
168+
d.client.Close()
169+
d.client = nil
156170
return fmt.Errorf("failed to parse client version: %w", err)
157171
}
158172

@@ -170,6 +184,8 @@ func (d *Docker) Start(telegraf.Accumulator) error {
170184

171185
info, err := d.client.Info(ctx)
172186
if err != nil {
187+
d.client.Close()
188+
d.client = nil
173189
return fmt.Errorf("failed to get Docker info: %w", err)
174190
}
175191

@@ -197,6 +213,13 @@ func (d *Docker) Stop() {
197213
}
198214

199215
func (d *Docker) Gather(acc telegraf.Accumulator) error {
216+
// If client is not initialized, try to connect now
217+
if d.client == nil {
218+
if err := d.initClient(); err != nil {
219+
return fmt.Errorf("failed to connect to Docker daemon: %w", err)
220+
}
221+
}
222+
200223
// Create label filters if not already created
201224
if !d.filtersCreated {
202225
err := d.createLabelFilters()

plugins/inputs/docker/docker_test.go

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package docker
33
import (
44
"context"
55
"crypto/tls"
6+
"errors"
67
"io"
78
"reflect"
89
"sort"
@@ -1774,3 +1775,84 @@ func TestPodmanStatsCache(t *testing.T) {
17741775
require.NotContains(t, d.statsCache, "old-container")
17751776
require.Contains(t, d.statsCache, testID)
17761777
}
1778+
1779+
func TestStartWithUnavailableDocker(t *testing.T) {
1780+
// Test that Telegraf starts successfully even when Docker is unavailable
1781+
// This is a regression test for https://github.com/influxdata/telegraf/issues/18089
1782+
var acc testutil.Accumulator
1783+
d := Docker{
1784+
Log: testutil.Logger{},
1785+
newClient: func(string, *tls.Config) (dockerClient, error) {
1786+
return nil, errors.New("cannot connect to the Docker daemon")
1787+
},
1788+
newEnvClient: func() (dockerClient, error) {
1789+
return nil, errors.New("cannot connect to the Docker daemon")
1790+
},
1791+
}
1792+
1793+
require.NoError(t, d.Init())
1794+
// Start should NOT return an error even when Docker is unavailable
1795+
require.NoError(t, d.Start(&acc))
1796+
// Client should be nil since connection failed
1797+
require.Nil(t, d.client)
1798+
1799+
// Gather should return an error since Docker is still unavailable
1800+
err := d.Gather(&acc)
1801+
require.Error(t, err)
1802+
require.Contains(t, err.Error(), "failed to connect to Docker daemon")
1803+
}
1804+
1805+
func TestLazyClientInitialization(t *testing.T) {
1806+
// Test that client is initialized lazily on first Gather if Start failed to connect
1807+
var acc testutil.Accumulator
1808+
1809+
// Track connection attempts
1810+
connectionAttempts := 0
1811+
1812+
d := Docker{
1813+
Log: testutil.Logger{},
1814+
newClient: func(string, *tls.Config) (dockerClient, error) {
1815+
connectionAttempts++
1816+
// First attempt fails, subsequent attempts succeed
1817+
if connectionAttempts == 1 {
1818+
return nil, errors.New("docker daemon not ready")
1819+
}
1820+
return &mockClient{
1821+
InfoF: func() (system.Info, error) {
1822+
return system.Info{
1823+
Name: "docker-desktop",
1824+
ServerVersion: "20.10.0",
1825+
}, nil
1826+
},
1827+
ContainerListF: func(container.ListOptions) ([]container.Summary, error) {
1828+
return nil, nil
1829+
},
1830+
ClientVersionF: func() string {
1831+
return "1.24.0"
1832+
},
1833+
CloseF: func() error {
1834+
return nil
1835+
},
1836+
}, nil
1837+
},
1838+
newEnvClient: func() (dockerClient, error) {
1839+
return nil, errors.New("not using env client")
1840+
},
1841+
}
1842+
1843+
require.NoError(t, d.Init())
1844+
// Start should succeed even though connection fails
1845+
require.NoError(t, d.Start(&acc))
1846+
require.Equal(t, 1, connectionAttempts)
1847+
require.Nil(t, d.client)
1848+
1849+
// No Gather has been called yet: only Start has attempted a connection (attempt 1), and it failed
1850+
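// connectionAttempts is already 1 here; the assignment below only makes explicit that the next attempt (2) is the first to succeed, simulating Docker becoming available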
1851+
connectionAttempts = 1 // Set to 1 so next attempt (2) will succeed
1852+
1853+
// The first Gather should succeed by lazily initializing the client
1854+
err := d.Gather(&acc)
1855+
require.NoError(t, err)
1856+
require.Equal(t, 2, connectionAttempts)
1857+
require.NotNil(t, d.client)
1858+
}

0 commit comments

Comments
 (0)