@@ -13,23 +13,43 @@ import (
 	"fmt"
 	"time"
 
-	"github.com/DataDog/datadog-agent/pkg/logs/metrics"
+	"github.com/DataDog/datadog-agent/comp/logs/agent/config"
+	logsmetrics "github.com/DataDog/datadog-agent/pkg/logs/metrics"
 	"github.com/DataDog/datadog-agent/pkg/logs/status"
+	"github.com/DataDog/datadog-agent/pkg/util/backoff"
 	"github.com/DataDog/datadog-agent/pkg/util/startstop"
 )
 
-// restart conducts a partial restart of the logs-agent pipeline.
-// This is used to switch between transport protocols
-// without disrupting the entire agent.
-func (a *logAgent) restart(context.Context) error {
+const (
+	// Transport types for telemetry
+	transportTCP  = "tcp"
+	transportHTTP = "http"
+
+	// Restart status for telemetry
+	restartStatusSuccess = "success"
+	restartStatusFailure = "failure"
+)
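+
+// Together these constants form the (status, transport) tag pair recorded on
+// logsmetrics.TlmRestartAttempt, e.g. Inc(restartStatusFailure, transportHTTP).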
+
+// restart conducts a partial restart of the logs-agent pipeline with the provided endpoints.
+// This is used to switch between transport protocols without disrupting the entire agent.
+func (a *logAgent) restart(_ context.Context, newEndpoints *config.Endpoints) error {
 	a.log.Info("Attempting to restart logs-agent pipeline")
 
 	a.restartMutex.Lock()
 	defer a.restartMutex.Unlock()
 
+	// Store the current endpoints for rollback in case the restart fails
+	previousEndpoints := a.endpoints
+
+	// Determine the transport type for metrics
+	targetTransport := transportTCP
+	if newEndpoints.UseHTTP {
+		targetTransport = transportHTTP
+	}
+
 	a.log.Info("Gracefully stopping logs-agent")
 
-	timeout := time.Duration(a.config.GetInt("logs_config.stop_grace_period")) * time.Second
+	timeout := a.config.GetDuration("logs_config.stop_grace_period")
 	_, cancel := context.WithTimeout(context.Background(), timeout)
 	defer cancel()
 
@@ -39,24 +59,71 @@ func (a *logAgent) restart(context.Context) error {
 
 	a.log.Info("Re-starting logs-agent...")
 
-	endpoints, err := buildEndpoints(a.config)
+	a.endpoints = newEndpoints
+
+	err := a.setupAgentForRestart()
+	if err != nil {
+		message := fmt.Sprintf("Could not re-start logs-agent: %v", err)
+		a.log.Error(message)
+		a.log.Error("Attempting rollback to previous transport")
+		logsmetrics.TlmRestartAttempt.Inc(restartStatusFailure, targetTransport)
+		return a.rollbackToPreviousTransport(previousEndpoints)
+	}
+
+	a.restartPipeline()
+	logsmetrics.TlmRestartAttempt.Inc(restartStatusSuccess, targetTransport)
+	return nil
+}
+
+// restartWithHTTPUpgrade upgrades the logs-agent pipeline to HTTP transport.
+// This is called by the smart HTTP restart mechanism after HTTP connectivity has been verified.
+//
+// Since HTTP connectivity was verified before calling this function, we commit to HTTP
+// and will keep retrying HTTP even if the upgrade fails. If the restart fails, the base
+// restart() function will roll back to TCP temporarily, but this function returns an
+// error to trigger a retry, ensuring we eventually upgrade to HTTP since connectivity exists.
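+//
+// Flow: the caller verifies HTTP connectivity, then this function rebuilds the
+// HTTP endpoints and calls restart(); on failure restart() rolls back to TCP
+// and an error is returned so the caller retries the upgrade.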
+func (a *logAgent) restartWithHTTPUpgrade(ctx context.Context) error {
+	// Build HTTP endpoints since we already verified HTTP connectivity
+	endpoints, err := buildHTTPEndpointsForRestart(a.config)
 	if err != nil {
-		message := fmt.Sprintf("Invalid endpoints: %v", err)
+		message := fmt.Sprintf("Failed to build HTTP endpoints: %v", err)
 		status.AddGlobalError(invalidEndpoints, message)
+		a.log.Error(message)
+		logsmetrics.TlmRestartAttempt.Inc(restartStatusFailure, transportHTTP)
 		return errors.New(message)
 	}
 
-	a.endpoints = endpoints
+	err = a.restart(ctx, endpoints)
+	if err != nil {
+		// The restart failed (it may have rolled back to TCP to keep the agent functional).
+		// Since we verified HTTP connectivity, return an error to trigger a retry.
+		a.log.Warnf("HTTP upgrade attempt failed: %v - will retry on next attempt", err)
+		return fmt.Errorf("HTTP upgrade failed: %w", err)
+	}
+
+	a.log.Info("Successfully upgraded to HTTP transport")
+	return nil
+}
 
-	err = a.setupAgentForRestart()
+// rollbackToPreviousTransport attempts to restore the agent to its previous working state
+// after a failed transport switch. This ensures the agent keeps functioning
+// rather than being left in a broken state.
+func (a *logAgent) rollbackToPreviousTransport(previousEndpoints *config.Endpoints) error {
+	a.log.Warn("Rolling back to previous transport after failed restart")
+
+	a.endpoints = previousEndpoints
+
+	err := a.setupAgentForRestart()
 	if err != nil {
-		message := fmt.Sprintf("Could not re-start logs-agent: %v", err)
+		// This is a critical failure: we cannot recover
+		message := fmt.Sprintf("CRITICAL: Failed to roll back to previous transport: %v", err)
 		a.log.Error(message)
 		return errors.New(message)
 	}
 
 	a.restartPipeline()
-	return nil
+	a.log.Info("Successfully rolled back to previous transport")
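+	// Deliberately return a non-nil error even though the rollback itself
+	// succeeded: callers such as restartWithHTTPUpgrade treat the whole
+	// restart as failed, which keeps their retry logic firing.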
+	return errors.New("restart failed, rolled back to previous transport")
 }
 
 // setupAgentForRestart configures and rebuilds only the transient components during a restart.
@@ -76,7 +143,7 @@ func (a *logAgent) setupAgentForRestart() error {
 // Unlike startPipeline, this only starts the transient components (destinations, pipeline, launchers)
 // since persistent components (auditor, schedulers, diagnosticMessageReceiver) remain running.
 func (a *logAgent) restartPipeline() {
-	status.Init(a.started, a.endpoints, a.sources, a.tracker, metrics.LogsExpvars)
+	status.Init(a.started, a.endpoints, a.sources, a.tracker, logsmetrics.LogsExpvars)
 
 	starter := startstop.NewStarter(a.destinationsCtx, a.pipelineProvider, a.launchers)
 	starter.Start()
@@ -114,3 +181,115 @@ func (a *logAgent) partialStop() error {
 
 	return nil
 }
+
+// smartHTTPRestart initiates periodic HTTP connectivity checks with exponential backoff
+// to automatically upgrade from TCP to HTTP when connectivity is restored.
+// This only runs when TCP fallback occurred (not when [force_]use_tcp is configured).
+func (a *logAgent) smartHTTPRestart() {
+	// Check whether we are eligible for an HTTP retry
+	if config.ShouldUseTCP(a.config) {
+		return
+	}
+
+	a.httpRetryMutex.Lock()
+	// Cancel any existing loop to avoid leaks or duplicate retries
+	if a.httpRetryCancel != nil {
+		a.httpRetryCancel()
+	}
+	a.httpRetryCtx, a.httpRetryCancel = context.WithCancel(context.Background())
+	ctx := a.httpRetryCtx
+	a.httpRetryMutex.Unlock()
+
+	a.log.Info("Starting HTTP connectivity retry with exponential backoff")
+
+	// Start a background goroutine for periodic HTTP checks
+	go a.httpRetryLoop(ctx)
+}
+
+// httpRetryLoop runs periodic HTTP connectivity checks with exponential backoff.
+// It uses a backoff strategy similar to the TCP connection manager's:
+// exponential backoff with randomization over [2^(n-1), 2^n) seconds, capped at the configured max.
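+//
+// For example, with a base of one second and a growth factor of two (illustrative
+// values, not necessarily the configured defaults), attempt 1 waits ~[1,2)s,
+// attempt 2 ~[2,4)s, attempt 3 ~[4,8)s, and so on until the cap is reached.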
+func (a *logAgent) httpRetryLoop(ctx context.Context) {
+	maxRetryInterval := config.HTTPConnectivityRetryIntervalMax(a.config)
+	if maxRetryInterval.Seconds() <= 0 {
+		a.log.Warn("HTTP connectivity retry interval max set to 0 seconds, skipping HTTP connectivity retry")
+		return
+	}
+
+	endpoints, err := buildHTTPEndpointsForConnectivityCheck(a.config)
+	if err != nil {
+		a.log.Errorf("Failed to build HTTP endpoints: %v", err)
+		return
+	}
+
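+	// The arguments mirror the endpoint's backoff settings: growth factor, base
+	// interval, cap (in seconds), and the recovery interval/reset behavior that
+	// shrinks the backoff again after successful attempts.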
+	policy := backoff.NewExpBackoffPolicy(
+		endpoints.Main.BackoffFactor,
+		endpoints.Main.BackoffBase,
+		maxRetryInterval.Seconds(),
+		endpoints.Main.RecoveryInterval,
+		endpoints.Main.RecoveryReset,
+	)
+
+	attempt := 0
+	for {
+		// Compute the backoff interval, mirroring connection_manager.go
+		backoffDuration := policy.GetBackoffDuration(attempt)
+
+		a.log.Debugf("Next HTTP connectivity check in %v (attempt %d)", backoffDuration, attempt+1)
+
+		select {
+		case <-time.After(backoffDuration):
+			attempt++
+			a.log.Infof("Checking HTTP connectivity (attempt %d)", attempt)
+
+			if a.checkHTTPConnectivity() {
+				a.log.Info("HTTP connectivity restored - initiating upgrade to HTTP transport")
+
+				// Trigger the HTTP upgrade. Since HTTP connectivity is verified,
+				// we commit to HTTP and keep retrying if the upgrade fails.
+				if err := a.restartWithHTTPUpgrade(ctx); err != nil {
+					a.log.Errorf("HTTP upgrade failed: %v - will retry", err)
+					// Publish the retry failure metric
+					logsmetrics.TlmHTTPConnectivityRetryAttempt.Inc("failure")
+					// Continue retrying - HTTP is available, we want to use it
+					continue
+				}

+				// Publish the retry success metric
+				logsmetrics.TlmHTTPConnectivityRetryAttempt.Inc("success")
+				a.log.Info("Successfully upgraded to HTTP transport")
+				return
+			}
+
+			a.log.Debug("HTTP connectivity check failed - will retry")
+
+		case <-ctx.Done():
+			a.log.Debug("HTTP retry loop stopped")
+			return
+		}
+	}
+}
+
+// checkHTTPConnectivity tests whether the HTTP endpoints are reachable
+func (a *logAgent) checkHTTPConnectivity() bool {
+	endpoints, err := buildHTTPEndpointsForConnectivityCheck(a.config)
+	if err != nil {
+		a.log.Debugf("Failed to build HTTP endpoints for connectivity check: %v", err)
+		return false
+	}
+
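+	// Probe only the main endpoint; anything other than an explicit success is
+	// treated as HTTP still being unreachable.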
+	connectivity := checkHTTPConnectivityStatus(endpoints.Main, a.config)
+	return connectivity == config.HTTPConnectivitySuccess
+}
+
+// stopHTTPRetry stops the HTTP retry loop
+func (a *logAgent) stopHTTPRetry() {
+	a.httpRetryMutex.Lock()
+	defer a.httpRetryMutex.Unlock()
+
+	if a.httpRetryCancel != nil {
+		a.httpRetryCancel()
+		a.httpRetryCancel = nil
+		a.httpRetryCtx = nil
+	}
+}
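+
+// Expected lifecycle (call sites live elsewhere in this package): smartHTTPRestart
+// is kicked off once the agent has fallen back to TCP, and stopHTTPRetry is called
+// from the agent's stop path so the background goroutine does not outlive the agent.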