@@ -117,6 +117,10 @@ func runDaemon() {
117117 healthTicker := time .NewTicker (5 * time .Minute )
118118 defer healthTicker .Stop ()
119119
120+ // OTel failure tracking for recovery logic
121+ var consecutiveOTelFailures int
122+ const maxOTelFailuresBeforeRestart = 3
123+
120124 // Metrics collection ticker - must run BEFORE OTel SDK reads the cache
121125 // OTel SDK calls callbacks at CollectionInterval, we collect slightly faster
122126 metricsInterval := time .Duration (cfg .CollectionInterval ) * time .Second
@@ -147,8 +151,12 @@ func runDaemon() {
147151 case <- metricsTicker .C :
148152 // Collect metrics and update cache for OTel callbacks
149153 if metricsStarted {
150- if _ , err := metrics .CollectAllMetrics (); err != nil {
151- logger .Debug ("Metrics collection error: %v" , err )
154+ if m , err := metrics .CollectAllMetrics (); err != nil {
155+ logger .Warning ("Metrics collection error: %v" , err )
156+ } else if m != nil && m .Summary != nil {
157+ logger .Info ("[COLLECT] CPU: %.1f%%, Mem: %.1f%%, Disk: %.1f%%, Procs: %d, Containers: %d" ,
158+ m .Summary .CPUUsage , m .Summary .MemoryUsage , m .Summary .DiskUsage ,
159+ len (m .Processes ), len (m .Containers ))
152160 }
153161 }
154162
@@ -163,7 +171,43 @@ func runDaemon() {
163171 // Check OTLP connection health and force flush
164172 if metricsStarted {
165173 if err := metrics .CheckOTelHealth (); err != nil {
166- logger .Warning ("OTLP health check failed: %v - metrics may not be sending" , err )
174+ consecutiveOTelFailures ++
175+ logger .Warning ("OTLP health check failed (%d/%d): %v" ,
176+ consecutiveOTelFailures , maxOTelFailuresBeforeRestart , err )
177+
178+ // Recovery logic: restart OTel after consecutive failures
179+ if consecutiveOTelFailures >= maxOTelFailuresBeforeRestart {
180+ logger .Warning ("OTLP repeatedly failing, attempting restart..." )
181+
182+ // Stop current OTel collector
183+ if err := metrics .StopOTelCollector (); err != nil {
184+ logger .Warning ("Failed to stop OTel collector: %v" , err )
185+ }
186+
187+ // Wait before restart
188+ time .Sleep (5 * time .Second )
189+
190+ // Restart OTel collector
191+ metricsStarted = startMetricsCollection (cfg , hostname )
192+ if metricsStarted {
193+ logger .Info ("OTel collector restarted successfully" )
194+ // Collect and send initial metrics after restart
195+ if _ , err := metrics .CollectAllMetrics (); err == nil {
196+ metrics .ForceFlush ()
197+ }
198+ } else {
199+ logger .Error ("Failed to restart OTel collector" )
200+ }
201+
202+ consecutiveOTelFailures = 0
203+ }
204+ } else {
205+ // Reset failure counter on success
206+ if consecutiveOTelFailures > 0 {
207+ logger .Info ("[OTLP] Connection restored after %d failures" , consecutiveOTelFailures )
208+ }
209+ consecutiveOTelFailures = 0
210+ logger .Info ("[OTLP] Health check OK - metrics sending normally" )
167211 }
168212 }
169213
0 commit comments