
Commit 96db781

refactor: split collector.go into smaller modules + add OTel recovery
Refactoring:
- Split collector.go (2375 lines) into 5 focused modules:
  - types.go (325 lines) - all type definitions
  - otel.go (216 lines) - OTel setup functions
  - otel_registry.go (736 lines) - metrics registration
  - legacy.go (217 lines) - legacy API for UI compatibility
  - collector.go (944 lines) - only collection logic

Daemon improvements:
- Add OTel recovery logic (auto-restart after 3 consecutive failures)
- Add detailed logging for metrics collection
- Add OTLP health status logging
1 parent 36c48fd commit 96db781
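The refactor keeps the daemon's view of the metrics package small. Below is a minimal sketch of the surface that daemon.go relies on, inferred from the calls in the diff; only the identifiers (CollectAllMetrics, CheckOTelHealth, StopOTelCollector, ForceFlush, Summary, Processes, Containers) appear in the commit, while the signatures, struct fields, and stub bodies here are assumptions, not the actual contents of types.go or otel.go.

// Hypothetical sketch of the metrics package surface used by daemon.go.
// Signatures and struct layout are assumptions inferred from the diff.
package metrics

// Summary holds the headline gauges logged in the daemon's [COLLECT] line.
type Summary struct {
	CPUUsage    float64 // percent
	MemoryUsage float64 // percent
	DiskUsage   float64 // percent
}

// Process and Container are placeholders; the real definitions live in types.go.
type Process struct{}
type Container struct{}

// Metrics is the snapshot returned by CollectAllMetrics.
type Metrics struct {
	Summary    *Summary
	Processes  []Process
	Containers []Container
}

// CollectAllMetrics gathers a snapshot and updates the cache that the
// OTel SDK callbacks read from.
func CollectAllMetrics() (*Metrics, error) { return &Metrics{Summary: &Summary{}}, nil }

// CheckOTelHealth reports whether the OTLP exporter can still send.
func CheckOTelHealth() error { return nil }

// StopOTelCollector shuts down the running OTel pipeline.
func StopOTelCollector() error { return nil }

// ForceFlush pushes any buffered metrics to the OTLP endpoint immediately.
func ForceFlush() {}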

6 files changed: +1571 -1470 lines changed


internal/commands/daemon.go

Lines changed: 47 additions & 3 deletions
@@ -117,6 +117,10 @@ func runDaemon() {
 	healthTicker := time.NewTicker(5 * time.Minute)
 	defer healthTicker.Stop()
 
+	// OTel failure tracking for recovery logic
+	var consecutiveOTelFailures int
+	const maxOTelFailuresBeforeRestart = 3
+
 	// Metrics collection ticker - must run BEFORE OTel SDK reads the cache
 	// OTel SDK calls callbacks at CollectionInterval, we collect slightly faster
 	metricsInterval := time.Duration(cfg.CollectionInterval) * time.Second
@@ -147,8 +151,12 @@ func runDaemon() {
 		case <-metricsTicker.C:
 			// Collect metrics and update cache for OTel callbacks
 			if metricsStarted {
-				if _, err := metrics.CollectAllMetrics(); err != nil {
-					logger.Debug("Metrics collection error: %v", err)
+				if m, err := metrics.CollectAllMetrics(); err != nil {
+					logger.Warning("Metrics collection error: %v", err)
+				} else if m != nil && m.Summary != nil {
+					logger.Info("[COLLECT] CPU: %.1f%%, Mem: %.1f%%, Disk: %.1f%%, Procs: %d, Containers: %d",
+						m.Summary.CPUUsage, m.Summary.MemoryUsage, m.Summary.DiskUsage,
+						len(m.Processes), len(m.Containers))
 				}
 			}
 
@@ -163,7 +171,43 @@ func runDaemon() {
 			// Check OTLP connection health and force flush
 			if metricsStarted {
 				if err := metrics.CheckOTelHealth(); err != nil {
-					logger.Warning("OTLP health check failed: %v - metrics may not be sending", err)
+					consecutiveOTelFailures++
+					logger.Warning("OTLP health check failed (%d/%d): %v",
+						consecutiveOTelFailures, maxOTelFailuresBeforeRestart, err)
+
+					// Recovery logic: restart OTel after consecutive failures
+					if consecutiveOTelFailures >= maxOTelFailuresBeforeRestart {
+						logger.Warning("OTLP repeatedly failing, attempting restart...")
+
+						// Stop current OTel collector
+						if err := metrics.StopOTelCollector(); err != nil {
+							logger.Warning("Failed to stop OTel collector: %v", err)
+						}
+
+						// Wait before restart
+						time.Sleep(5 * time.Second)
+
+						// Restart OTel collector
+						metricsStarted = startMetricsCollection(cfg, hostname)
+						if metricsStarted {
+							logger.Info("OTel collector restarted successfully")
+							// Collect and send initial metrics after restart
+							if _, err := metrics.CollectAllMetrics(); err == nil {
+								metrics.ForceFlush()
+							}
+						} else {
+							logger.Error("Failed to restart OTel collector")
+						}
+
+						consecutiveOTelFailures = 0
+					}
+				} else {
+					// Reset failure counter on success
+					if consecutiveOTelFailures > 0 {
+						logger.Info("[OTLP] Connection restored after %d failures", consecutiveOTelFailures)
+					}
+					consecutiveOTelFailures = 0
+					logger.Info("[OTLP] Health check OK - metrics sending normally")
 				}
 			}
 
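The recovery logic above is inlined in the healthTicker case. As a reading aid only, here is the same restart-after-N-consecutive-failures pattern factored into a small helper; this is a hypothetical refactor, not code from the commit, and the otelFailureTracker type and its method names are invented.

// Hypothetical helper (not part of the commit) capturing the counter logic
// from daemon.go: count consecutive health-check failures and signal when a
// restart should be attempted.
package commands

type otelFailureTracker struct {
	consecutive int
	threshold   int // e.g. 3, matching maxOTelFailuresBeforeRestart
}

// RecordFailure increments the counter and reports whether the threshold has
// been reached, at which point the caller stops, waits, and restarts OTel.
func (t *otelFailureTracker) RecordFailure() (shouldRestart bool) {
	t.consecutive++
	return t.consecutive >= t.threshold
}

// RecordSuccess resets the counter and returns how many failures preceded the
// recovery, so the caller can log a "connection restored" message.
func (t *otelFailureTracker) RecordSuccess() (recoveredAfter int) {
	recoveredAfter = t.consecutive
	t.consecutive = 0
	return recoveredAfter
}

// Reset clears the counter after a restart attempt, matching the diff, where
// the count is zeroed whether or not the restart succeeded.
func (t *otelFailureTracker) Reset() { t.consecutive = 0 }

In the diff, the failure counter is reset after the restart attempt regardless of the outcome, and the healthTicker fires every 5 minutes, so three consecutive failed checks correspond to roughly 10-15 minutes of an unreachable OTLP endpoint before a restart is tried.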