Skip to content

Commit 36c48fd

Browse files
committed
fix: add OTLP retry and health check logging
- Add retry config for the OTLP HTTP exporter (5s-30s backoff, 2 min max elapsed time)
- Add a 30s timeout for OTLP requests
- Add a health check every 5 min, with logging on failure
- Add the FlushWithLogging helper function

This should fix silent failures when sending metrics.
1 parent 76440a8 commit 36c48fd

File tree

2 files changed

+55
-0
lines changed

2 files changed

+55
-0
lines changed

internal/commands/daemon.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,13 @@ func runDaemon() {
160160
runtime.NumGoroutine(),
161161
float64(memStats.Alloc)/1024/1024)
162162

163+
// Check OTLP connection health and force flush
164+
if metricsStarted {
165+
if err := metrics.CheckOTelHealth(); err != nil {
166+
logger.Warning("OTLP health check failed: %v - metrics may not be sending", err)
167+
}
168+
}
169+
163170
// Ping systemd watchdog (keeps service alive)
164171
service.NotifyWatchdog()
165172

internal/metrics/collector.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,9 @@ var (
320320
otelMu sync.Mutex
321321
otelStarted bool
322322

323+
// Current OTel config for health checks
324+
currentOTelConfig *OTelConfig
325+
323326
// Cached metrics for OTel callbacks
324327
cachedMetrics *AllMetrics
325328
cacheMu sync.RWMutex
@@ -382,11 +385,22 @@ func StartOTelCollector(cfg *OTelConfig) error {
382385
"Authorization": "Bearer " + cfg.AuthToken,
383386
"X-CatOps-Server-ID": cfg.ServerID,
384387
}),
388+
// Retry configuration for resilience
389+
otlpmetrichttp.WithRetry(otlpmetrichttp.RetryConfig{
390+
Enabled: true,
391+
InitialInterval: 5 * time.Second,
392+
MaxInterval: 30 * time.Second,
393+
MaxElapsedTime: 2 * time.Minute,
394+
}),
395+
otlpmetrichttp.WithTimeout(30*time.Second),
385396
)
386397
if err != nil {
387398
return fmt.Errorf("failed to create OTLP exporter: %w", err)
388399
}
389400

401+
// Store config for health checks
402+
currentOTelConfig = cfg
403+
390404
hostname := cfg.Hostname
391405
if hostname == "" {
392406
hostname, _ = os.Hostname()
@@ -467,6 +481,40 @@ func ForceFlush() error {
467481
return meterProvider.ForceFlush(ctx)
468482
}
469483

484+
// FlushWithLogging flushes metrics and logs the result
485+
// Returns true if flush was successful
486+
func FlushWithLogging() bool {
487+
err := ForceFlush()
488+
if err != nil {
489+
// Log error but don't crash - will retry on next interval
490+
fmt.Fprintf(os.Stderr, "[%s] ERROR: Failed to flush metrics to backend: %v\n",
491+
time.Now().Format("2006-01-02 15:04:05"), err)
492+
return false
493+
}
494+
return true
495+
}
496+
497+
// CheckOTelHealth verifies the OTLP exporter can reach the backend
498+
// Returns nil if healthy, error otherwise
499+
func CheckOTelHealth() error {
500+
otelMu.Lock()
501+
defer otelMu.Unlock()
502+
503+
if !otelStarted || meterProvider == nil {
504+
return fmt.Errorf("OTLP exporter not started")
505+
}
506+
507+
if currentOTelConfig == nil {
508+
return fmt.Errorf("OTLP config not available")
509+
}
510+
511+
// Try to flush - if it fails, connection is unhealthy
512+
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
513+
defer cancel()
514+
515+
return meterProvider.ForceFlush(ctx)
516+
}
517+
470518
// =============================================================================
471519
// OTel Metrics Registration
472520
// =============================================================================

0 commit comments

Comments
 (0)