@@ -320,6 +320,9 @@ var (
320320 otelMu sync.Mutex
321321 otelStarted bool
322322
323+ // Current OTel config for health checks
324+ currentOTelConfig * OTelConfig
325+
323326 // Cached metrics for OTel callbacks
324327 cachedMetrics * AllMetrics
325328 cacheMu sync.RWMutex
@@ -382,11 +385,22 @@ func StartOTelCollector(cfg *OTelConfig) error {
382385 "Authorization" : "Bearer " + cfg .AuthToken ,
383386 "X-CatOps-Server-ID" : cfg .ServerID ,
384387 }),
388+ // Retry configuration for resilience
389+ otlpmetrichttp .WithRetry (otlpmetrichttp.RetryConfig {
390+ Enabled : true ,
391+ InitialInterval : 5 * time .Second ,
392+ MaxInterval : 30 * time .Second ,
393+ MaxElapsedTime : 2 * time .Minute ,
394+ }),
395+ otlpmetrichttp .WithTimeout (30 * time .Second ),
385396 )
386397 if err != nil {
387398 return fmt .Errorf ("failed to create OTLP exporter: %w" , err )
388399 }
389400
401+ // Store config for health checks
402+ currentOTelConfig = cfg
403+
390404 hostname := cfg .Hostname
391405 if hostname == "" {
392406 hostname , _ = os .Hostname ()
@@ -467,6 +481,40 @@ func ForceFlush() error {
467481 return meterProvider .ForceFlush (ctx )
468482}
469483
484+ // FlushWithLogging flushes metrics and logs the result
485+ // Returns true if flush was successful
486+ func FlushWithLogging () bool {
487+ err := ForceFlush ()
488+ if err != nil {
489+ // Log error but don't crash - will retry on next interval
490+ fmt .Fprintf (os .Stderr , "[%s] ERROR: Failed to flush metrics to backend: %v\n " ,
491+ time .Now ().Format ("2006-01-02 15:04:05" ), err )
492+ return false
493+ }
494+ return true
495+ }
496+
497+ // CheckOTelHealth verifies the OTLP exporter can reach the backend
498+ // Returns nil if healthy, error otherwise
499+ func CheckOTelHealth () error {
500+ otelMu .Lock ()
501+ defer otelMu .Unlock ()
502+
503+ if ! otelStarted || meterProvider == nil {
504+ return fmt .Errorf ("OTLP exporter not started" )
505+ }
506+
507+ if currentOTelConfig == nil {
508+ return fmt .Errorf ("OTLP config not available" )
509+ }
510+
511+ // Try to flush - if it fails, connection is unhealthy
512+ ctx , cancel := context .WithTimeout (context .Background (), 15 * time .Second )
513+ defer cancel ()
514+
515+ return meterProvider .ForceFlush (ctx )
516+ }
517+
470518// =============================================================================
471519// OTel Metrics Registration
472520// =============================================================================
0 commit comments