@@ -44,9 +44,9 @@ type PromConfig struct {
4444func DefaultPromConfig () * PromConfig {
4545 return & PromConfig {
4646 Enabled : true ,
47- ScrapeInterval : 5 * time .Second ,
48- RetentionTime : 1 * time .Hour ,
49- MaxSamples : 10000 ,
47+ ScrapeInterval : 4 * time .Second ,
48+ RetentionTime : 5 * time .Minute ,
49+ MaxSamples : 50 ,
5050 Components : []prom.ComponentType {
5151 prom .ComponentKubelet ,
5252 prom .ComponentCAdvisor ,
@@ -444,26 +444,26 @@ func (p *PromMetricsSource) GetMetricsForPod(ctx context.Context, pod metav1.Obj
444444
445445// GetAllPodMetrics retrieves metrics for all pods
446446func (p * PromMetricsSource ) GetAllPodMetrics (ctx context.Context ) ([]* metrics.PodMetrics , error ) {
447+ // Check health and get store reference under lock
447448 p .mu .RLock ()
448- defer p .mu .RUnlock ()
449-
450449 if ! p .isHealthyLocked () {
450+ p .mu .RUnlock ()
451451 return nil , fmt .Errorf ("prometheus source is not healthy" )
452452 }
453-
454453 if p .store == nil {
454+ p .mu .RUnlock ()
455455 return nil , fmt .Errorf ("metrics store not initialized" )
456456 }
457+ store := p .store
458+ p .mu .RUnlock ()
457459
458- // Get all unique pod/namespace combinations from labels
459- // This requires querying the store for label values
460- namespaces := p .store .GetLabelValues ("namespace" )
461- pods := p .store .GetLabelValues ("pod" )
460+ // Get label values (store has its own lock)
461+ namespaces := store .GetLabelValues ("namespace" )
462+ pods := store .GetLabelValues ("pod" )
462463
463464 var allPodMetrics []* metrics.PodMetrics
464465
465- // This is a simplified implementation - in production would need better logic
466- // to match pods with their namespaces
466+ // Call GetPodMetrics without holding p.mu to avoid deadlock
467467 for _ , namespace := range namespaces {
468468 for _ , pod := range pods {
469469 if podMetrics , err := p .GetPodMetrics (ctx , namespace , pod ); err == nil {
@@ -539,12 +539,8 @@ func (p *PromMetricsSource) SetHealthCallback(callback func(healthy bool, info m
539539}
540540
541541// buildSourceInfoLocked builds SourceInfo while holding the lock.
542- // Must be called with p.mu held.
543- func (p * PromMetricsSource ) buildSourceInfoLocked (healthy bool ) metrics.SourceInfo {
544- metricCount := 0
545- if p .store != nil {
546- metricCount = len (p .store .GetMetricNames ())
547- }
542+ // Must be called with p.mu held. The caller must compute metricCount before acquiring p.mu, to avoid nested locking.
543+ func (p * PromMetricsSource ) buildSourceInfoLocked (healthy bool , metricCount int ) metrics.SourceInfo {
548544 return metrics.SourceInfo {
549545 Type : metrics .SourceTypePrometheus ,
550546 Version : "v1.0.0" ,
@@ -557,6 +553,12 @@ func (p *PromMetricsSource) buildSourceInfoLocked(healthy bool) metrics.SourceIn
557553
558554// handleError is called when an error occurs during metrics collection
559555func (p * PromMetricsSource ) handleError (component prom.ComponentType , err error ) {
556+ // Get metric count BEFORE acquiring lock to avoid nested locking
557+ metricCount := 0
558+ if p .store != nil {
559+ metricCount = len (p .store .GetMetricNames ())
560+ }
561+
560562 var shouldNotify bool
561563 var info metrics.SourceInfo
562564
@@ -572,7 +574,7 @@ func (p *PromMetricsSource) handleError(component prom.ComponentType, err error)
572574 // Check if we need to notify (overall state changed from healthy to unhealthy)
573575 if wasHealthy && ! nowHealthy {
574576 shouldNotify = true
575- info = p .buildSourceInfoLocked (false )
577+ info = p .buildSourceInfoLocked (false , metricCount )
576578 }
577579 p .mu .Unlock ()
578580
@@ -584,6 +586,12 @@ func (p *PromMetricsSource) handleError(component prom.ComponentType, err error)
584586
585587// handleMetricsCollected is called when metrics are successfully collected
586588func (p * PromMetricsSource ) handleMetricsCollected (component prom.ComponentType , collectedMetrics * prom.ScrapedMetrics ) {
589+ // Get metric count BEFORE acquiring lock to avoid nested locking
590+ metricCount := 0
591+ if p .store != nil {
592+ metricCount = len (p .store .GetMetricNames ())
593+ }
594+
587595 var shouldNotify bool
588596 var info metrics.SourceInfo
589597
@@ -604,7 +612,7 @@ func (p *PromMetricsSource) handleMetricsCollected(component prom.ComponentType,
604612 // Check if we need to notify (overall state changed from unhealthy to healthy)
605613 if ! wasHealthy && nowHealthy {
606614 shouldNotify = true
607- info = p .buildSourceInfoLocked (true )
615+ info = p .buildSourceInfoLocked (true , metricCount )
608616 }
609617 p .mu .Unlock ()
610618
0 commit comments