45
45
defaultCachePeriod = 10 * time .Minute
46
46
)
47
47
48
+ // cpuUsageRecord holds the cpu usage stats and the calculated usageNanoCores.
49
+ type cpuUsageRecord struct {
50
+ stats * runtimeapi.CpuUsage
51
+ usageNanoCores * uint64
52
+ }
53
+
48
54
// criStatsProvider implements the containerStatsProvider interface by getting
49
55
// the container stats from CRI.
50
56
type criStatsProvider struct {
@@ -63,8 +69,8 @@ type criStatsProvider struct {
63
69
logMetricsService LogMetricsService
64
70
65
71
// cpuUsageCache caches the cpu usage for containers.
66
- cpuUsageCache map [string ]* runtimeapi. CpuUsage
67
- mutex sync.Mutex
72
+ cpuUsageCache map [string ]* cpuUsageRecord
73
+ mutex sync.RWMutex
68
74
}
69
75
70
76
// newCRIStatsProvider returns a containerStatsProvider implementation that
@@ -82,12 +88,32 @@ func newCRIStatsProvider(
82
88
runtimeService : runtimeService ,
83
89
imageService : imageService ,
84
90
logMetricsService : logMetricsService ,
85
- cpuUsageCache : make (map [string ]* runtimeapi. CpuUsage ),
91
+ cpuUsageCache : make (map [string ]* cpuUsageRecord ),
86
92
}
87
93
}
88
94
89
95
// ListPodStats returns the stats of all the pod-managed containers.
90
96
func (p * criStatsProvider ) ListPodStats () ([]statsapi.PodStats , error ) {
97
+ // Don't update CPU nano core usage.
98
+ return p .listPodStats (false )
99
+ }
100
+
101
+ // ListPodStatsAndUpdateCPUNanoCoreUsage updates the cpu nano core usage for
102
+ // the containers and returns the stats for all the pod-managed containers.
103
+ // This is a workaround because CRI runtimes do not supply nano core usages,
104
+ // so this function calculates the difference between the current and the last
105
+ // (cached) cpu stats to derive this metric. The implementation assumes a
106
+ // single caller to periodically invoke this function to update the metrics. If
107
+ // there exist multiple callers, the period used to compute the cpu usage may
108
+ // vary and the usage could be incoherent (e.g., spiky). If no caller calls
109
+ // this function, the cpu usage will stay nil. Right now, eviction manager is
110
+ // the only caller, and it calls this function every 10s.
111
+ func (p * criStatsProvider ) ListPodStatsAndUpdateCPUNanoCoreUsage () ([]statsapi.PodStats , error ) {
112
+ // Update CPU nano core usage.
113
+ return p .listPodStats (true )
114
+ }
115
+
116
+ func (p * criStatsProvider ) listPodStats (updateCPUNanoCoreUsage bool ) ([]statsapi.PodStats , error ) {
91
117
// Gets node root filesystem information, which will be used to populate
92
118
// the available and capacity bytes/inodes in container stats.
93
119
rootFsInfo , err := p .cadvisor .RootFsInfo ()
@@ -157,7 +183,7 @@ func (p *criStatsProvider) ListPodStats() ([]statsapi.PodStats, error) {
157
183
}
158
184
159
185
// Fill available stats for full set of required pod stats
160
- cs := p .makeContainerStats (stats , container , & rootFsInfo , fsIDtoInfo , podSandbox .GetMetadata ().GetUid ())
186
+ cs := p .makeContainerStats (stats , container , & rootFsInfo , fsIDtoInfo , podSandbox .GetMetadata ().GetUid (), updateCPUNanoCoreUsage )
161
187
p .addPodNetworkStats (ps , podSandboxID , caInfos , cs )
162
188
p .addPodCPUMemoryStats (ps , types .UID (podSandbox .Metadata .Uid ), allInfos , cs )
163
189
@@ -435,6 +461,7 @@ func (p *criStatsProvider) makeContainerStats(
435
461
rootFsInfo * cadvisorapiv2.FsInfo ,
436
462
fsIDtoInfo map [runtimeapi.FilesystemIdentifier ]* cadvisorapiv2.FsInfo ,
437
463
uid string ,
464
+ updateCPUNanoCoreUsage bool ,
438
465
) * statsapi.ContainerStats {
439
466
result := & statsapi.ContainerStats {
440
467
Name : stats .Attributes .Metadata .Name ,
@@ -450,8 +477,12 @@ func (p *criStatsProvider) makeContainerStats(
450
477
if stats .Cpu .UsageCoreNanoSeconds != nil {
451
478
result .CPU .UsageCoreNanoSeconds = & stats .Cpu .UsageCoreNanoSeconds .Value
452
479
}
453
-
454
- usageNanoCores := p .getContainerUsageNanoCores (stats )
480
+ var usageNanoCores * uint64
481
+ if updateCPUNanoCoreUsage {
482
+ usageNanoCores = p .getAndUpdateContainerUsageNanoCores (stats )
483
+ } else {
484
+ usageNanoCores = p .getContainerUsageNanoCores (stats )
485
+ }
455
486
if usageNanoCores != nil {
456
487
result .CPU .UsageNanoCores = usageNanoCores
457
488
}
@@ -541,27 +572,63 @@ func (p *criStatsProvider) makeContainerCPUAndMemoryStats(
541
572
return result
542
573
}
543
574
544
- // getContainerUsageNanoCores gets usageNanoCores based on cached usageCoreNanoSeconds .
575
+ // getContainerUsageNanoCores gets the cached usageNanoCores .
545
576
func (p * criStatsProvider ) getContainerUsageNanoCores (stats * runtimeapi.ContainerStats ) * uint64 {
546
- if stats == nil || stats .Cpu == nil || stats . Cpu . UsageCoreNanoSeconds == nil {
577
+ if stats == nil || stats .Attributes == nil {
547
578
return nil
548
579
}
549
580
550
- p .mutex .Lock ()
551
- defer func () {
552
- // Update cache with new value.
553
- p .cpuUsageCache [stats .Attributes .Id ] = stats .Cpu
554
- p .mutex .Unlock ()
555
- }()
581
+ p .mutex .RLock ()
582
+ defer p .mutex .RUnlock ()
556
583
557
584
cached , ok := p .cpuUsageCache [stats .Attributes .Id ]
558
- if ! ok || cached .UsageCoreNanoSeconds == nil {
585
+ if ! ok || cached .usageNanoCores == nil {
559
586
return nil
560
587
}
588
+ // return a copy of the usage
589
+ latestUsage := * cached .usageNanoCores
590
+ return & latestUsage
591
+ }
561
592
562
- nanoSeconds := stats .Cpu .Timestamp - cached .Timestamp
563
- usageNanoCores := (stats .Cpu .UsageCoreNanoSeconds .Value - cached .UsageCoreNanoSeconds .Value ) * uint64 (time .Second / time .Nanosecond ) / uint64 (nanoSeconds )
564
- return & usageNanoCores
593
+ // getAndUpdateContainerUsageNanoCores computes usageNanoCores based on the given and
594
+ // the cached usageCoreNanoSeconds, updates the cache with the computed
595
+ // usageNanoCores, and returns the usageNanoCores.
596
+ func (p * criStatsProvider ) getAndUpdateContainerUsageNanoCores (stats * runtimeapi.ContainerStats ) * uint64 {
597
+ if stats == nil || stats .Attributes == nil || stats .Cpu == nil || stats .Cpu .UsageCoreNanoSeconds == nil {
598
+ return nil
599
+ }
600
+ id := stats .Attributes .Id
601
+ usage , err := func () (* uint64 , error ) {
602
+ p .mutex .Lock ()
603
+ defer p .mutex .Unlock ()
604
+
605
+ cached , ok := p .cpuUsageCache [id ]
606
+ if ! ok || cached .stats .UsageCoreNanoSeconds == nil {
607
+ // Cannot compute the usage now, but update the cached stats anyway
608
+ p .cpuUsageCache [id ] = & cpuUsageRecord {stats : stats .Cpu , usageNanoCores : nil }
609
+ return nil , nil
610
+ }
611
+
612
+ newStats := stats .Cpu
613
+ cachedStats := cached .stats
614
+ nanoSeconds := newStats .Timestamp - cachedStats .Timestamp
615
+ if nanoSeconds <= 0 {
616
+ return nil , fmt .Errorf ("zero or negative interval (%v - %v)" , newStats .Timestamp , cachedStats .Timestamp )
617
+ }
618
+ usageNanoCores := (newStats .UsageCoreNanoSeconds .Value - cachedStats .UsageCoreNanoSeconds .Value ) * uint64 (time .Second / time .Nanosecond ) / uint64 (nanoSeconds )
619
+
620
+ // Update cache with new value.
621
+ usageToUpdate := usageNanoCores
622
+ p .cpuUsageCache [id ] = & cpuUsageRecord {stats : newStats , usageNanoCores : & usageToUpdate }
623
+
624
+ return & usageNanoCores , nil
625
+ }()
626
+
627
+ if err != nil {
628
+ // This should not happen. Log now to raise visibility
629
+ klog .Errorf ("failed updating cpu usage nano core: %v" , err )
630
+ }
631
+ return usage
565
632
}
566
633
567
634
func (p * criStatsProvider ) cleanupOutdatedCaches () {
@@ -573,7 +640,7 @@ func (p *criStatsProvider) cleanupOutdatedCaches() {
573
640
delete (p .cpuUsageCache , k )
574
641
}
575
642
576
- if time .Since (time .Unix (0 , v .Timestamp )) > defaultCachePeriod {
643
+ if time .Since (time .Unix (0 , v .stats . Timestamp )) > defaultCachePeriod {
577
644
delete (p .cpuUsageCache , k )
578
645
}
579
646
}
0 commit comments