Skip to content

Commit 544e4a7

Browse files
turboboost55 and fjl authored
metrics: improve accuracy of CPU gauges (#26793)
This PR changes metrics collection to measure the actual time interval between collections, rather than assuming it is always 3 seconds. I did some ad hoc profiling, and on slower hardware (e.g., my Raspberry Pi 4) I routinely saw intervals of 3.3–3.5 seconds, with some as high as 4.5 seconds. Because the CPU time deltas were divided by the assumed interval instead of the real, longer one, the CPU gauge readings were generally too high, and in some cases the CPU load metrics took impossibly large values (e.g., greater than 400 for a 4-core CPU). --------- Co-authored-by: Felix Lange <[email protected]>
1 parent 5bc2ef9 commit 544e4a7

File tree

5 files changed

+24
-14
lines changed

5 files changed

+24
-14
lines changed

metrics/cpu.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
package metrics
1818

1919
// CPUStats is the system and process CPU stats.
20+
// All values are in seconds.
2021
type CPUStats struct {
21-
GlobalTime int64 // Time spent by the CPU working on all processes
22-
GlobalWait int64 // Time spent by waiting on disk for all processes
23-
LocalTime int64 // Time spent by the CPU working on this process
22+
GlobalTime float64 // Time spent by the CPU working on all processes
23+
GlobalWait float64 // Time spent by waiting on disk for all processes
24+
LocalTime float64 // Time spent by the CPU working on this process
2425
}

metrics/cpu_enabled.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func ReadCPUStats(stats *CPUStats) {
3838
}
3939
// requesting all cpu times will always return an array with only one time stats entry
4040
timeStat := timeStats[0]
41-
stats.GlobalTime = int64((timeStat.User + timeStat.Nice + timeStat.System) * cpu.ClocksPerSec)
42-
stats.GlobalWait = int64((timeStat.Iowait) * cpu.ClocksPerSec)
41+
stats.GlobalTime = timeStat.User + timeStat.Nice + timeStat.System
42+
stats.GlobalWait = timeStat.Iowait
4343
stats.LocalTime = getProcessCPUTime()
4444
}

metrics/cputime_nop.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,6 @@ package metrics
2121

2222
// getProcessCPUTime returns 0 on Windows as there is no system call to resolve
2323
// the actual process' CPU time.
24-
func getProcessCPUTime() int64 {
24+
func getProcessCPUTime() float64 {
2525
return 0
2626
}

metrics/cputime_unix.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,11 @@ import (
2626
)
2727

2828
// getProcessCPUTime retrieves the process' CPU time since program startup.
29-
func getProcessCPUTime() int64 {
29+
func getProcessCPUTime() float64 {
3030
var usage syscall.Rusage
3131
if err := syscall.Getrusage(syscall.RUSAGE_SELF, &usage); err != nil {
3232
log.Warn("Failed to retrieve CPU time", "err", err)
3333
return 0
3434
}
35-
return int64(usage.Utime.Sec+usage.Stime.Sec)*100 + int64(usage.Utime.Usec+usage.Stime.Usec)/10000 //nolint:unconvert
35+
return float64(usage.Utime.Sec+usage.Stime.Sec) + float64(usage.Utime.Usec+usage.Stime.Usec)/1000000 //nolint:unconvert
3636
}

metrics/metrics.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,6 @@ func CollectProcessMetrics(refresh time.Duration) {
127127
return
128128
}
129129

130-
refreshFreq := int64(refresh / time.Second)
131-
132130
// Create the various data collectors
133131
var (
134132
cpustats = make([]CPUStats, 2)
@@ -163,14 +161,25 @@ func CollectProcessMetrics(refresh time.Duration) {
163161
diskWriteBytesCounter = GetOrRegisterCounter("system/disk/writebytes", DefaultRegistry)
164162
)
165163

164+
var lastCollectTime time.Time
165+
166166
// Iterate loading the different stats and updating the meters.
167167
now, prev := 0, 1
168168
for ; ; now, prev = prev, now {
169-
// CPU
169+
// Gather CPU times.
170170
ReadCPUStats(&cpustats[now])
171-
cpuSysLoad.Update((cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / refreshFreq)
172-
cpuSysWait.Update((cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / refreshFreq)
173-
cpuProcLoad.Update((cpustats[now].LocalTime - cpustats[prev].LocalTime) / refreshFreq)
171+
collectTime := time.Now()
172+
secondsSinceLastCollect := collectTime.Sub(lastCollectTime).Seconds()
173+
lastCollectTime = collectTime
174+
if secondsSinceLastCollect > 0 {
175+
sysLoad := (cpustats[now].GlobalTime - cpustats[prev].GlobalTime) / secondsSinceLastCollect
176+
sysWait := (cpustats[now].GlobalWait - cpustats[prev].GlobalWait) / secondsSinceLastCollect
177+
procLoad := (cpustats[now].LocalTime - cpustats[prev].LocalTime) / secondsSinceLastCollect
178+
// Convert to integer percentage.
179+
cpuSysLoad.Update(int64(sysLoad * 100))
180+
cpuSysWait.Update(int64(sysWait * 100))
181+
cpuProcLoad.Update(int64(procLoad * 100))
182+
}
174183

175184
// Threads
176185
cpuThreads.Update(int64(threadCreateProfile.Count()))

0 commit comments

Comments
 (0)