Skip to content

Commit a8df90d

Browse files
inqrphlAhmet Oeztuerksni
authored
CPU idle time in cpu utilization and counter improvements (#308)
Co-authored-by: Ahmet Oeztuerk <[email protected]> Co-authored-by: Sven Nierlein <[email protected]>
1 parent b83a7c2 commit a8df90d

File tree

4 files changed

+271
-58
lines changed

4 files changed

+271
-58
lines changed

docs/checks/commands/check_cpu_utilization.md

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@ Checks the cpu utilization metrics.
2020

2121
### Default Check
2222

23-
check_cpu_utilization
24-
OK - user: 29% - system: 11% - iowait: 3% - steal: 0% - guest: 0% |'user'=28.83%;;;0;...
23+
check_cpu_utilization
24+
OK - user: 2% - system: 1% - iowait: 0% - steal: 0% - guest: 0 - idle: 96% |'total'=3.4%;90;95;0; 'user'=2.11%;;;0;...
2525

2626
### Example using NRPE and Naemon
2727

@@ -41,15 +41,15 @@ Naemon Config
4141

4242
## Argument Defaults
4343

44-
| Argument | Default Value |
45-
| ------------- | --------------------------------------------------------------------------------------------------- |
46-
| warning | total > 90 |
47-
| critical | total > 95 |
48-
| empty-state | 0 (OK) |
49-
| empty-syntax | |
50-
| top-syntax | \${status} - \${list} |
51-
| ok-syntax | |
52-
| detail-syntax | user: \${user}% - system: \${system}% - iowait: \${iowait}% - steal: \${steal}% - guest: \${guest}% |
44+
| Argument | Default Value |
45+
| ------------- | ----------------------------------------------------------------------------------------------------- |
46+
| warning | total > 90 |
47+
| critical | total > 95 |
48+
| empty-state | 0 (OK) |
49+
| empty-syntax | |
50+
| top-syntax | \${status} - \${list} |
51+
| ok-syntax | |
52+
| detail-syntax | user: \${user}% - system: \${system}% - iowait: \${iowait}% - steal: \${steal}% - guest: \${guest} - idle: %{idle}% |
5353

5454
## Check Specific Arguments
5555

@@ -71,3 +71,4 @@ these can be used in filters and thresholds (along with the default attributes):
7171
| iowait | IOWait cpu utilization in percent |
7272
| steal | Steal cpu utilization in percent |
7373
| guest | Guest cpu utilization in percent |
74+
| idle | Idle cpu utilization in percent |

pkg/counter/counter.go

Lines changed: 83 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package counter
22

33
import (
4+
"fmt"
45
"math"
56
"sync"
67
"time"
@@ -9,10 +10,14 @@ import (
910
// Counter is the container for a single timeseries of performance values
1011
// it uses a fixed-size storage backend
1112
type Counter struct {
12-
lock sync.RWMutex // lock for concurrent access
13-
data []Value // array of values
14-
current int64 // position of last inserted value
15-
size int64 // number of values for this series
13+
lock sync.RWMutex // lock for concurrent access
14+
data []Value // array of values. size determined by the retention and interval
15+
current int64 // position of last inserted value
16+
oldest int64 // position of the earliest inserted value
17+
size int64 // number of values for this series
18+
timesSet int64 // number of times a value was set in this counter
19+
retention time.Duration // the time span this counter can hold, interval * size
20+
interval time.Duration // the interval time that new values are designed to be added
1621
}
1722

1823
// Value is a single entry of a Counter
@@ -24,30 +29,43 @@ type Value struct {
2429
// NewCounter creates a new Counter with given retention time and interval
2530
func NewCounter(retentionTime, interval time.Duration) *Counter {
2631
// round retention and interval to milliseconds
27-
retentionMilli := retentionTime.Milliseconds()
28-
intervalMilli := interval.Milliseconds()
32+
retentionMili := retentionTime.Milliseconds()
33+
intervalMili := interval.Milliseconds()
2934

30-
// round retention time to a multiple of interval
31-
retention := int64(math.Ceil(float64(retentionMilli)/float64(intervalMilli))) * intervalMilli
32-
size := retention / intervalMilli
35+
// round retentionMili to a multiple of interval
36+
retentionMiliRounded := int64(math.Ceil(float64(retentionMili)/float64(intervalMili))) * intervalMili
37+
size := retentionMiliRounded / intervalMili
3338

3439
return &Counter{
35-
lock: sync.RWMutex{},
36-
data: make([]Value, size),
37-
size: size,
38-
current: -1,
40+
lock: sync.RWMutex{},
41+
data: make([]Value, size),
42+
size: size,
43+
current: -1,
44+
oldest: -1,
45+
retention: time.Duration(retentionMiliRounded) * time.Millisecond,
46+
interval: interval,
47+
timesSet: 0,
3948
}
4049
}
4150

4251
// Set adds a new value with current timestamp
4352
func (c *Counter) Set(val any) {
4453
c.lock.Lock()
54+
// setting a value for the first time
55+
if c.oldest == -1 {
56+
c.oldest = 0
57+
}
4558
c.current++
4659
if c.current == c.size {
4760
c.current = 0
4861
}
4962
c.data[c.current].UnixMilli = time.Now().UTC().UnixMilli()
5063
c.data[c.current].Value = val
64+
c.timesSet++
65+
// if we already filled the array, and started overwriting, the oldest index just got overwritten
66+
if c.timesSet > c.size {
67+
c.oldest = (c.current + 1) % c.size
68+
}
5169
c.lock.Unlock()
5270
}
5371

@@ -66,7 +84,8 @@ func (c *Counter) AvgForDuration(duration time.Duration) float64 {
6684
if idx == -1 {
6785
return 0
6886
}
69-
for seen := int64(0); seen <= c.size; seen++ {
87+
88+
for range c.size {
7089
if c.data[idx].UnixMilli > useAfter {
7190
if val, ok := c.data[idx].Value.(float64); ok {
7291
sum += val
@@ -145,35 +164,73 @@ func (c *Counter) getLast() *Value {
145164
return &c.data[c.current]
146165
}
147166

148-
// GetAt returns first value closest to given date
149-
func (c *Counter) GetAt(useAfter time.Time) *Value {
167+
// GetFirst returns first (earliest) value
168+
func (c *Counter) GetFirst() *Value {
169+
c.lock.RLock()
170+
defer c.lock.RUnlock()
171+
172+
return c.getFirst()
173+
}
174+
175+
func (c *Counter) getFirst() *Value {
176+
// no value has been set yet, so there is no oldest entry to return
177+
if c.oldest == -1 {
178+
return nil
179+
}
180+
181+
return &c.data[c.oldest]
182+
}
183+
184+
// GetAt returns the earliest stored value whose timestamp is >= lowerBound
185+
func (c *Counter) GetAt(lowerBound time.Time) *Value {
150186
c.lock.RLock()
151187
defer c.lock.RUnlock()
152188

153-
return c.getAt(useAfter)
189+
return c.getAt(lowerBound)
154190
}
155191

156-
func (c *Counter) getAt(useAfter time.Time) *Value {
157-
useAfterUnix := useAfter.UTC().UnixMilli()
192+
// getAt returns the earliest stored value whose timestamp is >= lowerBound, or nil if there is none
193+
func (c *Counter) getAt(lowerBound time.Time) *Value {
194+
useAfterUnix := lowerBound.UTC().UnixMilli()
195+
196+
// the counter is not yet populated
158197
idx := c.current
159198
if idx == -1 {
160199
return nil
161200
}
162201

163-
var last *Value
164-
for seen := int64(0); seen <= c.size; seen++ {
165-
val := &c.data[idx]
166-
if val.UnixMilli < useAfterUnix {
167-
return last
202+
var previouslyComparedValue *Value
203+
for range c.size {
204+
currentValue := &c.data[idx]
205+
if currentValue.UnixMilli < useAfterUnix {
206+
return previouslyComparedValue
168207
}
169-
last = val
208+
209+
previouslyComparedValue = currentValue
170210
idx--
171211
if idx < 0 {
172212
idx = c.size - 1
173213
}
174214
}
175215

176-
return last
216+
return previouslyComparedValue
217+
}
218+
219+
// checks if the counter can fit the targetRetention. optionally extend the interval by count in the check
220+
func (c *Counter) CheckRetention(targetRetention time.Duration, intervalExtensionCount int64) error {
221+
extendedRetentionRange := c.retention + time.Duration(intervalExtensionCount)*c.interval
222+
223+
if extendedRetentionRange < targetRetention {
224+
if intervalExtensionCount == 0 {
225+
return fmt.Errorf("counter retention range is %f seconds, less than the target retention range of %f seconds",
226+
extendedRetentionRange.Seconds(), targetRetention.Seconds())
227+
}
228+
229+
return fmt.Errorf("counter retention range is %f seconds, even when extended by %d intervals to be %f seconds, it is less than target retention range of %f seconds",
230+
c.interval.Seconds(), intervalExtensionCount, extendedRetentionRange.Seconds(), targetRetention.Seconds())
231+
}
232+
233+
return nil
177234
}
178235

179236
// Float64 returns value as float64

pkg/counter/counter_test.go

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"time"
66

77
"github.com/stretchr/testify/assert"
8+
"github.com/stretchr/testify/require"
89
)
910

1011
func TestCounter(t *testing.T) {
@@ -55,3 +56,96 @@ func TestCounter(t *testing.T) {
5556
set.Delete("test", "key")
5657
assert.Emptyf(t, set.counter, "set is empty now")
5758
}
59+
60+
func TestCounter2(t *testing.T) {
61+
set := NewCounterSet()
62+
63+
retention := time.Millisecond * 4500
64+
interval := time.Second
65+
set.Create("test", "key", retention, interval)
66+
67+
// empty counter
68+
counter := set.Get("test", "key")
69+
latest := counter.getLast()
70+
oldest := counter.getFirst()
71+
assert.Nil(t, latest, "calling latest on empty counter should return nil")
72+
assert.Nil(t, oldest, "calling oldest on empty counter should return nil")
73+
74+
// check the retention for 4 seconds
75+
retentionCheck1 := counter.CheckRetention(time.Second*4, 0)
76+
require.NoError(t, retentionCheck1, "the counter should be able to hold 4 seconds")
77+
78+
// check the retention for 5 seconds
79+
retentionCheck2 := counter.CheckRetention(time.Second*5, 0)
80+
require.NoError(t, retentionCheck2, "the counter should be able to hold 5 seconds")
81+
82+
// check the retention for 6 seconds
83+
retentionCheck3 := counter.CheckRetention(time.Second*6, 0)
84+
require.Error(t, retentionCheck3, "the counter should not be able to hold 6 seconds")
85+
86+
// check the retention for 10 seconds
87+
retentionCheck4 := counter.CheckRetention(time.Second*10, 0)
88+
require.Error(t, retentionCheck4, "the counter should not be able to hold 10 seconds")
89+
90+
// check the retention for 1 minute with 10 extensions
91+
retentionCheck5 := counter.CheckRetention(time.Minute, 10)
92+
require.Error(t, retentionCheck5, "the counter should not be able to hold 1 minute with 10 interval extensions")
93+
94+
// check the retention for 1 minute with 100 extensions
95+
retentionCheck6 := counter.CheckRetention(time.Minute, 100)
96+
require.NoError(t, retentionCheck6, "the counter should be able to hold 1 minute with 10 interval extensions")
97+
98+
// 1 _ _ _ _
99+
counter.Set(float64(1))
100+
latest = counter.getLast()
101+
oldest = counter.getFirst()
102+
assert.InEpsilon(t, float64(1), latest.Value, 0.001, "latest element should be 1")
103+
assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1")
104+
105+
// 1 2 _ _ _
106+
counter.Set(float64(2))
107+
latest = counter.getLast()
108+
oldest = counter.getFirst()
109+
assert.InEpsilon(t, float64(2), latest.Value, 0.001, "latest element should be 2")
110+
assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1")
111+
112+
// 1 2 3 _ _
113+
counter.Set(float64(3))
114+
latest = counter.getLast()
115+
oldest = counter.getFirst()
116+
assert.InEpsilon(t, float64(3), latest.Value, 0.001, "latest element should be 3")
117+
assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1")
118+
119+
// 1 2 3 4 _
120+
counter.Set(float64(4))
121+
latest = counter.getLast()
122+
oldest = counter.getFirst()
123+
assert.InEpsilon(t, float64(4), latest.Value, 0.001, "latest element should be 4")
124+
assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1")
125+
126+
// 1 2 3 4 5
127+
counter.Set(float64(5))
128+
latest = counter.getLast()
129+
oldest = counter.getFirst()
130+
assert.InEpsilon(t, float64(5), latest.Value, 0.001, "latest element should be 5")
131+
assert.InEpsilon(t, float64(1), oldest.Value, 0.001, "oldest element should be 1")
132+
133+
// check the average now
134+
avg := counter.AvgForDuration(time.Minute)
135+
assert.InEpsilon(t, 3, avg, 0.001, "average of 1,2,3,4,5 is 3")
136+
137+
// started overwriting from the first index, the c.oldest should update
138+
// 6 2 3 4 5
139+
counter.Set(float64(6))
140+
latest = counter.getLast()
141+
oldest = counter.getFirst()
142+
assert.InEpsilon(t, float64(6), latest.Value, 0.001, "latest element should be 6")
143+
assert.InEpsilon(t, float64(2), oldest.Value, 0.001, "oldest element should be 2")
144+
145+
// 6 7 3 4 5
146+
counter.Set(float64(7))
147+
latest = counter.getLast()
148+
oldest = counter.getFirst()
149+
assert.InEpsilon(t, float64(7), latest.Value, 0.001, "latest element should be 7")
150+
assert.InEpsilon(t, float64(3), oldest.Value, 0.001, "oldest element should be 3")
151+
}

0 commit comments

Comments
 (0)