Skip to content

Commit a0aa9b5

Browse files
committed
feat: add health check probes for k8s liveness and readiness
Implement health probe endpoints following Kubernetes best practices:
- Add LiveChecker and ReadyChecker interfaces to the service framework
- Implement IsLive() and IsReady() methods in PowerMonitor using atomic operations
- Create a HealthProbe service with /probe/livez and /probe/readyz endpoints
- Update the K8s daemonset to use the new health endpoints instead of /metrics
- Add unit tests for the health checks

Signed-off-by: Vimal Kumar <[email protected]>
1 parent cfbc40c commit a0aa9b5

File tree

7 files changed

+641
-1
lines changed

7 files changed

+641
-1
lines changed

cmd/kepler/main.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,10 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
206206
services = append(services, stdoutExporter)
207207
}
208208

209+
// Add health probe endpoints
210+
healthProbe := server.NewHealthProbe(apiServer, services, logger)
211+
services = append(services, healthProbe)
212+
209213
return services, nil
210214
}
211215

internal/monitor/monitor.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,10 @@ type PowerMonitor struct {
7878
// For managing the collection loop
7979
collectionCtx context.Context
8080
collectionCancel context.CancelFunc
81+
82+
// Health tracking
83+
initialized atomic.Bool
84+
fatalError atomic.Bool
8185
}
8286

8387
var _ Service = (*PowerMonitor)(nil)
@@ -146,6 +150,9 @@ func (pm *PowerMonitor) Init() error {
146150
// signal now so that exporters can construct descriptors
147151
pm.signalNewData()
148152

153+
// Mark as initialized for health checks
154+
pm.initialized.Store(true)
155+
149156
return nil
150157
}
151158

@@ -429,3 +436,25 @@ func (pm *PowerMonitor) calculatePower(prev, newSnapshot *Snapshot) error {
429436

430437
return nil
431438
}
439+
440+
// IsLive returns true if the monitor is initialized and has not encountered fatal errors
441+
func (pm *PowerMonitor) IsLive() bool {
442+
return pm.initialized.Load() && !pm.fatalError.Load()
443+
}
444+
445+
// IsReady returns true if the monitor is live and has collected at least one valid snapshot
446+
// that is not stale
447+
func (pm *PowerMonitor) IsReady() bool {
448+
if !pm.IsLive() {
449+
return false
450+
}
451+
452+
snapshot := pm.snapshot.Load()
453+
if snapshot == nil || snapshot.Timestamp.IsZero() {
454+
return false
455+
}
456+
457+
// Check if data is fresh (not stale)
458+
age := pm.clock.Now().Sub(snapshot.Timestamp)
459+
return age <= pm.maxStaleness
460+
}
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
// SPDX-FileCopyrightText: 2025 The Kepler Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package monitor
5+
6+
import (
7+
"testing"
8+
"time"
9+
10+
"github.com/stretchr/testify/assert"
11+
"github.com/sustainable-computing-io/kepler/internal/device"
12+
testingclock "k8s.io/utils/clock/testing"
13+
)
14+
15+
func TestPowerMonitor_IsLive_NotInitialized(t *testing.T) {
16+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
17+
assert.NoError(t, err)
18+
pm := NewPowerMonitor(meter)
19+
20+
// Before initialization, should not be live
21+
assert.False(t, pm.IsLive())
22+
}
23+
24+
func TestPowerMonitor_IsLive_Initialized(t *testing.T) {
25+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
26+
assert.NoError(t, err)
27+
pm := NewPowerMonitor(meter)
28+
29+
err = pm.Init()
30+
assert.NoError(t, err)
31+
32+
// After initialization, should be live
33+
assert.True(t, pm.IsLive())
34+
}
35+
36+
func TestPowerMonitor_IsLive_FatalError(t *testing.T) {
37+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
38+
assert.NoError(t, err)
39+
pm := NewPowerMonitor(meter)
40+
41+
err = pm.Init()
42+
assert.NoError(t, err)
43+
44+
// Simulate fatal error
45+
pm.fatalError.Store(true)
46+
47+
// Should not be live when fatal error occurred
48+
assert.False(t, pm.IsLive())
49+
}
50+
51+
func TestPowerMonitor_IsReady_NotInitialized(t *testing.T) {
52+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
53+
assert.NoError(t, err)
54+
pm := NewPowerMonitor(meter)
55+
56+
// Before initialization, should not be ready
57+
assert.False(t, pm.IsReady())
58+
}
59+
60+
func TestPowerMonitor_IsReady_InitializedNoSnapshot(t *testing.T) {
61+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
62+
assert.NoError(t, err)
63+
pm := NewPowerMonitor(meter)
64+
65+
err = pm.Init()
66+
assert.NoError(t, err)
67+
68+
// After initialization but before first snapshot, should not be ready
69+
assert.False(t, pm.IsReady())
70+
}
71+
72+
func TestPowerMonitor_IsReady_WithFreshSnapshot(t *testing.T) {
73+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
74+
assert.NoError(t, err)
75+
fakeClock := testingclock.NewFakeClock(time.Now())
76+
77+
pm := NewPowerMonitor(meter,
78+
WithClock(fakeClock),
79+
WithMaxStaleness(10*time.Second),
80+
)
81+
82+
initErr := pm.Init()
83+
assert.NoError(t, initErr)
84+
85+
// Create a snapshot
86+
snapshot := NewSnapshot()
87+
snapshot.Timestamp = fakeClock.Now()
88+
pm.snapshot.Store(snapshot)
89+
90+
// Should be ready with fresh snapshot
91+
assert.True(t, pm.IsReady())
92+
}
93+
94+
func TestPowerMonitor_IsReady_WithStaleSnapshot(t *testing.T) {
95+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
96+
assert.NoError(t, err)
97+
fakeClock := testingclock.NewFakeClock(time.Now())
98+
99+
pm := NewPowerMonitor(meter,
100+
WithClock(fakeClock),
101+
WithMaxStaleness(10*time.Second),
102+
)
103+
104+
err = pm.Init()
105+
assert.NoError(t, err)
106+
107+
// Create a snapshot in the past
108+
snapshot := NewSnapshot()
109+
snapshot.Timestamp = fakeClock.Now()
110+
pm.snapshot.Store(snapshot)
111+
112+
// Advance clock beyond staleness threshold
113+
fakeClock.Step(15 * time.Second)
114+
115+
// Should not be ready with stale snapshot
116+
assert.False(t, pm.IsReady())
117+
}
118+
119+
func TestPowerMonitor_IsReady_NotLive(t *testing.T) {
120+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
121+
assert.NoError(t, err)
122+
fakeClock := testingclock.NewFakeClock(time.Now())
123+
124+
pm := NewPowerMonitor(meter,
125+
WithClock(fakeClock),
126+
WithMaxStaleness(10*time.Second),
127+
)
128+
129+
err = pm.Init()
130+
assert.NoError(t, err)
131+
132+
// Create a fresh snapshot
133+
snapshot := NewSnapshot()
134+
snapshot.Timestamp = fakeClock.Now()
135+
pm.snapshot.Store(snapshot)
136+
137+
// Simulate fatal error (not live)
138+
pm.fatalError.Store(true)
139+
140+
// Should not be ready if not live, even with fresh snapshot
141+
assert.False(t, pm.IsReady())
142+
}
143+
144+
func TestPowerMonitor_IsReady_ZeroTimestamp(t *testing.T) {
145+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
146+
assert.NoError(t, err)
147+
fakeClock := testingclock.NewFakeClock(time.Now())
148+
149+
pm := NewPowerMonitor(meter,
150+
WithClock(fakeClock),
151+
WithMaxStaleness(10*time.Second),
152+
)
153+
154+
err = pm.Init()
155+
assert.NoError(t, err)
156+
157+
// Create a snapshot with zero timestamp
158+
snapshot := NewSnapshot()
159+
snapshot.Timestamp = time.Time{} // Zero value
160+
pm.snapshot.Store(snapshot)
161+
162+
// Should not be ready with zero timestamp
163+
assert.False(t, pm.IsReady())
164+
}
165+
166+
func TestPowerMonitor_HealthCheck_Integration(t *testing.T) {
167+
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
168+
assert.NoError(t, err)
169+
fakeClock := testingclock.NewFakeClock(time.Now())
170+
171+
pm := NewPowerMonitor(meter,
172+
WithClock(fakeClock),
173+
WithMaxStaleness(10*time.Second),
174+
WithInterval(0), // No automatic collection
175+
)
176+
177+
// Initially not live or ready
178+
assert.False(t, pm.IsLive())
179+
assert.False(t, pm.IsReady())
180+
181+
// After init, live but not ready
182+
initErr := pm.Init()
183+
assert.NoError(t, initErr)
184+
assert.True(t, pm.IsLive())
185+
assert.False(t, pm.IsReady())
186+
187+
// After first snapshot, both live and ready
188+
snapshot := NewSnapshot()
189+
snapshot.Timestamp = fakeClock.Now()
190+
pm.snapshot.Store(snapshot)
191+
assert.True(t, pm.IsLive())
192+
assert.True(t, pm.IsReady())
193+
194+
// After data becomes stale, live but not ready
195+
fakeClock.Step(15 * time.Second)
196+
assert.True(t, pm.IsLive())
197+
assert.False(t, pm.IsReady())
198+
199+
// After fatal error, neither live nor ready
200+
pm.fatalError.Store(true)
201+
assert.False(t, pm.IsLive())
202+
assert.False(t, pm.IsReady())
203+
}

0 commit comments

Comments
 (0)