Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/kepler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,10 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
services = append(services, stdoutExporter)
}

// Add health probe endpoints
healthProbe := server.NewHealthProbe(apiServer, services, logger)
services = append(services, healthProbe)

return services, nil
}

Expand Down
29 changes: 29 additions & 0 deletions internal/monitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ type PowerMonitor struct {
// For managing the collection loop
collectionCtx context.Context
collectionCancel context.CancelFunc

// Health tracking
initialized atomic.Bool
fatalError atomic.Bool
}

// Compile-time assertion that *PowerMonitor implements the Service interface.
var _ Service = (*PowerMonitor)(nil)
Expand Down Expand Up @@ -146,6 +150,9 @@ func (pm *PowerMonitor) Init() error {
// signal now so that exporters can construct descriptors
pm.signalNewData()

// Mark as initialized for health checks
pm.initialized.Store(true)

return nil
}

Expand Down Expand Up @@ -429,3 +436,25 @@ func (pm *PowerMonitor) calculatePower(prev, newSnapshot *Snapshot) error {

return nil
}

// IsLive reports whether the monitor has completed initialization and has
// not recorded a fatal error.
func (pm *PowerMonitor) IsLive() bool {
	// An uninitialized monitor is never live; the fatal-error flag is only
	// consulted once initialization has been observed.
	if !pm.initialized.Load() {
		return false
	}
	return !pm.fatalError.Load()
}

// IsReady reports whether the monitor is live and currently holds a snapshot
// with a non-zero timestamp whose age does not exceed pm.maxStaleness.
func (pm *PowerMonitor) IsReady() bool {
	if !pm.IsLive() {
		return false
	}

	current := pm.snapshot.Load()
	switch {
	case current == nil:
		// Nothing has been stored yet.
		return false
	case current.Timestamp.IsZero():
		// A snapshot without a timestamp is not considered valid data.
		return false
	}

	// Ready only while the snapshot's age is within the staleness limit.
	elapsed := pm.clock.Now().Sub(current.Timestamp)
	return pm.maxStaleness >= elapsed
}
203 changes: 203 additions & 0 deletions internal/monitor/monitor_health_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
// SPDX-FileCopyrightText: 2025 The Kepler Authors
// SPDX-License-Identifier: Apache-2.0

package monitor

import (
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/sustainable-computing-io/kepler/internal/device"
testingclock "k8s.io/utils/clock/testing"
)

func TestPowerMonitor_IsLive_NotInitialized(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
pm := NewPowerMonitor(meter)

// Before initialization, should not be live
assert.False(t, pm.IsLive())
}

func TestPowerMonitor_IsLive_Initialized(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
pm := NewPowerMonitor(meter)

err = pm.Init()
assert.NoError(t, err)

// After initialization, should be live
assert.True(t, pm.IsLive())
}

func TestPowerMonitor_IsLive_FatalError(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
pm := NewPowerMonitor(meter)

err = pm.Init()
assert.NoError(t, err)

// Simulate fatal error
pm.fatalError.Store(true)

// Should not be live when fatal error occurred
assert.False(t, pm.IsLive())
}

func TestPowerMonitor_IsReady_NotInitialized(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
pm := NewPowerMonitor(meter)

// Before initialization, should not be ready
assert.False(t, pm.IsReady())
}

func TestPowerMonitor_IsReady_InitializedNoSnapshot(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
pm := NewPowerMonitor(meter)

err = pm.Init()
assert.NoError(t, err)

// After initialization but before first snapshot, should not be ready
assert.False(t, pm.IsReady())
}

func TestPowerMonitor_IsReady_WithFreshSnapshot(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
fakeClock := testingclock.NewFakeClock(time.Now())

pm := NewPowerMonitor(meter,
WithClock(fakeClock),
WithMaxStaleness(10*time.Second),
)

initErr := pm.Init()
assert.NoError(t, initErr)

// Create a snapshot
snapshot := NewSnapshot()
snapshot.Timestamp = fakeClock.Now()
pm.snapshot.Store(snapshot)

// Should be ready with fresh snapshot
assert.True(t, pm.IsReady())
}

func TestPowerMonitor_IsReady_WithStaleSnapshot(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
fakeClock := testingclock.NewFakeClock(time.Now())

pm := NewPowerMonitor(meter,
WithClock(fakeClock),
WithMaxStaleness(10*time.Second),
)

err = pm.Init()
assert.NoError(t, err)

// Create a snapshot in the past
snapshot := NewSnapshot()
snapshot.Timestamp = fakeClock.Now()
pm.snapshot.Store(snapshot)

// Advance clock beyond staleness threshold
fakeClock.Step(15 * time.Second)

// Should not be ready with stale snapshot
assert.False(t, pm.IsReady())
}

func TestPowerMonitor_IsReady_NotLive(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
fakeClock := testingclock.NewFakeClock(time.Now())

pm := NewPowerMonitor(meter,
WithClock(fakeClock),
WithMaxStaleness(10*time.Second),
)

err = pm.Init()
assert.NoError(t, err)

// Create a fresh snapshot
snapshot := NewSnapshot()
snapshot.Timestamp = fakeClock.Now()
pm.snapshot.Store(snapshot)

// Simulate fatal error (not live)
pm.fatalError.Store(true)

// Should not be ready if not live, even with fresh snapshot
assert.False(t, pm.IsReady())
}

func TestPowerMonitor_IsReady_ZeroTimestamp(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
fakeClock := testingclock.NewFakeClock(time.Now())

pm := NewPowerMonitor(meter,
WithClock(fakeClock),
WithMaxStaleness(10*time.Second),
)

err = pm.Init()
assert.NoError(t, err)

// Create a snapshot with zero timestamp
snapshot := NewSnapshot()
snapshot.Timestamp = time.Time{} // Zero value
pm.snapshot.Store(snapshot)

// Should not be ready with zero timestamp
assert.False(t, pm.IsReady())
}

func TestPowerMonitor_HealthCheck_Integration(t *testing.T) {
meter, err := device.NewFakeCPUMeter([]string{"pkg"})
assert.NoError(t, err)
fakeClock := testingclock.NewFakeClock(time.Now())

pm := NewPowerMonitor(meter,
WithClock(fakeClock),
WithMaxStaleness(10*time.Second),
WithInterval(0), // No automatic collection
)

// Initially not live or ready
assert.False(t, pm.IsLive())
assert.False(t, pm.IsReady())

// After init, live but not ready
initErr := pm.Init()
assert.NoError(t, initErr)
assert.True(t, pm.IsLive())
assert.False(t, pm.IsReady())

// After first snapshot, both live and ready
snapshot := NewSnapshot()
snapshot.Timestamp = fakeClock.Now()
pm.snapshot.Store(snapshot)
assert.True(t, pm.IsLive())
assert.True(t, pm.IsReady())

// After data becomes stale, live but not ready
fakeClock.Step(15 * time.Second)
assert.True(t, pm.IsLive())
assert.False(t, pm.IsReady())

// After fatal error, neither live nor ready
pm.fatalError.Store(true)
assert.False(t, pm.IsLive())
assert.False(t, pm.IsReady())
}
Loading
Loading