feat: add Kubernetes health probe endpoints #2327

@@ -0,0 +1,232 @@

# Kubernetes Health Probes for Kepler

This document describes the health check endpoints that Kepler implements for Kubernetes probes.

## Overview

Health probes allow Kubernetes to determine the health status of your application. Kepler implements two types of probes:

- **Liveness Probe** (`/probe/livez`): Determines if the application is alive and responding
- **Readiness Probe** (`/probe/readyz`): Determines if the application is ready to receive traffic

## Endpoints

### `/probe/livez` - Liveness Probe

**Description**: Checks if Kepler's monitor service is alive and responding.

**Success Criteria**:

- PowerMonitor service is not nil
- Collection context is not cancelled

**Response**:

- `200 OK`: Service is alive
- `503 Service Unavailable`: Service is not alive

**Example Response**:

```json
{
  "status": "ok",
  "timestamp": "2025-01-17T10:30:00Z",
  "duration": "1.2µs"
}
```

### `/probe/readyz` - Readiness Probe

**Description**: Checks if Kepler's monitor service is ready to serve data.

**Success Criteria**:

- Service is alive (checks liveness first)
- At least one snapshot is available
- Snapshot is not too old (within staleness limit)
- CPU meter is functional
- Energy zones are initialized

**Response**:

- `200 OK`: Service is ready
- `503 Service Unavailable`: Service is not ready

**Example Response**:

```json
{
  "status": "ok",
  "timestamp": "2025-01-17T10:30:00Z",
  "duration": "1.8µs"
}
```

## Kubernetes Configuration

### DaemonSet with Health Probes

```yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: kepler
  namespace: kepler
spec:
  selector:
    matchLabels:
      app: kepler
  template:
    metadata:
      labels:
        app: kepler
    spec:
      containers:
      - name: kepler
        image: quay.io/sustainable_computing_io/kepler:latest
        ports:
        - containerPort: 28282
          name: http-metrics
        livenessProbe:
          httpGet:
            path: /probe/livez
            port: 28282
          initialDelaySeconds: 10
          periodSeconds: 30
          timeoutSeconds: 5
          failureThreshold: 3
        readinessProbe:
          httpGet:
            path: /probe/readyz
            port: 28282
          initialDelaySeconds: 5
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
```
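
A quick way to confirm the probes are actually wired up after deploying a manifest like the one above (the file path matches the example listed later in this document; namespace and label values match the manifest):

```bash
# Deploy the example manifest
kubectl apply -f examples/kubernetes-health-probes.yaml

# Confirm the kubelet sees the probes on the running pods
kubectl -n kepler describe pod -l app=kepler | grep -E 'Liveness|Readiness'

# Watch READY flip to 1/1 once the readiness probe starts passing
kubectl -n kepler get pods -l app=kepler -w
```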

## Testing

### Unit Tests

Unit tests are available in `internal/server/health_test.go`:

```bash
go test ./internal/server/ -v
```
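
To iterate on just the probe-related tests, the standard `-run` filter also works; the name pattern below is illustrative and depends on the actual test function names in `health_test.go`:

```bash
# Run only tests whose names match the (illustrative) pattern
go test ./internal/server/ -run 'Health|Probe' -v
```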

### Integration Tests

A test script is provided to test the endpoints live:

```bash
# Start Kepler
go run ./cmd/kepler/

# In another terminal, test the endpoints
./examples/test-health-endpoints.sh
```

## Architecture

### Interfaces

The following interfaces were added in `internal/service/service.go`:

```go
// LiveChecker checks if a service is alive
type LiveChecker interface {
    IsLive(ctx context.Context) (bool, error)
}

// ReadyChecker checks if a service is ready
type ReadyChecker interface {
    IsReady(ctx context.Context) (bool, error)
}
```
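
The HTTP wiring that serves these checks lives in `internal/server/health.go` and is not shown here. As a rough sketch of how a `LiveChecker` could back the `/probe/livez` response shape documented above, consider the following hypothetical handler (assumes `encoding/json`, `net/http`, and `time` imports; `livezHandler` and the `"error"` status value are illustrative, not the actual implementation):

```go
// livezHandler is a hypothetical adapter from a LiveChecker to an HTTP handler.
// It returns the {status, timestamp, duration} JSON shape shown in the examples above.
func livezHandler(checker LiveChecker) http.HandlerFunc {
    return func(w http.ResponseWriter, r *http.Request) {
        start := time.Now()
        alive, err := checker.IsLive(r.Context())

        resp := map[string]string{
            "status":    "ok",
            "timestamp": start.UTC().Format(time.RFC3339),
            "duration":  time.Since(start).String(),
        }
        code := http.StatusOK
        if err != nil || !alive {
            resp["status"] = "error" // failure payload shape is an assumption
            code = http.StatusServiceUnavailable
        }

        w.Header().Set("Content-Type", "application/json")
        w.WriteHeader(code)
        _ = json.NewEncoder(w).Encode(resp)
    }
}
```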

### Implementation

1. **PowerMonitor** (`internal/monitor/monitor.go`): Implements `LiveChecker` and `ReadyChecker` interfaces
2. **HealthProbeService** (`internal/server/health.go`): Service that exposes HTTP endpoints
3. **Integration** (`cmd/kepler/main.go`): Service registration in the main application

### Verification Flow

#### Liveness Check

1. Verify the monitor is not nil
2. Verify the collection context is not cancelled

#### Readiness Check

1. Execute the liveness check
2. Verify a snapshot is available
3. Verify the snapshot is not stale
4. Verify the CPU meter is available
5. Verify energy zones are initialized

## Performance

Health checks are designed to be very lightweight:

- **Liveness**: Typically 1-5 microseconds
- **Readiness**: Typically 1-10 microseconds

No forced data collection is performed during health checks to avoid performance impact.
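
These figures describe the internal check itself; from outside the process you mostly measure HTTP round-trip time. A quick client-side sanity check:

```bash
# Total request time as seen by the client (includes HTTP overhead)
curl -s -o /dev/null -w 'livez:  %{time_total}s\n' http://localhost:28282/probe/livez
curl -s -o /dev/null -w 'readyz: %{time_total}s\n' http://localhost:28282/probe/readyz
```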

## Debugging

### Logs

Health checks generate DEBUG level logs for successes and ERROR level logs for failures:

```bash
# View health check logs
journalctl -u kepler -f | grep "health-probe"
```

### Manual Testing

```bash
# Test liveness
curl -v http://localhost:28282/probe/livez

# Test readiness
curl -v http://localhost:28282/probe/readyz

# With jq to format the response
curl -s http://localhost:28282/probe/livez | jq .
```

## Troubleshooting

### Liveness probe fails

- Verify Kepler has started
- Check logs for startup errors
- Verify port 28282 is accessible

### Readiness probe fails

- Verify the liveness probe works
- Verify `/proc` and `/sys` files are accessible
- Check the RAPL zones configuration (see the commands below)
- Verify the collection interval is not too long
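
For the RAPL-related items, the commands below can help confirm that energy zones are visible on the node (the sysfs path assumes the standard `intel-rapl` powercap driver) and show what the readiness endpoint itself reports:

```bash
# List the powercap/RAPL zones exposed by the kernel
ls /sys/class/powercap/
cat /sys/class/powercap/intel-rapl:0/name 2>/dev/null

# Check whether readiness returns 200 or 503, including headers
curl -i http://localhost:28282/probe/readyz
```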

## Migration

This change is backward compatible with existing Kepler deployments: the new endpoints are optional and do not affect existing functionality.

To enable health probes in an existing installation, update the Kubernetes configuration to include the new probes.
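
For example, an existing DaemonSet can be patched in place instead of re-applying a full manifest. The strategic-merge patch below assumes the container is named `kepler` and reuses the probe settings from the example above:

```bash
kubectl -n kepler patch daemonset kepler --type=strategic -p '{
  "spec": {"template": {"spec": {"containers": [{
    "name": "kepler",
    "livenessProbe": {"httpGet": {"path": "/probe/livez", "port": 28282},
                      "initialDelaySeconds": 10, "periodSeconds": 30},
    "readinessProbe": {"httpGet": {"path": "/probe/readyz", "port": 28282},
                       "initialDelaySeconds": 5, "periodSeconds": 10}
  }]}}}
}'
```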

## Files Modified/Added

### New Files

- `internal/server/health.go` - Health probe service implementation
- `internal/server/health_test.go` - Unit tests for health probes
- `examples/kubernetes-health-probes.yaml` - Kubernetes DaemonSet example
- `examples/test-health-endpoints.sh` - Integration test script

### Modified Files

- `internal/service/service.go` - Added LiveChecker and ReadyChecker interfaces
- `internal/monitor/monitor.go` - Added IsLive() and IsReady() methods to PowerMonitor
- `cmd/kepler/main.go` - Registered the health probe service

## Future Enhancements

- Add more granular health checks for different components
- Implement health check metrics for monitoring
- Add configuration options for health check behavior
- Support for custom health check plugins

`internal/monitor/monitor.go`

```diff
@@ -67,6 +67,13 @@ type PowerMonitor struct {
    // state atomically across goroutines.
    exported atomic.Bool

    // lastCollectUnixNano tracks the last collection timestamp for liveness checks
    lastCollectUnixNano int64

    // healthCheckTolerance is the multiplier for interval tolerance in liveness checks
    healthCheckTolerance float64

    zonesNames []string // cache of all zones

    // Internal terminated workload trackers (not exposed)

@@ -103,6 +110,8 @@ func NewPowerMonitor(meter device.CPUPowerMeter, applyOpts ...OptionFn) *PowerMo
        maxTerminated:                opts.maxTerminated,
        minTerminatedEnergyThreshold: opts.minTerminatedEnergyThreshold,
        healthCheckTolerance:         opts.healthCheckTolerance,

        collectionCtx:    ctx,
        collectionCancel: cancel,

@@ -338,8 +347,13 @@ func (pm *PowerMonitor) refreshSnapshot() error {
    pm.exported.Store(false)

    // Update snapshot with current timestamp
    newSnapshot.Timestamp = pm.clock.Now()
    now := pm.clock.Now()
    newSnapshot.Timestamp = now
    pm.snapshot.Store(newSnapshot)

    // Update collection heartbeat for liveness checks
    atomic.StoreInt64(&pm.lastCollectUnixNano, now.UnixNano())

    pm.signalNewData()
    pm.logger.Debug("refreshSnapshot",
        "processes", len(newSnapshot.Processes),

@@ -429,3 +443,47 @@ func (pm *PowerMonitor) calculatePower(prev, newSnapshot *Snapshot) error {

    return nil
}

// IsLive checks if the monitor is alive and responsive
func (pm *PowerMonitor) IsLive(ctx context.Context) (bool, error) {
    if pm == nil {
        return false, fmt.Errorf("monitor is nil")
    }
    if pm.cpu == nil {
        return false, fmt.Errorf("CPU meter not initialized")
    }

    // If periodic collection is expected, require a recent heartbeat
    if pm.interval > 0 {
        lastNano := atomic.LoadInt64(&pm.lastCollectUnixNano)
        if lastNano == 0 {
            return false, fmt.Errorf("no collection heartbeat yet")
        }
        last := time.Unix(0, lastNano)
        tolerance := time.Duration(float64(pm.interval) * pm.healthCheckTolerance)
        if time.Since(last) > tolerance {
            return false, fmt.Errorf("collector stalled; last=%s, tolerance=%.1fx interval", last, pm.healthCheckTolerance)
        }
    }
    return true, nil
}

// IsReady checks if the monitor is ready to serve data
func (pm *PowerMonitor) IsReady(ctx context.Context) (bool, error) {
    if pm == nil {
        return false, fmt.Errorf("monitor is nil")
    }

    // Passive mode: ready even without periodic collection
    if pm.interval == 0 {
        return true, nil
    }

    // Active collection: require at least one snapshot
    if pm.snapshot.Load() == nil {
        return false, fmt.Errorf("no data yet")
    }
```

Comment on lines +483 to +485

This doesn't make it not ready. No snapshot only means that no scrape has been made, and does not mean the monitor is not in a ready state ...

Monitor is Ready once

I think the snapshot-based readiness check is actually better. Why: Run() can be called successfully, but if firstReading() or calculatePower() fail in refreshSnapshot() (called from run()), we get:

Don't you think that for a monitoring service, "ready" should mean "can provide data", not just "is running"? I have read this documentation: https://kubernetes.io/docs/reference/using-api/health-checks/

```diff
    return true, nil
}
```

why do we need 2 pm 🤔 ? Shouldn't the health-probe have access to all services and filter those that have liveness and readiness checks?

Also keep in mind that when a service's Init() is done and all services' Run() (see internal/service.Runner) are blocked, Kepler should be in a Ready state. (We may have to rethink the readiness probe; there is a chance to simplify it.)

The pm (PowerMonitor) is passed twice because it implements both the LiveChecker and ReadyChecker interfaces, serving as both the liveness and readiness probe checker. I wanted a clear split for both, but we can change it.