Skip to content

Commit b611eea

Browse files
authored
Merge pull request #2199 from sthaha/feat-config-max-terminated
feat(monitor): add configurable limit for terminated workloads tracking
2 parents b12f34a + 2fbc24f commit b611eea

File tree

9 files changed

+137
-27
lines changed

9 files changed

+137
-27
lines changed

cmd/kepler/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ func createServices(logger *slog.Logger, cfg *config.Config) ([]service.Service,
153153
monitor.WithResourceInformer(resourceInformer),
154154
monitor.WithInterval(cfg.Monitor.Interval),
155155
monitor.WithMaxStaleness(cfg.Monitor.Staleness),
156+
monitor.WithMaxTerminated(cfg.Monitor.MaxTerminated),
156157
)
157158

158159
apiServer := server.NewAPIServer(

compose/dev/kepler-dev/etc/kepler/config.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ monitor:
2020
# NOTE: Keep staleness shorter than the monitor interval.
2121
staleness: 1000ms
2222

23+
# maximum number of terminated workloads (process, container, VM, pods)
24+
# to be kept in memory until the data is exported; 0 disables the limit
25+
maxTerminated: 500
26+
2327
host:
2428
sysfs: /host/sys # Path to sysfs filesystem (default: /sys)
2529
procfs: /host/proc # Path to procfs filesystem (default: /proc)

config/config.go

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ type (
4646
Monitor struct {
4747
Interval time.Duration `yaml:"interval"` // Interval for monitoring resources
4848
Staleness time.Duration `yaml:"staleness"` // Time after which calculated values are considered stale
49+
50+
MaxTerminated int `yaml:"maxTerminated"`
4951
}
5052

5153
// Exporter configuration
@@ -147,8 +149,9 @@ const (
147149
HostSysFSFlag = "host.sysfs"
148150
HostProcFSFlag = "host.procfs"
149151

150-
MonitorIntervalFlag = "monitor.interval"
151-
MonitorStaleness = "monitor.staleness" // not a flag
152+
MonitorIntervalFlag = "monitor.interval"
153+
MonitorStaleness = "monitor.staleness" // not a flag
154+
MonitorMaxTerminatedFlag = "monitor.max-terminated"
152155

153156
// RAPL
154157
RaplZones = "rapl.zones" // not a flag
@@ -190,6 +193,8 @@ func DefaultConfig() *Config {
190193
Monitor: Monitor{
191194
Interval: 5 * time.Second,
192195
Staleness: 500 * time.Millisecond,
196+
197+
MaxTerminated: 500,
193198
},
194199
Exporter: Exporter{
195200
Stdout: StdoutExporter{
@@ -286,6 +291,8 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
286291
// monitor
287292
monitorInterval := app.Flag(MonitorIntervalFlag,
288293
"Interval for monitoring resources (processes, container, vm, etc...); 0 to disable").Default("5s").Duration()
294+
maxTerminated := app.Flag(MonitorMaxTerminatedFlag,
295+
"Maximum number of terminated workloads to keep in memory until exported; 0 for unlimited").Default("500").Int()
289296

290297
enablePprof := app.Flag(pprofEnabledFlag, "Enable pprof debug endpoints").Default("false").Bool()
291298
webConfig := app.Flag(WebConfigFlag, "Web config file path").Default("").String()
@@ -295,7 +302,7 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
295302

296303
prometheusExporterEnabled := app.Flag(ExporterPrometheusEnabledFlag, "Enable Prometheus exporter").Default("true").Bool()
297304

298-
var metricsLevel = metrics.MetricsLevelNode | metrics.MetricsLevelProcess | metrics.MetricsLevelContainer | metrics.MetricsLevelVM | metrics.MetricsLevelPod
305+
metricsLevel := metrics.MetricsLevelNode | metrics.MetricsLevelProcess | metrics.MetricsLevelContainer | metrics.MetricsLevelVM | metrics.MetricsLevelPod
299306
app.Flag(ExporterPrometheusMetricsFlag, "Metrics levels to export (node,process,container,vm,pod)").SetValue(NewMetricsLevelValue(&metricsLevel))
300307

301308
kubernetes := app.Flag(KubernetesFlag, "Monitor kubernetes").Default("false").Bool()
@@ -325,6 +332,10 @@ func RegisterFlags(app *kingpin.Application) ConfigUpdaterFn {
325332
cfg.Monitor.Interval = *monitorInterval
326333
}
327334

335+
if flagsSet[MonitorMaxTerminatedFlag] {
336+
cfg.Monitor.MaxTerminated = *maxTerminated
337+
}
338+
328339
if flagsSet[pprofEnabledFlag] {
329340
cfg.Debug.Pprof.Enabled = enablePprof
330341
}
@@ -434,6 +445,9 @@ func (c *Config) Validate(skips ...SkipValidation) error {
434445
if c.Monitor.Staleness < 0 {
435446
errs = append(errs, fmt.Sprintf("invalid monitor staleness: %s can't be negative", c.Monitor.Staleness))
436447
}
448+
if c.Monitor.MaxTerminated < 0 {
449+
errs = append(errs, fmt.Sprintf("invalid monitor max terminated: %d can't be negative", c.Monitor.MaxTerminated))
450+
}
437451
}
438452
{ // Kubernetes
439453
if ptr.Deref(c.Kube.Enabled, false) {
@@ -514,6 +528,7 @@ func (c *Config) manualString() string {
514528
{HostProcFSFlag, c.Host.ProcFS},
515529
{MonitorIntervalFlag, c.Monitor.Interval.String()},
516530
{MonitorStaleness, c.Monitor.Staleness.String()},
531+
{MonitorMaxTerminatedFlag, fmt.Sprintf("%d", c.Monitor.MaxTerminated)},
517532
{RaplZones, strings.Join(c.Rapl.Zones, ", ")},
518533
{ExporterStdoutEnabledFlag, fmt.Sprintf("%v", c.Exporter.Stdout.Enabled)},
519534
{ExporterPrometheusEnabledFlag, fmt.Sprintf("%v", c.Exporter.Prometheus.Enabled)},

config/config_test.go

Lines changed: 69 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -627,14 +627,30 @@ func TestMonitorConfig(t *testing.T) {
627627
cfg.Monitor.Staleness = 100
628628
assert.NoError(t, cfg.Validate())
629629
})
630+
631+
t.Run("maxTerminated", func(t *testing.T) {
632+
cfg := DefaultConfig()
633+
assert.Equal(t, 500, cfg.Monitor.MaxTerminated, "default maxTerminated should be 500")
634+
assert.NoError(t, cfg.Validate())
635+
636+
cfg.Monitor.MaxTerminated = -10
637+
assert.ErrorContains(t, cfg.Validate(), "invalid configuration: invalid monitor max terminated")
638+
639+
cfg.Monitor.MaxTerminated = 0
640+
assert.NoError(t, cfg.Validate(), "maxTerminated=0 should be valid (unlimited)")
641+
642+
cfg.Monitor.MaxTerminated = 1000
643+
assert.NoError(t, cfg.Validate())
644+
})
630645
}
631646

632647
func TestMonitorConfigFlags(t *testing.T) {
633648
type expect struct {
634-
interval time.Duration
635-
staleness time.Duration
636-
parseError error
637-
cfgErr error
649+
interval time.Duration
650+
staleness time.Duration
651+
maxTerminated int
652+
parseError error
653+
cfgErr error
638654
}
639655
tt := []struct {
640656
name string
@@ -643,7 +659,7 @@ func TestMonitorConfigFlags(t *testing.T) {
643659
}{{
644660
name: "default",
645661
args: []string{},
646-
expected: expect{interval: 5 * time.Second, staleness: 500 * time.Millisecond, parseError: nil},
662+
expected: expect{interval: 5 * time.Second, staleness: 500 * time.Millisecond, maxTerminated: 500, parseError: nil},
647663
}, {
648664
name: "invalid-interval flag",
649665
args: []string{"--monitor.interval=-10Fs"},
@@ -652,6 +668,18 @@ func TestMonitorConfigFlags(t *testing.T) {
652668
name: "invalid-interval",
653669
args: []string{"--monitor.interval=-10s"},
654670
expected: expect{cfgErr: fmt.Errorf("invalid configuration: invalid monitor interval")},
671+
}, {
672+
name: "valid-max-terminated",
673+
args: []string{"--monitor.max-terminated=1000"},
674+
expected: expect{interval: 5 * time.Second, staleness: 500 * time.Millisecond, maxTerminated: 1000, parseError: nil},
675+
}, {
676+
name: "max-terminated-zero",
677+
args: []string{"--monitor.max-terminated=0"},
678+
expected: expect{interval: 5 * time.Second, staleness: 500 * time.Millisecond, maxTerminated: 0, parseError: nil},
679+
}, {
680+
name: "invalid-max-terminated",
681+
args: []string{"--monitor.max-terminated=-10"},
682+
expected: expect{cfgErr: fmt.Errorf("invalid configuration: invalid monitor max terminated")},
655683
}}
656684

657685
for _, tc := range tt {
@@ -676,10 +704,46 @@ func TestMonitorConfigFlags(t *testing.T) {
676704
assert.NoError(t, err, "unexpected config update error")
677705
assert.Equal(t, cfg.Monitor.Interval, tc.expected.interval)
678706
assert.Equal(t, cfg.Monitor.Staleness, tc.expected.staleness)
707+
assert.Equal(t, cfg.Monitor.MaxTerminated, tc.expected.maxTerminated)
679708
})
680709
}
681710
}
682711

712+
func TestMonitorMaxTerminatedYAML(t *testing.T) {
713+
t.Run("yaml-config-maxTerminated", func(t *testing.T) {
714+
yamlData := `
715+
monitor:
716+
maxTerminated: 1000
717+
`
718+
reader := strings.NewReader(yamlData)
719+
cfg, err := Load(reader)
720+
assert.NoError(t, err)
721+
assert.Equal(t, 1000, cfg.Monitor.MaxTerminated)
722+
})
723+
724+
t.Run("yaml-config-maxTerminated-zero", func(t *testing.T) {
725+
yamlData := `
726+
monitor:
727+
maxTerminated: 0
728+
`
729+
reader := strings.NewReader(yamlData)
730+
cfg, err := Load(reader)
731+
assert.NoError(t, err)
732+
assert.Equal(t, 0, cfg.Monitor.MaxTerminated)
733+
})
734+
735+
t.Run("yaml-config-maxTerminated-invalid", func(t *testing.T) {
736+
yamlData := `
737+
monitor:
738+
maxTerminated: -100
739+
`
740+
reader := strings.NewReader(yamlData)
741+
_, err := Load(reader)
742+
assert.Error(t, err)
743+
assert.Contains(t, err.Error(), "invalid monitor max terminated")
744+
})
745+
}
746+
683747
func TestConfigDefault(t *testing.T) {
684748
cfg := DefaultConfig()
685749

docs/configuration/configuration.md

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ You can configure Kepler by passing flags when starting the service. The followi
2323
| `--host.sysfs` | Path to sysfs filesystem | `/sys` | Any valid directory path |
2424
| `--host.procfs` | Path to procfs filesystem | `/proc` | Any valid directory path |
2525
| `--monitor.interval` | Monitor refresh interval | `5s` | Any valid duration |
26+
| `--monitor.max-terminated` | Maximum number of terminated workloads to keep in memory until exported | `500` | Any non-negative integer (0 for unlimited) |
2627
| `--web.config-file` | Path to TLS server config file | `""` | Any valid file path |
2728
| `--debug.pprof` | Enable pprof debugging endpoints | `false` | `true`, `false` |
2829
| `--exporter.stdout` | Enable stdout exporter | `false` | `true`, `false` |
@@ -55,6 +56,12 @@ kepler --metrics=node --metrics=container
5556

5657
# Export only process level metrics
5758
kepler --metrics=process
59+
60+
# Set maximum terminated workloads to 1000
61+
kepler --monitor.max-terminated=1000
62+
63+
# Remove the limit on terminated workloads kept in memory (unlimited)
64+
kepler --monitor.max-terminated=0
5865
```
5966

6067
## 🗂️ Configuration File
@@ -69,8 +76,9 @@ log:
6976
format: text # text or json (default: text)
7077

7178
monitor:
72-
interval: 5s # Monitor refresh interval (default: 5s)
73-
staleness: 1000ms # Duration after which data is considered stale (default: 1000ms)
79+
interval: 5s # Monitor refresh interval (default: 5s)
80+
staleness: 1000ms # Duration after which data is considered stale (default: 1000ms)
81+
maxTerminated: 500 # Maximum number of terminated workloads to keep in memory (default: 500)
7482

7583
host:
7684
sysfs: /sys # Path to sysfs filesystem (default: /sys)
@@ -139,12 +147,15 @@ log:
139147
monitor:
140148
interval: 5s
141149
staleness: 1000ms
150+
maxTerminated: 500
142151
```
143152

144153
- **interval**: The monitor's refresh interval. All processes with a lifetime less than this interval will be ignored. Setting to 0s disables monitor refreshes.
145154

146155
- **staleness**: Duration after which data computed by the monitor is considered stale and recomputed when requested again. Especially useful when multiple Prometheus instances are scraping Kepler, ensuring they receive the same data within the staleness window. Should be shorter than the monitor interval.
147156

157+
- **maxTerminated**: Maximum number of terminated workloads (processes, containers, VMs, pods) to keep in memory until the data is exported. This prevents unbounded memory growth in high-churn environments. Set to 0 to disable the limit. When the limit is reached, the least power-consuming terminated workloads are removed first.
158+
148159
### 🗄️ Host Configuration
149160

150161
```yaml

hack/config.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@ monitor:
2020
# NOTE: Keep staleness shorter than the monitor interval.
2121
staleness: 1000ms
2222

23+
# maximum number of terminated workloads (process, container, VM, pods)
24+
# to be kept in memory until the data is exported; 0 disables the limit
25+
maxTerminated: 500
26+
2327
host:
2428
sysfs: /sys # Path to sysfs filesystem (default: /sys)
2529
procfs: /proc # Path to procfs filesystem (default: /proc)
@@ -55,7 +59,6 @@ kube: # kubernetes related config
5559
config: "" # path to kubeconfig file (optional if running in-cluster)
5660
nodeName: "" # name of the kubernetes node (required when enabled)
5761

58-
5962
# WARN DO NOT ENABLE THIS IN PRODUCTION - for development / testing only
6063
dev:
6164
fake-cpu-meter:

internal/monitor/monitor.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,11 @@ type PowerMonitor struct {
4040
logger *slog.Logger
4141
cpu device.CPUPowerMeter
4242

43-
interval time.Duration
44-
clock clock.WithTicker
45-
maxStaleness time.Duration
46-
resources resource.Informer
43+
interval time.Duration
44+
clock clock.WithTicker
45+
maxStaleness time.Duration
46+
maxTerminated int
47+
resources resource.Informer
4748

4849
// signals when a snapshot has been updated
4950
dataCh chan struct{}
@@ -86,6 +87,7 @@ func NewPowerMonitor(meter device.CPUPowerMeter, applyOpts ...OptionFn) *PowerMo
8687
resources: opts.resources,
8788
dataCh: make(chan struct{}, 1),
8889
maxStaleness: opts.maxStaleness,
90+
maxTerminated: opts.maxTerminated,
8991
collectionCtx: ctx,
9092
collectionCancel: cancel,
9193
}

internal/monitor/options.go

Lines changed: 21 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,25 @@ import (
1212
)
1313

1414
type Opts struct {
15-
logger *slog.Logger
16-
sysfsPath string
17-
interval time.Duration
18-
clock clock.WithTicker
19-
maxStaleness time.Duration
20-
resources resource.Informer
15+
logger *slog.Logger
16+
sysfsPath string
17+
interval time.Duration
18+
clock clock.WithTicker
19+
maxStaleness time.Duration
20+
maxTerminated int
21+
resources resource.Informer
2122
}
2223

2324
// NewConfig returns a new Config with defaults set
2425
func DefaultOpts() Opts {
2526
return Opts{
26-
logger: slog.Default(),
27-
sysfsPath: "/sys",
28-
interval: 5 * time.Second,
29-
clock: clock.RealClock{},
30-
maxStaleness: 500 * time.Millisecond,
31-
resources: nil,
27+
logger: slog.Default(),
28+
sysfsPath: "/sys",
29+
interval: 5 * time.Second,
30+
clock: clock.RealClock{},
31+
maxStaleness: 500 * time.Millisecond,
32+
maxTerminated: 500,
33+
resources: nil,
3234
}
3335
}
3436

@@ -69,3 +71,10 @@ func WithResourceInformer(r resource.Informer) OptionFn {
6971
o.resources = r
7072
}
7173
}
74+
75+
// WithMaxTerminated sets the maximum number of terminated workloads to keep in memory
76+
func WithMaxTerminated(max int) OptionFn {
77+
return func(o *Opts) {
78+
o.maxTerminated = max
79+
}
80+
}

manifests/k8s/configmap.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ data:
1717
monitor:
1818
interval: 5s
1919
staleness: 500ms
20+
maxTerminated: 100
2021
rapl:
2122
zones: []
2223
exporter:

0 commit comments

Comments
 (0)