Skip to content

Commit 9a22e3e

Browse files
committed
container/libcontainer: fix memory hog and stats
The logic of the existing code of schedulerStatsFromProcs is to provide cumulative stats for all the processes inside a container. Once a process is dead, its stat entry is no longer updated, but is still used in the totals calculation. This creates two problems: (1) the pidMetricsCache map is ever-growing — with many short-lived processes in containers this can significantly increase kubelet memory usage; (2) if a new process with the same PID appears (as a result of PID reuse), the stats from the old process are overwritten, resulting in wrong totals (e.g. they can be lower than the previous reading, which should never happen). To kill these two birds with one stone, accumulate the stats of dead processes in pidMetricsSaved, and remove their entries from pidMetricsCache. Signed-off-by: Kir Kolyshkin <[email protected]>
1 parent 2b607f0 commit 9a22e3e

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

container/libcontainer/handler.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,10 @@ type Handler struct {
5454
rootFs string
5555
pid int
5656
includedMetrics container.MetricSet
57+
// pidMetricsCache holds CPU scheduler stats for existing processes (map key is PID) between calls to schedulerStatsFromProcs.
5758
pidMetricsCache map[int]*info.CpuSchedstat
59+
// pidMetricsSaved holds accumulated CPU scheduler stats for processes that no longer exist.
60+
pidMetricsSaved info.CpuSchedstat
5861
cycles uint64
5962
}
6063

@@ -314,6 +317,7 @@ func (h *Handler) schedulerStatsFromProcs() (info.CpuSchedstat, error) {
314317
if err != nil {
315318
return info.CpuSchedstat{}, fmt.Errorf("Could not get PIDs for container %d: %w", h.pid, err)
316319
}
320+
alivePids := make(map[int]struct{}, len(pids))
317321
for _, pid := range pids {
318322
f, err := os.Open(path.Join(h.rootFs, "proc", strconv.Itoa(pid), "schedstat"))
319323
if err != nil {
@@ -324,6 +328,7 @@ func (h *Handler) schedulerStatsFromProcs() (info.CpuSchedstat, error) {
324328
if err != nil {
325329
return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
326330
}
331+
alivePids[pid] = struct{}{}
327332
rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
328333
if len(rawMetrics) != 3 {
329334
return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
@@ -348,11 +353,20 @@ func (h *Handler) schedulerStatsFromProcs() (info.CpuSchedstat, error) {
348353
}
349354
}
350355
}
351-
schedstats := info.CpuSchedstat{}
352-
for _, v := range h.pidMetricsCache {
356+
schedstats := h.pidMetricsSaved // copy
357+
for p, v := range h.pidMetricsCache {
353358
schedstats.RunPeriods += v.RunPeriods
354359
schedstats.RunqueueTime += v.RunqueueTime
355360
schedstats.RunTime += v.RunTime
361+
if _, alive := alivePids[p]; !alive {
362+
// PID p is gone: accumulate its stats ...
363+
h.pidMetricsSaved.RunPeriods += v.RunPeriods
364+
h.pidMetricsSaved.RunqueueTime += v.RunqueueTime
365+
h.pidMetricsSaved.RunTime += v.RunTime
366+
// ... and remove its cache entry, to prevent
367+
// pidMetricsCache from growing.
368+
delete(h.pidMetricsCache, p)
369+
}
356370
}
357371
return schedstats, nil
358372
}

0 commit comments

Comments
 (0)