Skip to content

Commit f5d0824

Browse files
authored
Merge pull request #2979 from kolyshkin/fix-mem-hog
container/libcontainer: fix schedulerStatsFromProcs hogging memory and wrong stats
2 parents 14155ee + 9a22e3e commit f5d0824

File tree

1 file changed

+26
-13
lines changed

1 file changed

+26
-13
lines changed

container/libcontainer/handler.go

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,10 @@ type Handler struct {
5454
rootFs string
5555
pid int
5656
includedMetrics container.MetricSet
57+
// pidMetricsCache holds CPU scheduler stats for existing processes (map key is PID) between calls to schedulerStatsFromProcs.
5758
pidMetricsCache map[int]*info.CpuSchedstat
59+
// pidMetricsSaved holds accumulated CPU scheduler stats for processes that no longer exist.
60+
pidMetricsSaved info.CpuSchedstat
5861
cycles uint64
5962
}
6063

@@ -93,14 +96,9 @@ func (h *Handler) GetStats() (*info.ContainerStats, error) {
9396
stats := newContainerStats(libcontainerStats, h.includedMetrics)
9497

9598
if h.includedMetrics.Has(container.ProcessSchedulerMetrics) {
96-
pids, err := h.cgroupManager.GetAllPids()
99+
stats.Cpu.Schedstat, err = h.schedulerStatsFromProcs()
97100
if err != nil {
98-
klog.V(4).Infof("Could not get PIDs for container %d: %v", h.pid, err)
99-
} else {
100-
stats.Cpu.Schedstat, err = schedulerStatsFromProcs(h.rootFs, pids, h.pidMetricsCache)
101-
if err != nil {
102-
klog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
103-
}
101+
klog.V(4).Infof("Unable to get Process Scheduler Stats: %v", err)
104102
}
105103
}
106104

@@ -314,9 +312,14 @@ func processStatsFromProcs(rootFs string, cgroupPath string, rootPid int) (info.
314312
return processStats, nil
315313
}
316314

317-
func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]*info.CpuSchedstat) (info.CpuSchedstat, error) {
315+
func (h *Handler) schedulerStatsFromProcs() (info.CpuSchedstat, error) {
316+
pids, err := h.cgroupManager.GetAllPids()
317+
if err != nil {
318+
return info.CpuSchedstat{}, fmt.Errorf("Could not get PIDs for container %d: %w", h.pid, err)
319+
}
320+
alivePids := make(map[int]struct{}, len(pids))
318321
for _, pid := range pids {
319-
f, err := os.Open(path.Join(rootFs, "proc", strconv.Itoa(pid), "schedstat"))
322+
f, err := os.Open(path.Join(h.rootFs, "proc", strconv.Itoa(pid), "schedstat"))
320323
if err != nil {
321324
return info.CpuSchedstat{}, fmt.Errorf("couldn't open scheduler statistics for process %d: %v", pid, err)
322325
}
@@ -325,14 +328,15 @@ func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]
325328
if err != nil {
326329
return info.CpuSchedstat{}, fmt.Errorf("couldn't read scheduler statistics for process %d: %v", pid, err)
327330
}
331+
alivePids[pid] = struct{}{}
328332
rawMetrics := bytes.Split(bytes.TrimRight(contents, "\n"), []byte(" "))
329333
if len(rawMetrics) != 3 {
330334
return info.CpuSchedstat{}, fmt.Errorf("unexpected number of metrics in schedstat file for process %d", pid)
331335
}
332-
cacheEntry, ok := pidMetricsCache[pid]
336+
cacheEntry, ok := h.pidMetricsCache[pid]
333337
if !ok {
334338
cacheEntry = &info.CpuSchedstat{}
335-
pidMetricsCache[pid] = cacheEntry
339+
h.pidMetricsCache[pid] = cacheEntry
336340
}
337341
for i, rawMetric := range rawMetrics {
338342
metric, err := strconv.ParseUint(string(rawMetric), 10, 64)
@@ -349,11 +353,20 @@ func schedulerStatsFromProcs(rootFs string, pids []int, pidMetricsCache map[int]
349353
}
350354
}
351355
}
352-
schedstats := info.CpuSchedstat{}
353-
for _, v := range pidMetricsCache {
356+
schedstats := h.pidMetricsSaved // copy
357+
for p, v := range h.pidMetricsCache {
354358
schedstats.RunPeriods += v.RunPeriods
355359
schedstats.RunqueueTime += v.RunqueueTime
356360
schedstats.RunTime += v.RunTime
361+
if _, alive := alivePids[p]; !alive {
362+
// PID p is gone: accumulate its stats ...
363+
h.pidMetricsSaved.RunPeriods += v.RunPeriods
364+
h.pidMetricsSaved.RunqueueTime += v.RunqueueTime
365+
h.pidMetricsSaved.RunTime += v.RunTime
366+
// ... and remove its cache entry, to prevent
367+
// pidMetricsCache from growing.
368+
delete(h.pidMetricsCache, p)
369+
}
357370
}
358371
return schedstats, nil
359372
}

0 commit comments

Comments
 (0)