Skip to content

Commit 6b23ac7

Browse files
dqminh, Felix Ehrenpfort
authored and committed
Expose PSI metrics with prometheus
This adds support for reading PSI metrics via Prometheus. We expose the following for `psi_total`: ``` container_cpu_psi_total_seconds container_memory_psi_total_seconds container_io_psi_total_seconds ``` And for `psi_avg`: ``` container_cpu_psi_avg10_ratio container_cpu_psi_avg60_ratio container_cpu_psi_avg300_ratio container_memory_psi_avg10_ratio container_memory_psi_avg60_ratio container_memory_psi_avg300_ratio container_io_psi_avg10_ratio container_io_psi_avg60_ratio container_io_psi_avg300_ratio ``` Signed-off-by: Daniel Dao <[email protected]>
1 parent d3fefb9 commit 6b23ac7

File tree

4 files changed

+216
-0
lines changed

4 files changed

+216
-0
lines changed

metrics/prometheus.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1746,6 +1746,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
17461746
})
17471747
}
17481748

1749+
if includedMetrics.Has(container.PSITotalMetrics) {
1750+
c.containerMetrics = append(c.containerMetrics, []containerMetric{
1751+
{
1752+
name: "container_cpu_psi_total_seconds",
1753+
help: "Total time spent under cpu pressure in seconds.",
1754+
valueType: prometheus.CounterValue,
1755+
extraLabels: []string{"kind"},
1756+
getValues: func(s *info.ContainerStats) metricValues {
1757+
return getPSIValues(s, &s.Cpu.PSI, "total")
1758+
},
1759+
}, {
1760+
name: "container_memory_psi_total_seconds",
1761+
help: "Total container time spent under memory pressure in seconds.",
1762+
valueType: prometheus.CounterValue,
1763+
extraLabels: []string{"kind"},
1764+
getValues: func(s *info.ContainerStats) metricValues {
1765+
return getPSIValues(s, &s.Memory.PSI, "total")
1766+
},
1767+
}, {
1768+
name: "container_io_psi_total_seconds",
1769+
help: "Total time spent under io pressure in seconds.",
1770+
valueType: prometheus.CounterValue,
1771+
extraLabels: []string{"kind"},
1772+
getValues: func(s *info.ContainerStats) metricValues {
1773+
return getPSIValues(s, &s.DiskIo.PSI, "total")
1774+
},
1775+
},
1776+
}...)
1777+
}
1778+
1779+
if includedMetrics.Has(container.PSIAvgMetrics) {
1780+
makePSIAvgMetric := func(controller, window string) containerMetric {
1781+
return containerMetric{
1782+
name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
1783+
help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
1784+
valueType: prometheus.GaugeValue,
1785+
extraLabels: []string{"kind"},
1786+
getValues: func(s *info.ContainerStats) metricValues {
1787+
switch controller {
1788+
case "cpu":
1789+
return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
1790+
case "memory":
1791+
return getPSIValues(s, &s.Memory.PSI, "avg"+window)
1792+
case "io":
1793+
return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
1794+
default:
1795+
return nil
1796+
}
1797+
},
1798+
}
1799+
}
1800+
for _, controller := range []string{"cpu", "memory", "io"} {
1801+
for _, window := range []string{"10", "60", "300"} {
1802+
c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
1803+
}
1804+
}
1805+
}
1806+
17491807
return c
17501808
}
17511809

@@ -2038,3 +2096,23 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
20382096
}
20392097
return values
20402098
}
2099+
2100+
func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
2101+
v := make(metricValues, 0, 2)
2102+
switch psiMetric {
2103+
case "avg10":
2104+
v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}})
2105+
v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}})
2106+
case "avg60":
2107+
v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}})
2108+
v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}})
2109+
case "avg300":
2110+
v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}})
2111+
v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}})
2112+
case "total":
2113+
// total is measured as microseconds
2114+
v = append(v, metricValue{value: float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"some"}})
2115+
v = append(v, metricValue{value: float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"full"}})
2116+
}
2117+
return v
2118+
}

metrics/prometheus_fake.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
328328
},
329329
LoadAverage: 2,
330330
LoadDAverage: 2,
331+
PSI: info.PSIStats{
332+
Some: info.PSIData{
333+
Avg10: 0.1,
334+
Avg60: 0.2,
335+
Avg300: 0.3,
336+
Total: 100,
337+
},
338+
Full: info.PSIData{
339+
Avg10: 0.4,
340+
Avg60: 0.5,
341+
Avg300: 0.6,
342+
Total: 200,
343+
},
344+
},
331345
},
332346
Memory: info.MemoryStats{
333347
Usage: 8,
@@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
358372
MappedFile: 16,
359373
KernelUsage: 17,
360374
Swap: 8192,
375+
PSI: info.PSIStats{
376+
Some: info.PSIData{
377+
Avg10: 0.01,
378+
Avg60: 0.02,
379+
Avg300: 0.03,
380+
Total: 1000,
381+
},
382+
Full: info.PSIData{
383+
Avg10: 0.04,
384+
Avg60: 0.05,
385+
Avg300: 0.06,
386+
Total: 2000,
387+
},
388+
},
361389
},
362390
Hugetlb: map[string]info.HugetlbStats{
363391
"2Mi": {
@@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
550578
"Write": 6,
551579
},
552580
}},
581+
PSI: info.PSIStats{
582+
Some: info.PSIData{
583+
Avg10: 0.11,
584+
Avg60: 0.12,
585+
Avg300: 0.13,
586+
Total: 1111,
587+
},
588+
Full: info.PSIData{
589+
Avg10: 0.14,
590+
Avg60: 0.15,
591+
Avg300: 0.16,
592+
Total: 2222,
593+
},
594+
},
553595
},
554596
Filesystem: []info.FsStats{
555597
{

metrics/testdata/prometheus_metrics

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000

metrics/testdata/prometheus_metrics_whitelist_filtered

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer"
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000

0 commit comments

Comments
 (0)