Skip to content

Commit 450c6c3

Browse files
authored
Merge pull request #410 from xueweiz/stats
Collect more CPU/disk/memory metrics
2 parents aadb2b8 + 8c02c6d commit 450c6c3

File tree

143 files changed

+6940
-9119
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

143 files changed

+6940
-9119
lines changed

config/system-stats-monitor.json

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,14 @@
11
{
2+
"cpu": {
3+
"metricsConfigs": {
4+
"cpu/runnable_task_count": {
5+
"displayName": "cpu/runnable_task_count"
6+
},
7+
"cpu/usage_time": {
8+
"displayName": "cpu/usage_time"
9+
}
10+
}
11+
},
212
"disk": {
313
"metricsConfigs": {
414
"disk/io_time": {
@@ -9,6 +19,21 @@
919
},
1020
"disk/avg_queue_len": {
1121
"displayName": "disk/avg_queue_len"
22+
},
23+
"disk/operation_count": {
24+
"displayName": "disk/operation_count"
25+
},
26+
"disk/merged_operation_count": {
27+
"displayName": "disk/merged_operation_count"
28+
},
29+
"disk/operation_bytes_count": {
30+
"displayName": "disk/operation_bytes_count"
31+
},
32+
"disk/operation_time": {
33+
"displayName": "disk/operation_time"
34+
},
35+
"disk/bytes_used": {
36+
"displayName": "disk/bytes_used"
1237
}
1338
},
1439
"includeRootBlk": true,
@@ -22,5 +47,24 @@
2247
}
2348
}
2449
},
50+
"memory": {
51+
"metricsConfigs": {
52+
"memory/bytes_used": {
53+
"displayName": "memory/bytes_used"
54+
},
55+
"memory/anonymous_used": {
56+
"displayName": "memory/anonymous_used"
57+
},
58+
"memory/page_cache_used": {
59+
"displayName": "memory/page_cache_used"
60+
},
61+
"memory/unevictable_used": {
62+
"displayName": "memory/unevictable_used"
63+
},
64+
"memory/dirty_used": {
65+
"displayName": "memory/dirty_used"
66+
}
67+
}
68+
},
2569
"invokeInterval": "60s"
2670
}

go.mod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@ require (
2020
github.com/pborman/uuid v1.2.0
2121
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90
2222
github.com/prometheus/common v0.4.1
23-
github.com/shirou/gopsutil v2.18.12+incompatible
23+
github.com/prometheus/procfs v0.0.8
24+
github.com/shirou/gopsutil v2.19.12+incompatible
2425
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 // indirect
2526
github.com/sigma/go-inotify v0.0.0-20181102212354-c87b6cf5033d // indirect
2627
github.com/spf13/pflag v1.0.3

go.sum

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -297,13 +297,17 @@ github.com/prometheus/procfs v0.0.0-20190117184657-bf6a532e95b1/go.mod h1:c3At6R
297297
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
298298
github.com/prometheus/procfs v0.0.4 h1:w8DjqFMJDjuVwdZBQoOozr4MVWOnwF7RcL/7uxBjY78=
299299
github.com/prometheus/procfs v0.0.4/go.mod h1:4A/X28fw3Fc593LaREMrKMqOKvUAntwMDaekg4FpcdQ=
300+
github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8=
301+
github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A=
300302
github.com/rcrowley/go-metrics v0.0.0-20181016184325-3113b8401b8a/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4=
301303
github.com/rogpeppe/go-internal v1.1.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
302304
github.com/rogpeppe/go-internal v1.3.2/go.mod h1:xXDCJY+GAPziupqXw64V24skbSoqbTEfhy4qGm1nDQc=
303305
github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR/rfWxYHBV53g=
304306
github.com/satori/go.uuid v0.0.0-20160713180306-0aa62d5ddceb/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0=
305307
github.com/shirou/gopsutil v2.18.12+incompatible h1:1eaJvGomDnH74/5cF4CTmTbLHAriGFsTZppLXDX93OM=
306308
github.com/shirou/gopsutil v2.18.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
309+
github.com/shirou/gopsutil v2.19.12+incompatible h1:WRstheAymn1WOPesh+24+bZKFkqrdCR8JOc77v4xV3Q=
310+
github.com/shirou/gopsutil v2.19.12+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
307311
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4 h1:udFKJ0aHUL60LboW/A+DfgoHVedieIzIXE8uylPue0U=
308312
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
309313
github.com/shurcooL/githubv4 v0.0.0-20180925043049-51d7b505e2e9/go.mod h1:hAF0iLZy4td2EX+/8Tw+4nodhlMrwN3HupfaXj3zkGo=
@@ -407,6 +411,8 @@ golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJ
407411
golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
408412
golang.org/x/sync v0.0.0-20190423024810-112230192c58 h1:8gQV6CLnAEikrhgkHFbMAEhagSSnXWGV915qUMm9mrU=
409413
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
414+
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e h1:vcxGaoTs7kV8m5Np9uUNQin4BrLOthgV7252N8V+FwY=
415+
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
410416
golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
411417
golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
412418
golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=

pkg/exporters/stackdriver/stackdriver_exporter.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,24 @@ func init() {
4747
const exporterName = "stackdriver"
4848

4949
var NPDMetricToSDMetric = map[metrics.MetricID]string{
50-
metrics.HostUptimeID: "compute.googleapis.com/guest/system/uptime",
51-
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
52-
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
53-
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
54-
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
55-
metrics.DiskWeightedIOID: "compute.googleapis.com/guest/disk/weighted_io_time",
50+
metrics.CPURunnableTaskCountID: "compute.googleapis.com/guest/cpu/runnable_task_count",
51+
metrics.CPUUsageTimeID: "compute.googleapis.com/guest/cpu/usage_time",
52+
metrics.DiskAvgQueueLenID: "compute.googleapis.com/guest/disk/queue_length",
53+
metrics.DiskBytesUsedID: "compute.googleapis.com/guest/disk/bytes_used",
54+
metrics.DiskIOTimeID: "compute.googleapis.com/guest/disk/io_time",
55+
metrics.DiskMergedOpsCountID: "compute.googleapis.com/guest/disk/merged_operation_count",
56+
metrics.DiskOpsBytesID: "compute.googleapis.com/guest/disk/operation_bytes_count",
57+
metrics.DiskOpsCountID: "compute.googleapis.com/guest/disk/operation_count",
58+
metrics.DiskOpsTimeID: "compute.googleapis.com/guest/disk/operation_time",
59+
metrics.DiskWeightedIOID: "compute.googleapis.com/guest/disk/weighted_io_time",
60+
metrics.HostUptimeID: "compute.googleapis.com/guest/system/uptime",
61+
metrics.MemoryAnonymousUsedID: "compute.googleapis.com/guest/memory/anonymous_used",
62+
metrics.MemoryBytesUsedID: "compute.googleapis.com/guest/memory/bytes_used",
63+
metrics.MemoryDirtyUsedID: "compute.googleapis.com/guest/memory/dirty_used",
64+
metrics.MemoryPageCacheUsedID: "compute.googleapis.com/guest/memory/page_cache_used",
65+
metrics.MemoryUnevictableUsedID: "compute.googleapis.com/guest/memory/unevictable_used",
66+
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
67+
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
5668
}
5769

5870
func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {

pkg/systemstatsmonitor/README.md

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,67 @@
44

55
Currently supported components are:
66

7+
* cpu
78
* disk
9+
* host
10+
* memory
811

912
See example config file [here](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).
1013

14+
By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scraping endpoint.
15+
1116
## Detailed Configuration Options
1217

1318
### Global Configurations
1419

1520
Data collection period can be specified globally in the config file, see `invokeInterval` at the [example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json).
1621

22+
### CPU
23+
24+
Below metrics are collected from `cpu` component:
25+
26+
* `cpu_runnable_task_count`: The average number of runnable tasks in the run-queue during the last minute. Collected from [`/proc/loadavg`][/proc doc].
27+
* `cpu_usage_time`: CPU usage, in seconds. The [CPU state][/proc doc] for the corresponding usage is reported under the `state` metric label (e.g. `user`, `nice`, `system`...).
28+
29+
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
30+
1731
### Disk
1832

1933
Below metrics are collected from `disk` component:
2034

21-
* `disk/io_time`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
22-
* `disk/weighted_io`: [# of milliseconds spent doing I/Os on this device](https://www.kernel.org/doc/Documentation/iostats.txt)
23-
* `disk/avg_queue_len`: [average # of requests that was waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
35+
* `disk_io_time`: [# of milliseconds spent doing I/Os on this device][iostat doc]
36+
* `disk_weighted_io`: [weighted # of milliseconds spent doing I/Os on this device][iostat doc]
37+
* `disk_avg_queue_len`: [average # of requests that were waiting in queue or being serviced during the last `invokeInterval`](https://www.xaprb.com/blog/2010/01/09/how-linux-iostat-computes-its-results/)
38+
* `disk_operation_count`: [# of reads/writes completed][iostat doc]
39+
* `disk_merged_operation_count`: [# of reads/writes merged][iostat doc]
40+
* `disk_operation_bytes_count`: # of Bytes used for reads/writes on this device
41+
* `disk_operation_time`: [# of milliseconds spent reading/writing][iostat doc]
42+
* `disk_bytes_used`: Disk usage in Bytes. The usage state is reported under the `state` metric label (e.g. `used`, `free`). Summing values of all states yields the disk size.
43+
44+
The name of the disk block device is reported in the `device_name` metric label (e.g. `sda`).
2445

25-
By setting the `metricsConfigs` field and `displayName` field ([example](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json)), you can specify the list of metrics to be collected, and their display names on the Prometheus scaping endpoint. The name of the disk block device will be reported in the `device` metrics label.
46+
For the metrics that separate read/write operations, the IO direction is reported in the `direction` metric label (e.g. `read`, `write`).
2647

2748
And a few other options:
28-
* `includeRootBlk`: When set to `true`, add all block devices that's [not a slave or holder device](http://man7.org/linux/man-pages/man8/lsblk.8.html) to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
49+
* `includeRootBlk`: When set to `true`, add all block devices that are [not a slave or holder device][lsblk doc] to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
2950
* `includeAllAttachedBlk`: When set to `true`, add all currently attached block devices to the list of disks that System Stats Monitor collects metrics from. When set to `false`, do not modify the list of disks that System Stats Monitor collects metrics from.
30-
* `lsblkTimeout`: System Stats Monitor uses [`lsblk`](http://man7.org/linux/man-pages/man8/lsblk.8.html) to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.
51+
* `lsblkTimeout`: System Stats Monitor uses [`lsblk`][lsblk doc] to retrieve block devices information. This option sets the timeout for calling `lsblk` commands.
52+
53+
[iostat doc]: https://www.kernel.org/doc/Documentation/iostats.txt
54+
[lsblk doc]: http://man7.org/linux/man-pages/man8/lsblk.8.html
55+
56+
### Host
57+
58+
Below metrics are collected from `host` component:
59+
60+
* `host_uptime`: The uptime of the operating system, in seconds. The OS version and kernel version are reported under the `os_version` and `kernel_version` metric labels (e.g. `cos 73-11647.217.0`, `4.14.127+`).
61+
62+
### Memory
63+
64+
Below metrics are collected from `memory` component:
65+
66+
* `memory_bytes_used`: Memory usage by each memory state, in Bytes. The memory state is reported under the `state` metric label (e.g. `free`, `used`, `buffered`...). Summing values of all states yields the total memory of the node.
67+
* `memory_anonymous_used`: Anonymous memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not swapped until needed. Summing values of all states yields the total anonymous memory used.
68+
* `memory_page_cache_used`: Page cache memory usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `active`, `inactive`). `active` means the memory has been used more recently and usually not reclaimed until needed. Summing values of all states yields the total page cache memory used.
69+
* `memory_unevictable_used`: [Unevictable memory][/proc doc] usage, in Bytes.
70+
* `memory_dirty_used`: Dirty pages usage, in Bytes. Memory usage state is reported under the `state` metric label (e.g. `dirty`, `writeback`). `dirty` means the memory is waiting to be written back to disk, and `writeback` means the memory is actively being written back to disk.
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
/*
2+
Copyright 2020 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package systemstatsmonitor
18+
19+
import (
20+
"github.com/golang/glog"
21+
"github.com/shirou/gopsutil/cpu"
22+
"github.com/shirou/gopsutil/load"
23+
24+
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
25+
"k8s.io/node-problem-detector/pkg/util/metrics"
26+
)
27+
28+
// clockTick is the ratio between 1 second and 1 USER_HZ (a clock tick).
29+
//
30+
// CLK_TCK is 100 in most architectures. If NPD ever runs on a super special architecture,
31+
// we can work out a way to detect the clock tick on that architecture (might require
32+
// cross-compilation with C library or parsing kernel ABIs). For now, it's not worth the
33+
// complexity.
34+
//
35+
// See documentation at http://man7.org/linux/man-pages/man5/proc.5.html
36+
const clockTick = float64(100) // assumed USER_HZ ticks per second; hard-coded, not detected at runtime
37+
38+
type cpuCollector struct {
39+
mRunnableTaskCount *metrics.Float64Metric
40+
mUsageTime *metrics.Float64Metric
41+
42+
config *ssmtypes.CPUStatsConfig
43+
44+
lastUsageTime map[string]float64
45+
}
46+
47+
func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
48+
cc := cpuCollector{config: cpuConfig}
49+
50+
var err error
51+
52+
cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
53+
metrics.CPURunnableTaskCountID,
54+
cpuConfig.MetricsConfigs[string(metrics.CPURunnableTaskCountID)].DisplayName,
55+
"The average number of runnable tasks in the run-queue during the last minute",
56+
"1",
57+
metrics.LastValue,
58+
[]string{})
59+
if err != nil {
60+
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPURunnableTaskCountID, err)
61+
}
62+
63+
cc.mUsageTime, err = metrics.NewFloat64Metric(
64+
metrics.CPUUsageTimeID,
65+
cpuConfig.MetricsConfigs[string(metrics.CPUUsageTimeID)].DisplayName,
66+
"CPU usage, in seconds",
67+
"s",
68+
metrics.Sum,
69+
[]string{stateLabel})
70+
if err != nil {
71+
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPUUsageTimeID, err)
72+
}
73+
74+
cc.lastUsageTime = make(map[string]float64)
75+
76+
return &cc
77+
}
78+
79+
func (cc *cpuCollector) recordLoad() {
80+
if cc.mRunnableTaskCount == nil {
81+
return
82+
}
83+
84+
loadAvg, err := load.Avg()
85+
if err != nil {
86+
glog.Errorf("Failed to retrieve average CPU load: %v", err)
87+
return
88+
}
89+
90+
cc.mRunnableTaskCount.Record(map[string]string{}, loadAvg.Load1)
91+
}
92+
93+
func (cc *cpuCollector) recordUsage() {
94+
if cc.mUsageTime == nil {
95+
return
96+
}
97+
98+
// Set percpu=false to get aggregated usage from all CPUs.
99+
timersStats, err := cpu.Times(false)
100+
if err != nil {
101+
glog.Errorf("Failed to retrieve CPU timers stat: %v", err)
102+
return
103+
}
104+
timersStat := timersStats[0]
105+
106+
cc.mUsageTime.Record(map[string]string{stateLabel: "user"}, clockTick*timersStat.User-cc.lastUsageTime["user"])
107+
cc.lastUsageTime["user"] = clockTick * timersStat.User
108+
109+
cc.mUsageTime.Record(map[string]string{stateLabel: "system"}, clockTick*timersStat.System-cc.lastUsageTime["system"])
110+
cc.lastUsageTime["system"] = clockTick * timersStat.System
111+
112+
cc.mUsageTime.Record(map[string]string{stateLabel: "idle"}, clockTick*timersStat.Idle-cc.lastUsageTime["idle"])
113+
cc.lastUsageTime["idle"] = clockTick * timersStat.Idle
114+
115+
cc.mUsageTime.Record(map[string]string{stateLabel: "nice"}, clockTick*timersStat.Nice-cc.lastUsageTime["nice"])
116+
cc.lastUsageTime["nice"] = clockTick * timersStat.Nice
117+
118+
cc.mUsageTime.Record(map[string]string{stateLabel: "iowait"}, clockTick*timersStat.Iowait-cc.lastUsageTime["iowait"])
119+
cc.lastUsageTime["iowait"] = clockTick * timersStat.Iowait
120+
121+
cc.mUsageTime.Record(map[string]string{stateLabel: "irq"}, clockTick*timersStat.Irq-cc.lastUsageTime["irq"])
122+
cc.lastUsageTime["irq"] = clockTick * timersStat.Irq
123+
124+
cc.mUsageTime.Record(map[string]string{stateLabel: "softirq"}, clockTick*timersStat.Softirq-cc.lastUsageTime["softirq"])
125+
cc.lastUsageTime["softirq"] = clockTick * timersStat.Softirq
126+
127+
cc.mUsageTime.Record(map[string]string{stateLabel: "steal"}, clockTick*timersStat.Steal-cc.lastUsageTime["steal"])
128+
cc.lastUsageTime["steal"] = clockTick * timersStat.Steal
129+
130+
cc.mUsageTime.Record(map[string]string{stateLabel: "guest"}, clockTick*timersStat.Guest-cc.lastUsageTime["guest"])
131+
cc.lastUsageTime["guest"] = clockTick * timersStat.Guest
132+
133+
cc.mUsageTime.Record(map[string]string{stateLabel: "guest_nice"}, clockTick*timersStat.GuestNice-cc.lastUsageTime["guest_nice"])
134+
cc.lastUsageTime["guest_nice"] = clockTick * timersStat.GuestNice
135+
}
136+
137+
func (cc *cpuCollector) collect() {
138+
if cc == nil {
139+
return
140+
}
141+
142+
cc.recordLoad()
143+
cc.recordUsage()
144+
}

0 commit comments

Comments
 (0)