Skip to content

Commit c2d7a7b

Browse files
authored
Merge pull request #513 from karan/cpu_activity_metrics
add metrics for process stats
2 parents a8a1d30 + f13d2a5 commit c2d7a7b

File tree

7 files changed

+132
-8
lines changed

7 files changed

+132
-8
lines changed

config/system-stats-monitor.json

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,19 @@
1515
},
1616
"cpu/load_15m": {
1717
"displayName": "cpu/load_15m"
18-
}
18+
},
19+
"system/processes_total": {
20+
"displayName": "system/processes_total"
21+
},
22+
"system/procs_running": {
23+
"displayName": "system/procs_running"
24+
},
25+
"system/procs_blocked": {
26+
"displayName": "system/procs_blocked"
27+
},
28+
"system/interrupts_total": {
29+
"displayName": "system/interrupts_total"
30+
}
1931
}
2032
},
2133
"disk": {

pkg/exporters/stackdriver/stackdriver_exporter.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ var NPDMetricToSDMetric = map[metrics.MetricID]string{
6969
metrics.ProblemCounterID: "compute.googleapis.com/guest/system/problem_count",
7070
metrics.ProblemGaugeID: "compute.googleapis.com/guest/system/problem_state",
7171
metrics.OSFeatureID: "compute.googleapis.com/guest/system/os_feature_enabled",
72+
metrics.SystemProcessesTotal: "kubernetes.io/internal/node/guest/system/processes_total",
73+
metrics.SystemProcsRunning: "kubernetes.io/internal/node/guest/system/procs_running",
74+
metrics.SystemProcsBlocked: "kubernetes.io/internal/node/guest/system/procs_blocked",
75+
metrics.SystemInterruptsTotal: "kubernetes.io/internal/node/guest/system/interrupts_total",
7276
}
7377

7478
func getMetricTypeConversionFunction(customMetricPrefix string) func(*view.View) string {

pkg/systemstatsmonitor/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ Below metrics are collected from `cpu` component:
2828
* `cpu_load_1m`: CPU load average over the last 1 minute. Collected from [`/proc/loadavg`][/proc doc].
2929
* `cpu_load_5m`: CPU load average over the last 5 minutes. Collected from [`/proc/loadavg`][/proc doc].
3030
* `cpu_load_15m`: CPU load average over the last 15 minutes. Collected from [`/proc/loadavg`][/proc doc].
31+
* `system/processes_total`: Number of forks since boot.
32+
* `system/procs_running`: Number of processes currently running.
33+
* `system/procs_blocked`: Number of processes currently blocked.
34+
* `system/interrupts_total`: Total number of interrupts serviced (cumulative).
3135

3236
[/proc doc]: http://man7.org/linux/man-pages/man5/proc.5.html
3337

pkg/systemstatsmonitor/cpu_collector.go

Lines changed: 98 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,13 @@ package systemstatsmonitor
1818

1919
import (
2020
"github.com/golang/glog"
21+
"github.com/prometheus/procfs"
2122
"github.com/shirou/gopsutil/cpu"
23+
"github.com/shirou/gopsutil/host"
2224
"github.com/shirou/gopsutil/load"
2325

2426
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
27+
"k8s.io/node-problem-detector/pkg/util"
2528
"k8s.io/node-problem-detector/pkg/util/metrics"
2629
)
2730

@@ -36,21 +39,37 @@ import (
3639
const clockTick float64 = 100.0
3740

3841
type cpuCollector struct {
39-
mRunnableTaskCount *metrics.Float64Metric
40-
mUsageTime *metrics.Float64Metric
41-
mCpuLoad1m *metrics.Float64Metric
42-
mCpuLoad5m *metrics.Float64Metric
43-
mCpuLoad15m *metrics.Float64Metric
42+
tags map[string]string
43+
44+
mRunnableTaskCount *metrics.Float64Metric
45+
mUsageTime *metrics.Float64Metric
46+
mCpuLoad1m *metrics.Float64Metric
47+
mCpuLoad5m *metrics.Float64Metric
48+
mCpuLoad15m *metrics.Float64Metric
49+
mSystemProcessesTotal *metrics.Int64Metric
50+
mSystemProcsRunning *metrics.Int64Metric
51+
mSystemProcsBlocked *metrics.Int64Metric
52+
mSystemInterruptsTotal *metrics.Int64Metric
4453

4554
config *ssmtypes.CPUStatsConfig
4655

4756
lastUsageTime map[string]float64
4857
}
4958

5059
func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
51-
cc := cpuCollector{config: cpuConfig}
60+
cc := cpuCollector{tags: map[string]string{}, config: cpuConfig}
5261

53-
var err error
62+
kernelVersion, err := host.KernelVersion()
63+
if err != nil {
64+
glog.Fatalf("Failed to retrieve kernel version: %v", err)
65+
}
66+
cc.tags["kernel_version"] = kernelVersion
67+
68+
osVersion, err := util.GetOSVersion()
69+
if err != nil {
70+
glog.Fatalf("Failed to retrieve OS version: %v", err)
71+
}
72+
cc.tags["os_version"] = osVersion
5473

5574
cc.mRunnableTaskCount, err = metrics.NewFloat64Metric(
5675
metrics.CPURunnableTaskCountID,
@@ -107,6 +126,50 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
107126
glog.Fatalf("Error initializing metric for %q: %v", metrics.CPULoad15m, err)
108127
}
109128

129+
cc.mSystemProcessesTotal, err = metrics.NewInt64Metric(
130+
metrics.SystemProcessesTotal,
131+
cpuConfig.MetricsConfigs[string(metrics.SystemProcessesTotal)].DisplayName,
132+
"Number of forks since boot.",
133+
"1",
134+
metrics.Sum,
135+
[]string{osVersionLabel, kernelVersionLabel})
136+
if err != nil {
137+
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcessesTotal, err)
138+
}
139+
140+
cc.mSystemProcsRunning, err = metrics.NewInt64Metric(
141+
metrics.SystemProcsRunning,
142+
cpuConfig.MetricsConfigs[string(metrics.SystemProcsRunning)].DisplayName,
143+
"Number of processes currently running.",
144+
"1",
145+
metrics.LastValue,
146+
[]string{osVersionLabel, kernelVersionLabel})
147+
if err != nil {
148+
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcsRunning, err)
149+
}
150+
151+
cc.mSystemProcsBlocked, err = metrics.NewInt64Metric(
152+
metrics.SystemProcsBlocked,
153+
cpuConfig.MetricsConfigs[string(metrics.SystemProcsBlocked)].DisplayName,
154+
"Number of processes currently blocked.",
155+
"1",
156+
metrics.LastValue,
157+
[]string{osVersionLabel, kernelVersionLabel})
158+
if err != nil {
159+
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemProcsBlocked, err)
160+
}
161+
162+
cc.mSystemInterruptsTotal, err = metrics.NewInt64Metric(
163+
metrics.SystemInterruptsTotal,
164+
cpuConfig.MetricsConfigs[string(metrics.SystemInterruptsTotal)].DisplayName,
165+
"Total number of interrupts serviced (cumulative).",
166+
"1",
167+
metrics.Sum,
168+
[]string{osVersionLabel, kernelVersionLabel})
169+
if err != nil {
170+
glog.Fatalf("Error initializing metric for %q: %v", metrics.SystemInterruptsTotal, err)
171+
}
172+
110173
cc.lastUsageTime = make(map[string]float64)
111174

112175
return &cc
@@ -174,11 +237,39 @@ func (cc *cpuCollector) recordUsage() {
174237
cc.lastUsageTime["guest_nice"] = clockTick * timersStat.GuestNice
175238
}
176239

240+
func (cc *cpuCollector) recordSystemStats() {
241+
if cc.mSystemProcessesTotal == nil {
242+
return
243+
}
244+
if cc.mSystemProcsRunning == nil {
245+
return
246+
}
247+
if cc.mSystemProcsBlocked == nil {
248+
return
249+
}
250+
if cc.mSystemInterruptsTotal == nil {
251+
return
252+
}
253+
254+
fs, err := procfs.NewFS("/proc")
255+
stats, err := fs.Stat()
256+
if err != nil {
257+
glog.Errorf("Failed to retrieve cpu/process stats: %v", err)
258+
return
259+
}
260+
261+
cc.mSystemProcessesTotal.Record(cc.tags, int64(stats.ProcessCreated))
262+
cc.mSystemProcsRunning.Record(cc.tags, int64(stats.ProcessesRunning))
263+
cc.mSystemProcsBlocked.Record(cc.tags, int64(stats.ProcessesBlocked))
264+
cc.mSystemInterruptsTotal.Record(cc.tags, int64(stats.IRQTotal))
265+
}
266+
177267
// collect runs one collection cycle: it records CPU load, CPU usage, and
// system (process/interrupt) stats, in that order. It is a no-op when called
// on a nil *cpuCollector.
func (cc *cpuCollector) collect() {
178268
if cc == nil {
179269
return
180270
}
181271

182272
cc.recordLoad()
183273
cc.recordUsage()
274+
cc.recordSystemStats()
184275
}

pkg/systemstatsmonitor/labels.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,9 @@ const featureLabel = "os_feature"
3636

3737
// valueLabel labels the value for the features of the guest os system if required
3838
const valueLabel = "value"
39+
40+
// osVersionLabel labels the OS version
41+
const osVersionLabel = "os_version"
42+
43+
// kernelVersionLabel labels the kernel version
44+
const kernelVersionLabel = "kernel_version"

pkg/systemstatsmonitor/osfeature_collector.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,9 @@ func (ofc *osFeatureCollector) recordFeaturesFromModules(modules []system.Module
142142
}
143143

144144
func (ofc *osFeatureCollector) collect() {
145+
if ofc == nil || ofc.osFeature == nil {
146+
return
147+
}
145148
cmdlineArgs, err := system.CmdlineArgs()
146149
if err != nil {
147150
glog.Fatalf("Error retrieving cmdline args: %v", err)

pkg/util/metrics/metric.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ const (
4242
MemoryUnevictableUsedID MetricID = "memory/unevictable_used"
4343
MemoryDirtyUsedID MetricID = "memory/dirty_used"
4444
OSFeatureID MetricID = "system/os_feature"
45+
SystemProcessesTotal MetricID = "system/processes_total"
46+
SystemProcsRunning MetricID = "system/procs_running"
47+
SystemProcsBlocked MetricID = "system/procs_blocked"
48+
SystemInterruptsTotal MetricID = "system/interrupts_total"
4549
)
4650

4751
var MetricMap MetricMapping

0 commit comments

Comments
 (0)