@@ -18,10 +18,13 @@ package systemstatsmonitor
18
18
19
19
import (
20
20
"github.com/golang/glog"
21
+ "github.com/prometheus/procfs"
21
22
"github.com/shirou/gopsutil/cpu"
23
+ "github.com/shirou/gopsutil/host"
22
24
"github.com/shirou/gopsutil/load"
23
25
24
26
ssmtypes "k8s.io/node-problem-detector/pkg/systemstatsmonitor/types"
27
+ "k8s.io/node-problem-detector/pkg/util"
25
28
"k8s.io/node-problem-detector/pkg/util/metrics"
26
29
)
27
30
@@ -36,21 +39,37 @@ import (
36
39
// clockTick is the multiplier applied to the CPU time values reported by
// gopsutil (e.g. timersStat.GuestNice) before they are stored in
// lastUsageTime.
// NOTE(review): presumably this mirrors the kernel's clock ticks per second
// (USER_HZ) — confirm against the target platforms.
const clockTick float64 = 100.0
37
40
38
41
type cpuCollector struct {
39
- mRunnableTaskCount * metrics.Float64Metric
40
- mUsageTime * metrics.Float64Metric
41
- mCpuLoad1m * metrics.Float64Metric
42
- mCpuLoad5m * metrics.Float64Metric
43
- mCpuLoad15m * metrics.Float64Metric
42
+ tags map [string ]string
43
+
44
+ mRunnableTaskCount * metrics.Float64Metric
45
+ mUsageTime * metrics.Float64Metric
46
+ mCpuLoad1m * metrics.Float64Metric
47
+ mCpuLoad5m * metrics.Float64Metric
48
+ mCpuLoad15m * metrics.Float64Metric
49
+ mSystemProcessesTotal * metrics.Int64Metric
50
+ mSystemProcsRunning * metrics.Int64Metric
51
+ mSystemProcsBlocked * metrics.Int64Metric
52
+ mSystemInterruptsTotal * metrics.Int64Metric
44
53
45
54
config * ssmtypes.CPUStatsConfig
46
55
47
56
lastUsageTime map [string ]float64
48
57
}
49
58
50
59
func NewCPUCollectorOrDie (cpuConfig * ssmtypes.CPUStatsConfig ) * cpuCollector {
51
- cc := cpuCollector {config : cpuConfig }
60
+ cc := cpuCollector {tags : map [ string ] string {}, config : cpuConfig }
52
61
53
- var err error
62
+ kernelVersion , err := host .KernelVersion ()
63
+ if err != nil {
64
+ glog .Fatalf ("Failed to retrieve kernel version: %v" , err )
65
+ }
66
+ cc .tags ["kernel_version" ] = kernelVersion
67
+
68
+ osVersion , err := util .GetOSVersion ()
69
+ if err != nil {
70
+ glog .Fatalf ("Failed to retrieve OS version: %v" , err )
71
+ }
72
+ cc .tags ["os_version" ] = osVersion
54
73
55
74
cc .mRunnableTaskCount , err = metrics .NewFloat64Metric (
56
75
metrics .CPURunnableTaskCountID ,
@@ -107,6 +126,50 @@ func NewCPUCollectorOrDie(cpuConfig *ssmtypes.CPUStatsConfig) *cpuCollector {
107
126
glog .Fatalf ("Error initializing metric for %q: %v" , metrics .CPULoad15m , err )
108
127
}
109
128
129
+ cc .mSystemProcessesTotal , err = metrics .NewInt64Metric (
130
+ metrics .SystemProcessesTotal ,
131
+ cpuConfig .MetricsConfigs [string (metrics .SystemProcessesTotal )].DisplayName ,
132
+ "Number of forks since boot." ,
133
+ "1" ,
134
+ metrics .Sum ,
135
+ []string {osVersionLabel , kernelVersionLabel })
136
+ if err != nil {
137
+ glog .Fatalf ("Error initializing metric for %q: %v" , metrics .SystemProcessesTotal , err )
138
+ }
139
+
140
+ cc .mSystemProcsRunning , err = metrics .NewInt64Metric (
141
+ metrics .SystemProcsRunning ,
142
+ cpuConfig .MetricsConfigs [string (metrics .SystemProcsRunning )].DisplayName ,
143
+ "Number of processes currently running." ,
144
+ "1" ,
145
+ metrics .LastValue ,
146
+ []string {osVersionLabel , kernelVersionLabel })
147
+ if err != nil {
148
+ glog .Fatalf ("Error initializing metric for %q: %v" , metrics .SystemProcsRunning , err )
149
+ }
150
+
151
+ cc .mSystemProcsBlocked , err = metrics .NewInt64Metric (
152
+ metrics .SystemProcsBlocked ,
153
+ cpuConfig .MetricsConfigs [string (metrics .SystemProcsBlocked )].DisplayName ,
154
+ "Number of processes currently blocked." ,
155
+ "1" ,
156
+ metrics .LastValue ,
157
+ []string {osVersionLabel , kernelVersionLabel })
158
+ if err != nil {
159
+ glog .Fatalf ("Error initializing metric for %q: %v" , metrics .SystemProcsBlocked , err )
160
+ }
161
+
162
+ cc .mSystemInterruptsTotal , err = metrics .NewInt64Metric (
163
+ metrics .SystemInterruptsTotal ,
164
+ cpuConfig .MetricsConfigs [string (metrics .SystemInterruptsTotal )].DisplayName ,
165
+ "Total number of interrupts serviced (cumulative)." ,
166
+ "1" ,
167
+ metrics .Sum ,
168
+ []string {osVersionLabel , kernelVersionLabel })
169
+ if err != nil {
170
+ glog .Fatalf ("Error initializing metric for %q: %v" , metrics .SystemInterruptsTotal , err )
171
+ }
172
+
110
173
cc .lastUsageTime = make (map [string ]float64 )
111
174
112
175
return & cc
@@ -174,11 +237,39 @@ func (cc *cpuCollector) recordUsage() {
174
237
cc .lastUsageTime ["guest_nice" ] = clockTick * timersStat .GuestNice
175
238
}
176
239
240
+ func (cc * cpuCollector ) recordSystemStats () {
241
+ if cc .mSystemProcessesTotal == nil {
242
+ return
243
+ }
244
+ if cc .mSystemProcsRunning == nil {
245
+ return
246
+ }
247
+ if cc .mSystemProcsBlocked == nil {
248
+ return
249
+ }
250
+ if cc .mSystemInterruptsTotal == nil {
251
+ return
252
+ }
253
+
254
+ fs , err := procfs .NewFS ("/proc" )
255
+ stats , err := fs .Stat ()
256
+ if err != nil {
257
+ glog .Errorf ("Failed to retrieve cpu/process stats: %v" , err )
258
+ return
259
+ }
260
+
261
+ cc .mSystemProcessesTotal .Record (cc .tags , int64 (stats .ProcessCreated ))
262
+ cc .mSystemProcsRunning .Record (cc .tags , int64 (stats .ProcessesRunning ))
263
+ cc .mSystemProcsBlocked .Record (cc .tags , int64 (stats .ProcessesBlocked ))
264
+ cc .mSystemInterruptsTotal .Record (cc .tags , int64 (stats .IRQTotal ))
265
+ }
266
+
177
267
// collect runs one collection pass: load averages first, then CPU usage
// times, then the system-wide process/interrupt statistics. Calling it on a
// nil collector is a no-op.
func (cc *cpuCollector) collect() {
	if cc != nil {
		cc.recordLoad()
		cc.recordUsage()
		cc.recordSystemStats()
	}
}
0 commit comments