Skip to content

Commit b8ce636

Browse files
authored
Merge pull request #300 from xueweiz/metrics
Report metrics from system-log-monitor
2 parents dbe7caf + fbebcf3 commit b8ce636

18 files changed

+1607
-86
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ List of supported problem daemons:
6161

6262
| Problem Daemon | NodeCondition | Description | Disabling Build Tag |
6363
|----------------|:---------------:|:------------|:--------------------|
64-
| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problem according to predefined rules. | disable_system_log_monitor
64+
| [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) | KernelDeadlock | A system log monitor monitors kernel log and reports problems and metrics according to predefined rules. | disable_system_log_monitor
6565
| [AbrtAdaptor](https://github.com/kubernetes/node-problem-detector/blob/master/config/abrt-adaptor.json) | None | Monitors ABRT log messages and reports them further. ABRT (Automatic Bug Report Tool) is a health monitoring daemon able to catch kernel problems as well as application crashes of various kinds that occur on the host. For more information visit the [link](https://github.com/abrt). | disable_system_log_monitor
6666
| [CustomPluginMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/custom-plugin-monitor.json) | On-demand (according to the user's configuration) | A custom plugin monitor for node-problem-detector to invoke and check various node problems with user-defined check scripts. See proposal [here](https://docs.google.com/document/d/1jK_5YloSYtboj-DtfjmYKxfNnUxCAvohLnsH5aGCAYQ/edit#). | disable_custom_plugin_monitor
6767
| [SystemStatsMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/system-stats-monitor.json) | None(Could be added in the future) | A system stats monitor for node-problem-detector to collect various health-related system stats as metrics. See proposal [here](https://docs.google.com/document/d/1SeaUz6kBavI283Dq8GBpoEUDrHA2a795xtw0OvjM568/edit). | disable_system_stats_monitor

config/docker-monitor.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"lookback": "5m",
88
"bufferSize": 10,
99
"source": "docker-monitor",
10+
"metricsReporting": true,
1011
"conditions": [],
1112
"rules": [
1213
{

config/kernel-monitor.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
"lookback": "5m",
55
"bufferSize": 10,
66
"source": "kernel-monitor",
7+
"metricsReporting": true,
78
"conditions": [
89
{
910
"type": "KernelDeadlock",

config/systemd-monitor.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"lookback": "",
88
"bufferSize": 10,
99
"source": "systemd-monitor",
10+
"metricsReporting": true,
1011
"conditions": [],
1112
"rules": [
1213
{

pkg/problemmetrics/problem_metrics.go

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package problemmetrics
18+
19+
import (
20+
"errors"
21+
"fmt"
22+
"sync"
23+
24+
"github.com/golang/glog"
25+
26+
"k8s.io/node-problem-detector/pkg/util/metrics"
27+
)
28+
29+
// GlobalProblemMetricsManager is the package-level singleton instance of
30+
// ProblemMetricsManager. It is intended to be the single place through which
31+
// all problem daemons report problem-converted metrics.
32+
var GlobalProblemMetricsManager *ProblemMetricsManager
33+
34+
// init assigns GlobalProblemMetricsManager when the package is initialized.
// NewProblemMetricsManagerOrDie calls glog.Fatalf if either metric cannot be
// created.
func init() {
35+
GlobalProblemMetricsManager = NewProblemMetricsManagerOrDie()
36+
}
37+
38+
// ProblemMetricsManager manages problem-converted metrics: a counter of
// problem occurrences and a gauge describing whether a problem is active.
39+
// ProblemMetricsManager is thread-safe.
// NOTE(review): the mutex below only guards problemTypeToReason; safety of
// concurrent Record calls presumably relies on the metric implementation.
40+
type ProblemMetricsManager struct {
41+
// problemCounter is recorded by IncrementProblemCounter with a "reason" label.
problemCounter metrics.Int64MetricInterface
42+
// problemGauge is recorded by SetProblemGauge with "type" and "reason" labels.
problemGauge metrics.Int64MetricInterface
43+
// problemTypeToReason remembers, per problem type, the reason most recently
// passed to SetProblemGauge, so the previous gauge series can be reset to 0.
problemTypeToReason map[string]string
44+
// problemTypeToReasonMutex is held for the whole of SetProblemGauge.
problemTypeToReasonMutex sync.Mutex
45+
}
46+
47+
func NewProblemMetricsManagerOrDie() *ProblemMetricsManager {
48+
pmm := ProblemMetricsManager{}
49+
50+
var err error
51+
pmm.problemCounter, err = metrics.NewInt64Metric(
52+
"problem_counter",
53+
"Number of times a specific type of problem have occurred.",
54+
"1",
55+
metrics.Sum,
56+
[]string{"reason"})
57+
if err != nil {
58+
glog.Fatalf("Failed to create problem_counter metric: %v", err)
59+
}
60+
61+
pmm.problemGauge, err = metrics.NewInt64Metric(
62+
"problem_gauge",
63+
"Whether a specific type of problem is affecting the node or not.",
64+
"1",
65+
metrics.LastValue,
66+
[]string{"type", "reason"})
67+
if err != nil {
68+
glog.Fatalf("Failed to create problem_gauge metric: %v", err)
69+
}
70+
71+
pmm.problemTypeToReason = make(map[string]string)
72+
73+
return &pmm
74+
}
75+
76+
// IncrementProblemCounter increments the value of a problem counter.
77+
func (pmm *ProblemMetricsManager) IncrementProblemCounter(reason string, count int64) error {
78+
if pmm.problemCounter == nil {
79+
return errors.New("problem counter is being incremented before initialized.")
80+
}
81+
82+
return pmm.problemCounter.Record(map[string]string{"reason": reason}, count)
83+
}
84+
85+
// SetProblemGauge sets the value of a problem gauge.
86+
func (pmm *ProblemMetricsManager) SetProblemGauge(problemType string, reason string, value bool) error {
87+
if pmm.problemGauge == nil {
88+
return errors.New("problem gauge is being set before initialized.")
89+
}
90+
91+
pmm.problemTypeToReasonMutex.Lock()
92+
defer pmm.problemTypeToReasonMutex.Unlock()
93+
94+
// We clear the last reason, because the expected behavior is that at any point of time,
95+
// for each type of permanent problem, there should be at most one reason got set to 1.
96+
// This behavior is consistent with the behavior of node condition in Kubernetes.
97+
// However, problemGauges with different "type" and "reason" are considered as different
98+
// metrics in Prometheus. So we need to clear the previous metrics explicitly.
99+
if lastReason, ok := pmm.problemTypeToReason[problemType]; ok {
100+
err := pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": lastReason}, 0)
101+
if err != nil {
102+
return fmt.Errorf("failed to clear previous reason %q for type %q: %v",
103+
problemType, lastReason, err)
104+
}
105+
}
106+
107+
pmm.problemTypeToReason[problemType] = reason
108+
109+
var valueInt int64
110+
if value {
111+
valueInt = 1
112+
}
113+
return pmm.problemGauge.Record(map[string]string{"type": problemType, "reason": reason}, valueInt)
114+
}
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
Copyright 2019 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package problemmetrics
18+
19+
import (
20+
"k8s.io/node-problem-detector/pkg/util/metrics"
21+
)
22+
23+
// NewProblemMetricsManagerStub creates a ProblemMetricsManager stubbed by fake metrics.
24+
// The stubbed ProblemMetricsManager and fake metrics are returned.
25+
func NewProblemMetricsManagerStub() (*ProblemMetricsManager, *metrics.FakeInt64Metric, *metrics.FakeInt64Metric) {
26+
fakeProblemCounter := metrics.NewFakeInt64Metric("problem_counter", metrics.Sum, []string{"reason"})
27+
fakeProblemGauge := metrics.NewFakeInt64Metric("problem_gauge", metrics.LastValue, []string{"type", "reason"})
28+
29+
pmm := ProblemMetricsManager{}
30+
pmm.problemCounter = metrics.Int64MetricInterface(fakeProblemCounter)
31+
pmm.problemGauge = metrics.Int64MetricInterface(fakeProblemGauge)
32+
pmm.problemTypeToReason = make(map[string]string)
33+
34+
return &pmm, fakeProblemCounter, fakeProblemGauge
35+
}

0 commit comments

Comments
 (0)