Skip to content

Commit 5b07afd

Browse files
committed
1. Make source and conditions configurable.
2. Add multiple events and conditions support in problem interface.
1 parent 63b4ba7 commit 5b07afd

File tree

7 files changed

+114
-67
lines changed

7 files changed

+114
-67
lines changed

config/kernel-monitor.json

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
{
22
"logPath": "/log/kern.log",
33
"bufferSize": 10,
4+
"source": "kernel-monitor",
5+
"conditions": [
6+
{
7+
"type": "KernelDeadlock",
8+
"reason": "KernelHasNoDeadlock",
9+
"message": "kernel has no deadlock"
10+
}
11+
],
412
"rules": [
513
{
614
"type": "temporary",
@@ -14,17 +22,20 @@
1422
},
1523
{
1624
"type": "permanent",
25+
"condition": "KernelDeadlock",
1726
"reason": "AUFSUmountHung",
1827
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
1928
},
2029
{
2130
"type": "permanent",
31+
"condition": "KernelDeadlock",
2232
"reason": "DockerHung",
2333
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
2434
},
2535
{
2636
"type": "permanent",
27-
"reason": "KernelBug",
37+
"condition": "KernelDeadlock",
38+
"reason": "UnregisterNetDeviceIssue",
2839
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
2940
}
3041
]

pkg/kernelmonitor/kernel_monitor.go

Lines changed: 46 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -30,20 +30,16 @@ import (
3030
"github.com/golang/glog"
3131
)
3232

33-
// May want to add more conditions if we need finer grained node conditions.
34-
// TODO(random-liu): Make the kernel condition to be a predefined list, and make it configurable
35-
// in rule.
36-
const (
37-
KernelDeadlockCondition = "KernelDeadlock"
38-
KernelMonitorSource = "kernel-monitor"
39-
)
40-
4133
// MonitorConfig is the configuration of kernel monitor.
4234
type MonitorConfig struct {
4335
// WatcherConfig is the configuration of kernel log watcher.
4436
WatcherConfig
4537
// BufferSize is the size (in lines) of the log buffer.
4638
BufferSize int `json:"bufferSize"`
39+
// Source is the source name of the kernel monitor
40+
Source string `json:"source"`
41+
// DefaultConditions are the default states of all the conditions kernel monitor should handle.
42+
DefaultConditions []types.Condition `json:"conditions"`
4743
// Rules are the rules kernel monitor will follow to parse the log file.
4844
Rules []kerntypes.Rule `json:"rules"`
4945
}
@@ -58,21 +54,20 @@ type KernelMonitor interface {
5854
}
5955

6056
type kernelMonitor struct {
61-
watcher KernelLogWatcher
62-
buffer LogBuffer
63-
config MonitorConfig
64-
condition types.Condition
65-
uptime time.Time
66-
logCh <-chan *kerntypes.KernelLog
67-
output chan *types.Status
68-
tomb *util.Tomb
57+
watcher KernelLogWatcher
58+
buffer LogBuffer
59+
config MonitorConfig
60+
conditions []types.Condition
61+
uptime time.Time
62+
logCh <-chan *kerntypes.KernelLog
63+
output chan *types.Status
64+
tomb *util.Tomb
6965
}
7066

7167
// NewKernelMonitorOrDie creates a new KernelMonitor, and panics if an error occurs.
7268
func NewKernelMonitorOrDie(configPath string) KernelMonitor {
7369
k := &kernelMonitor{
74-
condition: defaultCondition(),
75-
tomb: util.NewTomb(),
70+
tomb: util.NewTomb(),
7671
}
7772
f, err := ioutil.ReadFile(configPath)
7873
if err != nil {
@@ -82,6 +77,8 @@ func NewKernelMonitorOrDie(configPath string) KernelMonitor {
8277
if err != nil {
8378
panic(err)
8479
}
80+
// Initialize the default node conditions
81+
k.conditions = initialConditions(k.config.DefaultConditions)
8582
err = validateRules(k.config.Rules)
8683
if err != nil {
8784
panic(err)
@@ -120,7 +117,7 @@ func (k *kernelMonitor) Stop() {
120117
// monitorLoop is the main loop of kernel monitor.
121118
func (k *kernelMonitor) monitorLoop() {
122119
defer k.tomb.Done()
123-
k.output <- defaultStatus() // Update the default status
120+
k.output <- k.initialStatus() // Update the initial status
124121
for {
125122
select {
126123
case log := <-k.logCh:
@@ -153,27 +150,34 @@ func (k *kernelMonitor) generateStatus(logs []*kerntypes.KernelLog, rule kerntyp
153150
messages = append(messages, log.Message)
154151
}
155152
message := concatLogs(messages)
156-
var event *types.Event
153+
var events []types.Event
157154
if rule.Type == kerntypes.Temp {
158155
// For a temporary error, only generate an event
159-
event = &types.Event{
156+
events = append(events, types.Event{
160157
Severity: types.Warn,
161158
Timestamp: timestamp,
162159
Reason: rule.Reason,
163160
Message: message,
164-
}
161+
})
165162
} else {
166163
// For a permanent error, change the matching condition
167-
k.condition.Type = KernelDeadlockCondition
168-
k.condition.Status = true
169-
k.condition.Transition = timestamp
170-
k.condition.Reason = rule.Reason
171-
k.condition.Message = message
164+
for i := range k.conditions {
165+
condition := &k.conditions[i]
166+
if condition.Type == rule.Condition {
167+
condition.Type = rule.Condition
168+
condition.Status = true
169+
condition.Transition = timestamp
170+
condition.Reason = rule.Reason
171+
condition.Message = message
172+
break
173+
}
174+
}
172175
}
173176
return &types.Status{
174-
Source: KernelMonitorSource,
175-
Event: event,
176-
Condition: k.condition,
177+
Source: k.config.Source,
178+
// TODO(random-liu): Aggregate events and conditions, and then report them periodically.
179+
Events: events,
180+
Conditions: k.conditions,
177181
}
178182
}
179183

@@ -182,22 +186,23 @@ func (k *kernelMonitor) generateTimestamp(timestamp int64) time.Time {
182186
return k.uptime.Add(time.Duration(timestamp * int64(time.Microsecond)))
183187
}
184188

185-
// defaultStatus returns the default status with default condition.
186-
func defaultStatus() *types.Status {
189+
// initialStatus returns the initial status with initial condition.
190+
func (k *kernelMonitor) initialStatus() *types.Status {
187191
return &types.Status{
188-
Source: KernelMonitorSource,
189-
Condition: defaultCondition(),
192+
Source: k.config.Source,
193+
Conditions: k.conditions,
190194
}
191195
}
192196

193-
func defaultCondition() types.Condition {
194-
return types.Condition{
195-
Type: KernelDeadlockCondition,
196-
Status: false,
197-
Transition: time.Now(),
198-
Reason: "KernelHasNoDeadlock",
199-
Message: "kernel has no deadlock",
197+
func initialConditions(defaults []types.Condition) []types.Condition {
198+
conditions := make([]types.Condition, len(defaults))
199+
copy(conditions, defaults)
200+
for i := range conditions {
201+
// TODO(random-liu): Validate default conditions
202+
conditions[i].Status = false
203+
conditions[i].Transition = time.Now()
200204
}
205+
return conditions
201206
}
202207

203208
// validateRules verifies whether the regular expressions in the rules are valid.

pkg/kernelmonitor/kernel_monitor_test.go

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,26 @@ import (
2525
"k8s.io/node-problem-detector/pkg/types"
2626
)
2727

28+
const (
29+
testSource = "TestSource"
30+
testConditionA = "TestConditionA"
31+
testConditionB = "TestConditionB"
32+
)
33+
2834
func TestGenerateStatus(t *testing.T) {
2935
uptime := time.Unix(1000, 0)
30-
initCondition := defaultCondition()
36+
initConditions := []types.Condition{
37+
{
38+
Type: testConditionA,
39+
Status: true,
40+
Transition: time.Now(),
41+
},
42+
{
43+
Type: testConditionB,
44+
Status: false,
45+
Transition: time.Now(),
46+
},
47+
}
3148
logs := []*kerntypes.KernelLog{
3249
{
3350
Timestamp: 100000,
@@ -45,17 +62,21 @@ func TestGenerateStatus(t *testing.T) {
4562
// Do not need Pattern because we don't do pattern match in this test
4663
{
4764
rule: kerntypes.Rule{
48-
Type: kerntypes.Perm,
49-
Reason: "test reason",
65+
Type: kerntypes.Perm,
66+
Condition: testConditionA,
67+
Reason: "test reason",
5068
},
5169
expected: types.Status{
52-
Source: KernelMonitorSource,
53-
Condition: types.Condition{
54-
Type: KernelDeadlockCondition,
55-
Status: true,
56-
Transition: time.Unix(1000, 100000*1000),
57-
Reason: "test reason",
58-
Message: "test message 1\ntest message 2",
70+
Source: testSource,
71+
Conditions: []types.Condition{
72+
{
73+
Type: testConditionA,
74+
Status: true,
75+
Transition: time.Unix(1000, 100000*1000),
76+
Reason: "test reason",
77+
Message: "test message 1\ntest message 2",
78+
},
79+
initConditions[1],
5980
},
6081
},
6182
},
@@ -65,20 +86,23 @@ func TestGenerateStatus(t *testing.T) {
6586
Reason: "test reason",
6687
},
6788
expected: types.Status{
68-
Source: KernelMonitorSource,
69-
Event: &types.Event{
89+
Source: testSource,
90+
Events: []types.Event{{
7091
Severity: types.Warn,
7192
Timestamp: time.Unix(1000, 100000*1000),
7293
Reason: "test reason",
7394
Message: "test message 1\ntest message 2",
74-
},
75-
Condition: initCondition,
95+
}},
96+
Conditions: initConditions,
7697
},
7798
},
7899
} {
79100
k := &kernelMonitor{
80-
condition: initCondition,
81-
uptime: uptime,
101+
config: MonitorConfig{
102+
Source: testSource,
103+
},
104+
conditions: initConditions,
105+
uptime: uptime,
82106
}
83107
got := k.generateStatus(logs, test.rule)
84108
if !reflect.DeepEqual(&test.expected, got) {

pkg/kernelmonitor/types/types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ const (
3737
type Rule struct {
3838
// Type is the type of matched kernel problem.
3939
Type Type `json:"type"`
40+
// Condition is the type of the condition the kernel problem triggered. Notice that
41+
// the Condition field should be set only when the problem is permanent, or else the
42+
// field will be ignored.
43+
Condition string `json:"condition"`
4044
// Reason is the short reason of the kernel problem.
4145
Reason string `json:"reason"`
4246
// Pattern is the regular expression to match the kernel problem in kernel log.

pkg/problemclient/problem_client.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ func getEventRecorder(c *client.Client, nodeName, source string) record.EventRec
123123
}
124124

125125
func getNodeRef(nodeName string) *api.ObjectReference {
126+
// TODO(random-liu): Get node to initialize the node reference
126127
return &api.ObjectReference{
127128
Kind: "Node",
128129
Name: nodeName,

pkg/problemdetector/problem_detector.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,12 @@ func (p *problemDetector) Run() error {
6565
glog.Errorf("Monitor stopped unexpectedly")
6666
break
6767
}
68-
if status.Event != nil {
69-
p.client.Eventf(util.ConvertToAPIEventType(status.Event.Severity), status.Source, status.Event.Reason, status.Event.Message)
68+
for _, event := range status.Events {
69+
p.client.Eventf(util.ConvertToAPIEventType(event.Severity), status.Source, event.Reason, event.Message)
70+
}
71+
for _, condition := range status.Conditions {
72+
p.conditionManager.UpdateCondition(condition)
7073
}
71-
p.conditionManager.UpdateCondition(status.Condition)
7274
}
7375
}
7476
}

pkg/types/types.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,10 +69,10 @@ type Event struct {
6969
type Status struct {
7070
// Source is the name of the problem daemon.
7171
Source string `json:"source"`
72-
// Event is the temporary node problem event. If the status is only a condition update,
73-
// this field could be nil.
74-
Event *Event `json:"event"`
75-
// Condition is the permanent node condition. The problem daemon should always report the
76-
// newest node condition in this field.
77-
Condition Condition `json:"condition"`
72+
// Events are temporary node problem events. If the status is only a condition update,
73+
// this field could be nil. Notice that the events should be sorted from oldest to newest.
74+
Events []Event `json:"events"`
75+
// Conditions are the permanent node conditions. The problem daemon should always report the
76+
// newest node conditions in this field.
77+
Conditions []Condition `json:"conditions"`
7878
}

0 commit comments

Comments
 (0)