Skip to content

Commit 1f63638

Browse files
committed
Detect kubelet and container runtime frequent crashes
1 parent f382c37 commit 1f63638

File tree

18 files changed

+422
-96
lines changed

18 files changed

+422
-96
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ ifeq ($(ENABLE_JOURNALD), 1)
6969
endif
7070

7171
vet:
72-
go list ./... | grep -v "./vendor/*" | xargs go vet
72+
go list ./... | grep -v "./vendor/*" | xargs go vet $(BUILD_TAGS)
7373

7474
fmt:
7575
find . -type f -name "*.go" | grep -v "./vendor/*" | xargs gofmt -s -w -l

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ For example, to test [KernelMonitor](https://github.com/kubernetes/node-problem-
156156

157157
**Note**:
158158
- You can see more rule examples under [test/kernel_log_generator/problems](https://github.com/kubernetes/node-problem-detector/tree/master/test/kernel_log_generator/problems).
159-
- For [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) message injection, all messages should have ```kernel: ``` prefix (also note there is a space after ```:```).
159+
- For [KernelMonitor](https://github.com/kubernetes/node-problem-detector/blob/master/config/kernel-monitor.json) message injection, all messages should have the ```kernel: ``` prefix (note that there is a space after the ```:```); alternatively, use [generator.sh](https://github.com/kubernetes/node-problem-detector/blob/master/test/kernel_log_generator/generator.sh).
160+
- To inject other logs into journald, such as systemd logs, use ```echo 'Some systemd message' | systemd-cat -t systemd```.
160161

161162
# Remedy Systems
162163

cmd/logcounter/log_counter.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ func main() {
3232
fedo.AddFlags(pflag.CommandLine)
3333
pflag.Parse()
3434

35-
counter, err := logcounter.NewKmsgLogCounter(fedo)
35+
counter, err := logcounter.NewJournaldLogCounter(fedo)
3636
if err != nil {
3737
fmt.Print(err)
3838
os.Exit(int(types.Unknown))

cmd/logcounter/options/options.go

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,21 @@ func NewLogCounterOptions() *LogCounterOptions {
2929
// LogCounterOptions contains frequent event detector command line and application options.
3030
type LogCounterOptions struct {
3131
// Command line options. See the flag descriptions in AddFlags for details.
32-
Lookback string
33-
Pattern string
34-
Count int
32+
JournaldSource string
33+
LogPath string
34+
Lookback string
35+
Delay string
36+
Pattern string
37+
Count int
3538
}
3639

3740
// AddFlags adds log counter command line options to pflag.
3841
func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
42+
fs.StringVar(&fedo.JournaldSource, "journald-source", "", "The source configuration of journald, e.g., kernel, kubelet, dockerd, etc")
43+
fs.StringVar(&fedo.LogPath, "log-path", "", "The log path that log watcher looks up")
3944
fs.StringVar(&fedo.Lookback, "lookback", "", "The time log watcher looks up")
45+
fs.StringVar(&fedo.Delay, "delay", "",
46+
"The time duration log watcher delays after node boot time. This is useful when log watcher needs to wait for some time until the node is stable.")
4047
fs.StringVar(&fedo.Pattern, "pattern", "",
4148
"The regular expression to match the problem in log. The pattern must match to the end of the line.")
4249
fs.IntVar(&fedo.Count, "count", 1,

config/docker-monitor.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"plugin": "journald",
33
"pluginConfig": {
4-
"source": "docker"
4+
"source": "dockerd"
55
},
66
"logPath": "/var/log/journal",
77
"lookback": "5m",

config/kernel-monitor-counter.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
"reason": "UnregisterNetDevice",
2222
"path": "/home/kubernetes/bin/log-counter",
2323
"args": [
24+
"--journald-source=kernel",
25+
"--log-path=/var/log/journal",
2426
"--lookback=20m",
2527
"--count=3",
2628
"--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"

config/systemd-monitor-counter.json

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "5m",
5+
"timeout": "1m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "systemd-monitor",
10+
"conditions": [
11+
{
12+
"type": "FrequentKubeletRestart",
13+
"reason": "NoFrequentKubeletRestart",
14+
"message": "kubelet is functioning properly"
15+
},
16+
{
17+
"type": "FrequentDockerRestart",
18+
"reason": "NoFrequentDockerRestart",
19+
"message": "docker is functioning properly"
20+
},
21+
{
22+
"type": "FrequentContainerdRestart",
23+
"reason": "NoFrequentContainerdRestart",
24+
"message": "containerd is functioning properly"
25+
}
26+
],
27+
"rules": [
28+
{
29+
"type": "permanent",
30+
"condition": "FrequentKubeletRestart",
31+
"reason": "FrequentKubeletRestart",
32+
"path": "/home/kubernetes/bin/log-counter",
33+
"args": [
34+
"--journald-source=systemd",
35+
"--log-path=/var/log/journal",
36+
"--lookback=20m",
37+
"--delay=5m",
38+
"--count=5",
39+
"--pattern=Started Kubernetes kubelet."
40+
],
41+
"timeout": "1m"
42+
},
43+
{
44+
"type": "permanent",
45+
"condition": "FrequentDockerRestart",
46+
"reason": "FrequentDockerRestart",
47+
"path": "/home/kubernetes/bin/log-counter",
48+
"args": [
49+
"--journald-source=systemd",
50+
"--log-path=/var/log/journal",
51+
"--lookback=20m",
52+
"--count=5",
53+
"--pattern=Starting Docker Application Container Engine..."
54+
],
55+
"timeout": "1m"
56+
},
57+
{
58+
"type": "permanent",
59+
"condition": "FrequentContainerdRestart",
60+
"reason": "FrequentContainerdRestart",
61+
"path": "/home/kubernetes/bin/log-counter",
62+
"args": [
63+
"--journald-source=systemd",
64+
"--log-path=/var/log/journal",
65+
"--lookback=20m",
66+
"--count=5",
67+
"--pattern=Starting containerd container runtime..."
68+
],
69+
"timeout": "1m"
70+
}
71+
]
72+
}

deployment/node-problem-detector-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ data:
6969
{
7070
"plugin": "journald",
7171
"pluginConfig": {
72-
"source": "docker"
72+
"source": "dockerd"
7373
},
7474
"logPath": "/var/log/journal",
7575
"lookback": "5m",

pkg/custompluginmonitor/types/config.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,26 +37,26 @@ var (
3737

3838
type pluginGlobalConfig struct {
3939
// InvokeIntervalString is the interval string at which plugins will be invoked.
40-
InvokeIntervalString *string `json:"invoke_interval, omitempty"`
40+
InvokeIntervalString *string `json:"invoke_interval,omitempty"`
4141
// TimeoutString is the global plugin execution timeout string.
42-
TimeoutString *string `json:"timeout, omitempty"`
42+
TimeoutString *string `json:"timeout,omitempty"`
4343
// InvokeInterval is the interval at which plugins will be invoked.
4444
InvokeInterval *time.Duration `json:"-"`
4545
// Timeout is the global plugin execution timeout.
4646
Timeout *time.Duration `json:"-"`
4747
// MaxOutputLength is the maximum plugin output message length.
48-
MaxOutputLength *int `json:"max_output_length, omitempty"`
48+
MaxOutputLength *int `json:"max_output_length,omitempty"`
4949
// Concurrency is the number of concurrent running plugins.
50-
Concurrency *int `json:"concurrency, omitempty"`
50+
Concurrency *int `json:"concurrency,omitempty"`
5151
}
5252

5353
// CustomPluginConfig is the configuration of the custom plugin monitor.
5454
type CustomPluginConfig struct {
5555
// Plugin is the name of plugin which is currently used.
5656
// Currently supported: custom.
57-
Plugin string `json:"plugin, omitempty"`
57+
Plugin string `json:"plugin,omitempty"`
5858
// PluginConfig is global plugin configuration.
59-
PluginGlobalConfig pluginGlobalConfig `json:"pluginConfig, omitempty"`
59+
PluginGlobalConfig pluginGlobalConfig `json:"pluginConfig,omitempty"`
6060
// Source is the source name of the custom plugin monitor
6161
Source string `json:"source"`
6262
// DefaultConditions are the default states of all the conditions custom plugin monitor should handle.

pkg/logcounter/log_counter.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,15 @@ import (
2525
"k8s.io/node-problem-detector/cmd/logcounter/options"
2626
"k8s.io/node-problem-detector/pkg/logcounter/types"
2727
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
28-
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/kmsg"
28+
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/journald"
2929
watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
3030
systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
3131
)
3232

3333
const (
34-
bufferSize = 1000
35-
timeout = 1 * time.Second
34+
bufferSize = 1000
35+
timeout = 1 * time.Second
36+
journaldSourceKey = "source"
3637
)
3738

3839
type logCounter struct {
@@ -42,11 +43,17 @@ type logCounter struct {
4243
clock clock.Clock
4344
}
4445

45-
func NewKmsgLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {
46-
watcher := kmsg.NewKmsgWatcher(watchertypes.WatcherConfig{Lookback: options.Lookback})
46+
func NewJournaldLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {
47+
watcher := journald.NewJournaldWatcher(watchertypes.WatcherConfig{
48+
Plugin: "journald",
49+
PluginConfig: map[string]string{journaldSourceKey: options.JournaldSource},
50+
LogPath: options.LogPath,
51+
Lookback: options.Lookback,
52+
Delay: options.Delay,
53+
})
4754
logCh, err := watcher.Watch()
4855
if err != nil {
49-
return nil, fmt.Errorf("error watching kmsg: %v", err)
56+
return nil, fmt.Errorf("error watching journald: %v", err)
5057
}
5158
return &logCounter{
5259
logCh: logCh,

0 commit comments

Comments
 (0)