Skip to content

Commit bf730e9

Browse files
committed
add log-counter go plugin
1 parent 2f915ec commit bf730e9

File tree

9 files changed

+364
-3
lines changed

9 files changed

+364
-3
lines changed

Makefile

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ fmt:
7777
version:
7878
@echo $(VERSION)
7979

80+
./bin/log-counter: $(PKG_SOURCES)
81+
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux go build -o bin/log-counter \
82+
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
83+
$(BUILD_TAGS) cmd/logcounter/log_counter.go
84+
8085
./bin/node-problem-detector: $(PKG_SOURCES)
8186
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux go build -o bin/node-problem-detector \
8287
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
@@ -88,10 +93,10 @@ Dockerfile: Dockerfile.in
8893
test: vet fmt
8994
go test -timeout=1m -v -race ./cmd/options ./pkg/... $(BUILD_TAGS)
9095

91-
build-container: ./bin/node-problem-detector Dockerfile
96+
build-container: ./bin/node-problem-detector ./bin/log-counter Dockerfile
9297
docker build -t $(IMAGE) .
9398

94-
build-tar: ./bin/node-problem-detector
99+
build-tar: ./bin/node-problem-detector ./bin/log-counter
95100
tar -zcvf $(TARBALL) bin/ config/
96101
sha1sum $(TARBALL)
97102
md5sum $(TARBALL)
@@ -107,5 +112,6 @@ push-tar: build-tar
107112
push: push-container push-tar
108113

109114
clean:
115+
rm -f bin/log-counter
110116
rm -f bin/node-problem-detector
111117
rm -f node-problem-detector-*.tar.gz

cmd/logcounter/log_counter.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package main
18+
19+
import (
20+
"fmt"
21+
"os"
22+
23+
"github.com/spf13/pflag"
24+
25+
"k8s.io/node-problem-detector/cmd/logcounter/options"
26+
"k8s.io/node-problem-detector/pkg/custompluginmonitor/types"
27+
"k8s.io/node-problem-detector/pkg/logcounter"
28+
)
29+
30+
func main() {
31+
fedo := options.NewLogCounterOptions()
32+
fedo.AddFlags(pflag.CommandLine)
33+
pflag.Parse()
34+
35+
counter, err := logcounter.NewKmsgLogCounter(fedo)
36+
if err != nil {
37+
fmt.Print(err)
38+
os.Exit(int(types.Unknown))
39+
}
40+
actual := counter.Count()
41+
if actual >= fedo.Count {
42+
fmt.Printf("Found %d matching logs, which meets the threshold of %d\n", actual, fedo.Count)
43+
os.Exit(int(types.NonOK))
44+
}
45+
os.Exit(int(types.OK))
46+
}

cmd/logcounter/options/options.go

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package options
18+
19+
import (
20+
"flag"
21+
22+
"github.com/spf13/pflag"
23+
)
24+
25+
func NewLogCounterOptions() *LogCounterOptions {
26+
return &LogCounterOptions{}
27+
}
28+
29+
// LogCounterOptions contains frequent event detector command line and application options.
30+
type LogCounterOptions struct {
31+
// command line options. See flag descriptions for the description
32+
Lookback string
33+
Pattern string
34+
Count int
35+
}
36+
37+
// AddFlags adds log counter command line options to pflag.
38+
func (fedo *LogCounterOptions) AddFlags(fs *pflag.FlagSet) {
39+
fs.StringVar(&fedo.Lookback, "lookback", "", "The time log watcher looks up")
40+
fs.StringVar(&fedo.Pattern, "pattern", "",
41+
"The regular expression to match the problem in log. The pattern must match to the end of the line.")
42+
fs.IntVar(&fedo.Count, "count", 1,
43+
"The number of times the pattern must be found to trigger the condition")
44+
}
45+
46+
func init() {
47+
pflag.CommandLine.AddGoFlagSet(flag.CommandLine)
48+
}

config/kernel-monitor-counter.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"plugin": "custom",
3+
"pluginConfig": {
4+
"invoke_interval": "5m",
5+
"timeout": "1m",
6+
"max_output_length": 80,
7+
"concurrency": 1
8+
},
9+
"source": "kernel-monitor",
10+
"conditions": [
11+
{
12+
"type": "FrequentUnregisterNetDevice",
13+
"reason": "NoFrequentUnregisterNetDevice",
14+
"message": "node is functioning properly"
15+
}
16+
],
17+
"rules": [
18+
{
19+
"type": "permanent",
20+
"condition": "FrequentUnregisterNetDevice",
21+
"reason": "UnregisterNetDevice",
22+
"path": "/home/kubernetes/bin/log-counter",
23+
"args": [
24+
"--lookback=20m",
25+
"--count=3",
26+
"--pattern=unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
27+
],
28+
"timeout": "1m"
29+
}
30+
]
31+
}

pkg/custompluginmonitor/plugin/plugin.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ func (p *Plugin) run(rule cpmtypes.CustomRule) (exitStatus cpmtypes.Status, outp
108108
}
109109
defer cancel()
110110

111-
cmd := exec.CommandContext(ctx, rule.Path)
111+
cmd := exec.CommandContext(ctx, rule.Path, rule.Args...)
112112
stdout, err := cmd.Output()
113113
if err != nil {
114114
if _, ok := err.(*exec.ExitError); !ok {

pkg/custompluginmonitor/types/types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ type CustomRule struct {
4848
Reason string `json:"reason"`
4949
// Path is the path to the custom plugin.
5050
Path string `json:"path"`
51+
// Args is the args passed to the custom plugin.
52+
Args []string `json:"args"`
5153
// Timeout is the timeout string for the custom plugin to execute.
5254
TimeoutString *string `json:"timeout"`
5355
// Timeout is the timeout for the custom plugin to execute.

pkg/logcounter/log_counter.go

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package logcounter
18+
19+
import (
20+
"fmt"
21+
"time"
22+
23+
"k8s.io/kubernetes/pkg/util/clock"
24+
25+
"k8s.io/node-problem-detector/cmd/logcounter/options"
26+
"k8s.io/node-problem-detector/pkg/logcounter/types"
27+
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
28+
"k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/kmsg"
29+
watchertypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/logwatchers/types"
30+
systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
31+
)
32+
33+
const (
34+
bufferSize = 1000
35+
timeout = 1 * time.Second
36+
)
37+
38+
type logCounter struct {
39+
logCh <-chan *systemtypes.Log
40+
buffer systemlogmonitor.LogBuffer
41+
pattern string
42+
clock clock.Clock
43+
}
44+
45+
func NewKmsgLogCounter(options *options.LogCounterOptions) (types.LogCounter, error) {
46+
watcher := kmsg.NewKmsgWatcher(watchertypes.WatcherConfig{Lookback: options.Lookback})
47+
logCh, err := watcher.Watch()
48+
if err != nil {
49+
return nil, fmt.Errorf("error watching kmsg: %v", err)
50+
}
51+
return &logCounter{
52+
logCh: logCh,
53+
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
54+
pattern: options.Pattern,
55+
clock: clock.RealClock{},
56+
}, nil
57+
}
58+
59+
func (e *logCounter) Count() (count int) {
60+
start := e.clock.Now()
61+
for {
62+
select {
63+
case log := <-e.logCh:
64+
// We only want to count events up until the time at which we started.
65+
// Otherwise we would run forever
66+
if start.Before(log.Timestamp) {
67+
return
68+
}
69+
e.buffer.Push(log)
70+
if len(e.buffer.Match(e.pattern)) != 0 {
71+
count++
72+
}
73+
case <-e.clock.After(timeout):
74+
// Don't block forever if we do not get any new messages
75+
return
76+
}
77+
}
78+
}

pkg/logcounter/log_counter_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
/*
2+
Copyright 2018 The Kubernetes Authors All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package logcounter
18+
19+
import (
20+
"testing"
21+
"time"
22+
23+
"k8s.io/kubernetes/pkg/util/clock"
24+
25+
"k8s.io/node-problem-detector/pkg/logcounter/types"
26+
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
27+
systemtypes "k8s.io/node-problem-detector/pkg/systemlogmonitor/types"
28+
)
29+
30+
func NewTestLogCounter(pattern string, startTime time.Time) (types.LogCounter, *clock.FakeClock, chan *systemtypes.Log) {
31+
logCh := make(chan *systemtypes.Log)
32+
clock := clock.NewFakeClock(startTime)
33+
return &logCounter{
34+
logCh: logCh,
35+
buffer: systemlogmonitor.NewLogBuffer(bufferSize),
36+
pattern: pattern,
37+
clock: clock,
38+
}, clock, logCh
39+
}
40+
41+
func TestCount(t *testing.T) {
42+
startTime := time.Now()
43+
for _, tc := range []struct {
44+
description string
45+
logs []*systemtypes.Log
46+
pattern string
47+
expectedCount int
48+
}{
49+
{
50+
description: "no logs",
51+
logs: []*systemtypes.Log{},
52+
pattern: "",
53+
expectedCount: 0,
54+
},
55+
{
56+
description: "one matching log",
57+
logs: []*systemtypes.Log{
58+
{
59+
Timestamp: startTime.Add(-time.Second),
60+
Message: "0",
61+
},
62+
},
63+
pattern: "0",
64+
expectedCount: 1,
65+
},
66+
{
67+
description: "one non-matching log",
68+
logs: []*systemtypes.Log{
69+
{
70+
Timestamp: startTime.Add(-time.Second),
71+
Message: "1",
72+
},
73+
},
74+
pattern: "0",
75+
expectedCount: 0,
76+
},
77+
{
78+
description: "log too new",
79+
logs: []*systemtypes.Log{
80+
{
81+
Timestamp: startTime.Add(time.Second),
82+
Message: "0",
83+
},
84+
},
85+
pattern: "0",
86+
expectedCount: 0,
87+
},
88+
{
89+
description: "many logs",
90+
logs: []*systemtypes.Log{
91+
{
92+
Timestamp: startTime.Add(-time.Second),
93+
Message: "0",
94+
},
95+
{
96+
Timestamp: startTime.Add(-time.Second),
97+
Message: "0",
98+
},
99+
{
100+
Timestamp: startTime.Add(-time.Second),
101+
Message: "1",
102+
},
103+
{
104+
Timestamp: startTime.Add(time.Second),
105+
Message: "0",
106+
},
107+
},
108+
pattern: "0",
109+
expectedCount: 2,
110+
},
111+
} {
112+
t.Run(tc.description, func(t *testing.T) {
113+
counter, fakeClock, logCh := NewTestLogCounter(tc.pattern, startTime)
114+
go func(logs []*systemtypes.Log, ch chan<- *systemtypes.Log) {
115+
for _, log := range logs {
116+
ch <- log
117+
}
118+
// trigger the timeout to ensure the test doesn't block permenantly
119+
for {
120+
fakeClock.Step(2 * timeout)
121+
}
122+
}(tc.logs, logCh)
123+
actualCount := counter.Count()
124+
if actualCount != tc.expectedCount {
125+
t.Errorf("got %d; expected %d", actualCount, tc.expectedCount)
126+
}
127+
})
128+
}
129+
}

0 commit comments

Comments
 (0)