Skip to content

Commit f031265

Browse files
committed
Add first version of node-problem-detector
1 parent 802acee commit f031265

31 files changed

+2370
-0
lines changed

Dockerfile

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Copyright 2016 The Kubernetes Authors All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Image for the node-problem-detector binary. The binary must already exist
# in the build context (the Makefile's `container` target builds it first).
FROM alpine:3.1
MAINTAINER Random Liu <[email protected]>

# COPY instead of ADD: only a plain local file is placed in the image, so the
# archive-extraction and URL-fetching behaviour of ADD is not wanted here.
# No ENTRYPOINT/CMD is set; the DaemonSet manifest supplies the command.
COPY node-problem-detector /node-problem-detector

Makefile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Build, containerize and publish the node-problem-detector image.
#
#   make                        build the binary and the image, then push it
#   make node-problem-detector  build only the Linux binary
#   make container              build the Docker image from the binary
#   make push                   push the image to the registry
#   make clean                  remove the built binary

# These targets name actions, not files. Declaring them phony keeps a stray
# file called e.g. "clean" or "push" from silently disabling the target.
.PHONY: all container push clean

all: push

# Image tag. node-problem-detector.yaml pins the tag that is deployed --
# bump this ahead of that version before rebuilding!
TAG = 0.1

# TODO(random-liu): Change the project to google_containers.
PROJ = google.com/noogler-kubernetes

# Full image reference, shared by the container and push targets so the two
# can never drift apart.
IMAGE = gcr.io/$(PROJ)/node-problem-detector:$(TAG)

# Linux binary built with cgo disabled; $@ is the target name, i.e. the
# output file node-problem-detector.
node-problem-detector: node_problem_detector.go
	CGO_ENABLED=0 GOOS=linux godep go build -a -installsuffix cgo -ldflags '-w' -o $@

# The Dockerfile copies the binary from the build context, hence the
# prerequisite.
container: node-problem-detector
	docker build -t $(IMAGE) .

push: container
	gcloud docker push $(IMAGE)

clean:
	rm -f node-problem-detector

config/kernel-monitor.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"logPath": "/log/kern.log",
3+
"bufferSize": 50,
4+
"rules": [
5+
{
6+
"type": "temporary",
7+
"reason": "OOMKilling",
8+
"pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB"
9+
},
10+
{
11+
"type": "temporary",
12+
"reason": "TaskHung",
13+
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
14+
},
15+
{
16+
"type": "permanent",
17+
"reason": "AUFSUmountHung",
18+
"pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
19+
},
20+
{
21+
"type": "permanent",
22+
"reason": "DockerHung",
23+
"pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
24+
},
25+
{
26+
"type": "permanent",
27+
"reason": "KernelBug",
28+
"pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
29+
}
30+
]
31+
}

demo/au_opts_verify

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
aufs au_opts_verify:1570:docker[0000]: dirperm1 breaks the protection by the permission bits on the lower branch

demo/aufs_umount_hung

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
INFO: task umount.aufs:21568 blocked for more than 120 seconds.
2+
Tainted: G C 3.16.0-4-amd64 #1
3+
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
4+
umount.aufs D ffff8802111a9848 0 21568 21567 0x00000000
5+
ffff8802111a93f0 0000000000000086 0000000000012f00 ffff8800baa0ffd8
6+
0000000000012f00 ffff8802111a93f0 ffff8802111a93f0 ffff8800baa0fdd8
7+
ffff8802149ef038 ffff8802149ef020 ffffffff00000000 ffff8802149ef028
8+
Call Trace:
9+
[<ffffffff81512d45>] ? rwsem_down_write_failed+0x1d5/0x320
10+
[<ffffffff812b7d13>] ? call_rwsem_down_write_failed+0x13/0x20
11+
[<ffffffff815126b9>] ? down_write+0x29/0x40
12+
[<ffffffffa03556b1>] ? si_write_lock+0x31/0x110 [aufs]
13+
[<ffffffff811b83ca>] ? do_filp_open+0x3a/0x90
14+
[<ffffffffa0374e50>] ? au_plink_maint_enter+0x20/0xd0 [aufs]
15+
[<ffffffffa037473d>] ? au_procfs_plm_write+0x13d/0x200 [aufs]
16+
[<ffffffff81207069>] ? proc_reg_write+0x39/0x70
17+
[<ffffffff811a8562>] ? vfs_write+0xb2/0x1f0
18+
[<ffffffff811a90a2>] ? SyS_write+0x42/0xa0
19+
[<ffffffff81513d0d>] ? system_call_fast_compare_end+0x10/0x15

demo/demo

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/bin/bash
2+
3+
ZONE=us-central1-b
4+
KERNLOG=/var/log/kern.log
5+
6+
# Print the command summary to stdout and terminate the script with status 1.
#
# Every caller invokes this on an error path (missing argument or unknown
# command). A bare `exit` would return the status of the preceding output
# command, i.e. 0, and report success for a failed invocation.
usage () {
  cat <<EOF
Usage : $(basename "$0") COMMAND [arg...]
Commands:
create: create node problem detector daemon.
nodes: describe all nodes or specific node.
pods: describe all pods or specific pod.
inject: inject error kernel log into specific node.
reboot: generate a fake reboot log on specific node.
delete: delete node problem detector daemon.
EOF
  exit 1
}
17+
18+
# Print a command line, then execute it in the current shell.
#
#   $1 - the complete command line, passed as one string.
#
# The argument is quoted in both places so it reaches `eval` exactly as the
# caller built it. Unquoted, it would first undergo word splitting and
# pathname expansion, which collapses runs of whitespace and can expand
# characters such as `[`, `?` or `*` against files in the current directory.
# Returns the exit status of the evaluated command.
runCmd() {
  printf '%s\n' "$1"
  eval "$1"
}
22+
23+
# Write a fake reboot entry into the kernel log of every node in the cluster.
#
# Node names are taken from the first column of `kubectl get nodes`, skipping
# the header row (NR>1). The list goes through a temporary file so the loop
# reads from that file and not from a pipeline.
rebootAll() {
  local node_list node
  node_list=$(mktemp) || return 1
  kubectl get nodes | awk 'NR>1{print $1}' > "$node_list"
  while read -r node; do
    reboot "$node"
  done < "$node_list"
  rm -f "$node_list"
}
31+
32+
# Append a fake "reboot" line to the kernel log of one node.
#
#   $1 - name of the node (GCE instance) to write to.
#
# The last line of $KERNLOG on the node is fetched and everything before its
# first "[" is reused as the prefix, followed by a "[0.000000]" timestamp.
# `gcloud compute ssh` runs with -n so it does not read stdin; rebootAll calls
# this function from inside a `while read` loop.
reboot() {
  local latest prefix
  latest=$(gcloud compute ssh -n "root@$1" --zone="$ZONE" "tail -1 $KERNLOG")
  # Quote the log line: unquoted, its "[...]" part is subject to pathname
  # expansion and its whitespace is collapsed before cut ever sees it.
  prefix="$(echo "$latest" | cut -d "[" -f 1 -)[0.000000]"
  runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$prefix reboot' >> $KERNLOG\""
}
37+
38+
# Command dispatch: $1 selects the sub-command, remaining arguments are
# sub-command specific. Paths such as ../config/ are relative, so the script
# is expected to be run from its own directory.
case "$1" in
  create )
    # Publish ../config/ as the ConfigMap mounted by the DaemonSet, then
    # create the DaemonSet itself.
    runCmd "kubectl create configmap node-problem-detector-config --from-file=../config/"
    runCmd "kubectl create -f ../node-problem-detector.yaml --validate=false"
    ;;
  nodes )
    # $2 is optional: without it all nodes are described.
    runCmd "kubectl describe nodes $2"
    ;;
  pods )
    # $2 is optional: without it all pods are described.
    runCmd "kubectl describe pods $2"
    ;;
  inject )
    # inject FILE NODE: append every line of FILE ($2) to the kernel log of
    # NODE ($3), each prefixed like the newest existing log line.
    if [ -z "$3" ]; then
      usage
      exit 1
    fi
    NODE=$3
    LATEST=$(gcloud compute ssh "root@$NODE" --zone="$ZONE" "tail -1 $KERNLOG")
    # Keep everything up to and including the first "]" (the kernel
    # timestamp). The line is quoted so its brackets are not glob-expanded.
    PREFIX="$(echo "$LATEST" | cut -d "]" -f 1 -)]"
    # %q shell-escapes the text so it survives the remote shell unchanged.
    PREFIX=$(printf "%q" "$PREFIX")
    COMMAND=
    # -r keeps backslashes in the sample log lines intact.
    while read -r error
    do
      ERROR=$(printf "%q" "$error")
      COMMAND=$COMMAND"echo $PREFIX $ERROR >> $KERNLOG; "
    done < "$2"
    # NOTE(review): COMMAND is wrapped in single quotes here, so an injected
    # line whose %q form contains a single quote would break the command.
    runCmd "gcloud compute ssh root@$NODE --zone=$ZONE '$COMMAND'"
    ;;
  reboot )
    # reboot NODE: write a fake reboot entry on NODE ($2).
    if [ -z "$2" ]; then
      usage
      exit 1
    fi
    reboot "$2"
    ;;
  delete )
    # Remove what `create` set up: the DaemonSet first, then the ConfigMap.
    runCmd "kubectl delete -f ../node-problem-detector.yaml"
    runCmd "kubectl delete configmap node-problem-detector-config"
    ;;
  * )
    usage
    exit 1
    ;;
esac

demo/docker_hung

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
INFO: task docker:20744 blocked for more than 120 seconds.
2+
Tainted: G C 3.16.0-4-amd64 #1
3+
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
4+
docker D ffff8801a8f2b078 0 20744 1 0x00000000
5+
ffff8801a8f2ac20 0000000000000082 0000000000012f00 ffff880057a17fd8
6+
0000000000012f00 ffff8801a8f2ac20 ffffffff818bb4a0 ffff880057a17d80
7+
ffffffff818bb4a4 ffff8801a8f2ac20 00000000ffffffff ffffffff818bb4a8
8+
Call Trace:
9+
[<ffffffff81510915>] ? schedule_preempt_disabled+0x25/0x70
10+
[<ffffffff815123c3>] ? __mutex_lock_slowpath+0xd3/0x1c0
11+
[<ffffffff815124cb>] ? mutex_lock+0x1b/0x2a
12+
[<ffffffff814175bc>] ? copy_net_ns+0x6c/0x130
13+
[<ffffffff8108bdf4>] ? create_new_namespaces+0xf4/0x180
14+
[<ffffffff8108beec>] ? copy_namespaces+0x6c/0x90
15+
[<ffffffff810654f6>] ? copy_process.part.25+0x966/0x1c30
16+
[<ffffffff81066991>] ? do_fork+0xe1/0x390
17+
[<ffffffff811c442c>] ? __alloc_fd+0x7c/0x120
18+
[<ffffffff81514079>] ? stub_clone+0x69/0x90
19+
[<ffffffff81513d0d>] ? system_call_fast_compare_end+0x10/0x15

demo/oom_kill

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Memory cgroup out of memory: Kill process 1012 (heapster) score 1035 or sacrifice child
2+
Killed process 1012 (heapster) total-vm:327128kB, anon-rss:306328kB, file-rss:11132kB

demo/unregister_netdevice

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
unregister_netdevice: waiting for lo to become free. Usage count = 1

node-problem-detector.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# DaemonSet manifest for node-problem-detector.
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-problem-detector
spec:
  template:
    metadata:
      labels:
        app: node-problem-detector
    spec:
      hostNetwork: true
      containers:
      - name: node-problem-detector
        command:
        - /node-problem-detector
        # Read from the "config" volume mounted at /config below.
        - --kernel-monitor=/config/kernel-monitor.json
        # Keep the tag in step with TAG in the Makefile.
        image: gcr.io/google.com/noogler-kubernetes/node-problem-detector:0.1
        imagePullPolicy: Always
        env:
        # Host and port of the apiserver the detector talks to.
        - name: "KUBERNETES_SERVICE_HOST"
          value: "e2e-test-lantaol-master"
        - name: "KUBERNETES_SERVICE_PORT"
          value: "443"
        securityContext:
          privileged: true
        volumeMounts:
        # Host /var/log/, read-only; kernel-monitor.json reads /log/kern.log.
        - name: log
          mountPath: /log
          readOnly: true
        # Monitor configuration from the ConfigMap, read-only.
        - name: config
          mountPath: /config
          readOnly: true
      volumes:
      - name: log
        hostPath:
          path: /var/log/
      - name: config
        configMap:
          # Created by `demo create` from the ../config/ directory.
          name: node-problem-detector-config

0 commit comments

Comments
 (0)