Skip to content

Commit ea83111

Browse files
authored
Merge pull request #22 from Random-Liu/add-look-back
Kernel Monitor: Add look back support and kernel panic handling
2 parents 92a3c35 + 532f933 commit ea83111

File tree

12 files changed

+249
-114
lines changed

12 files changed

+249
-114
lines changed

config/kernel-monitor.json

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
{
22
"logPath": "/log/kern.log",
3+
"lookback": "10m",
4+
"startPattern": "Initializing cgroup subsys cpuset",
35
"bufferSize": 10,
46
"source": "kernel-monitor",
57
"conditions": [
@@ -20,6 +22,16 @@
2022
"reason": "TaskHung",
2123
"pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
2224
},
25+
{
26+
"type": "temporary",
27+
"reason": "KernelPanic",
28+
"pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
29+
},
30+
{
31+
"type": "temporary",
32+
"reason": "KernelPanic",
33+
"pattern": "divide error: 0000 \\[#\\d+\\] SMP"
34+
},
2335
{
2436
"type": "permanent",
2537
"condition": "KernelDeadlock",

demo/demo

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ rebootAll() {
3131

3232
reboot() {
3333
LATEST=`gcloud compute ssh -n root@$1 --zone=$ZONE "tail -1 $KERNLOG"`
34-
PREFIX=`echo $LATEST | cut -d "[" -f 1 -`"[0.000000]"
35-
runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX reboot' >> $KERNLOG\""
34+
PREFIX=`echo $LATEST | cut -d "]" -f 1 -`"]"
35+
runCmd "gcloud compute ssh -n root@$1 --zone=$ZONE \"echo '$PREFIX Initializing cgroup subsys cpuset' >> $KERNLOG\""
3636
}
3737

3838
case $1 in

demo/divide_zero

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
divide error: 0000 [#1] SMP
2+
Modules linked in: dm_thin_pool dm_persistent_data dm_bio_prison dm_bufio libcrc32c xt_statistic xt_nat xt_mark ipt_REJECT xt_tcpudp xt_comment loop veth binfmt_misc sch_htb ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc crc32_pclmul ppdev aesni_intel aes_x86_64 lrw gf128mul glue_helper ablk_helper cryptd evdev psmouse serio_raw parport_pc ttm parport drm_kms_helper drm i2c_piix4 i2c_core processor button thermal_sys autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod ata_generic crct10dif_pclmul crct10dif_common xen_netfront xen_blkfront crc32c_intel ata_piix libata scsi_mod floppy
3+
CPU: 1 PID: 1853 Comm: docker Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u4
4+
Hardware name: Xen HVM domU, BIOS 4.2.amazon 05/12/2016
5+
task: ffff8801e3657470 ti: ffff8801e47a8000 task.ti: ffff8801e47a8000
6+
RIP: 0010:[<ffffffffa0577200>] [<ffffffffa0577200>] pool_io_hints+0xf0/0x1a0 [dm_thin_pool]
7+
RSP: 0018:ffff8801e47abbc8 EFLAGS: 00010246
8+
RAX: 0000000000010000 RBX: ffff8801e4736840 RCX: ffff8801c2662000
9+
RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8801e48c4080
10+
RBP: ffff8801e47abc10 R08: 0000000000000000 R09: 0000000000000000
11+
R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffa057f5c8
12+
R13: 0000000000000001 R14: ffff8801e47abc90 R15: 0000000000000131
13+
FS: 00007ff465daf700(0000) GS:ffff8801efc20000(0000) knlGS:0000000000000000
14+
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
15+
CR2: 000000c207f1c3fb CR3: 00000001e2a5a000 CR4: 00000000001406e0
16+
Stack:
17+
ffffffff810a7c71 0000000043e06d70 ffffc9000115f040 0000000000000000
18+
0000000043e06d70 ffffc9000115f040 0000000000000000 ffff8800e9da3800
19+
ffffffffa00ba615 000fffffffffffff 00000000ffffffff 00000000000000ff
20+
Call Trace:
21+
[<ffffffff810a7c71>] ? complete+0x31/0x40
22+
[<ffffffffa00ba615>] ? dm_calculate_queue_limits+0x95/0x130 [dm_mod]
23+
[<ffffffffa00b7ec3>] ? dm_swap_table+0x73/0x320 [dm_mod]
24+
[<ffffffffa00b0101>] ? crc_t10dif_generic+0x101/0x1000 [crct10dif_common]
25+
[<ffffffffa00bd0d0>] ? table_load+0x330/0x330 [dm_mod]
26+
[<ffffffffa00bd165>] ? dev_suspend+0x95/0x220 [dm_mod]
27+
[<ffffffffa00bda55>] ? ctl_ioctl+0x205/0x430 [dm_mod]
28+
[<ffffffffa00bdc8f>] ? dm_ctl_ioctl+0xf/0x20 [dm_mod]
29+
[<ffffffff811ba99f>] ? do_vfs_ioctl+0x2cf/0x4b0
30+
[<ffffffff810d485e>] ? SyS_futex+0x6e/0x150
31+
[<ffffffff811bac01>] ? SyS_ioctl+0x81/0xa0
32+
[<ffffffff81513ecd>] ? system_call_fast_compare_end+0x10/0x15
33+
Code: 0f 84 a5 00 00 00 3b 96 10 06 00 00 49 c7 c4 c8 f5 57 a0 77 26 8b b6 18 06 00 00 89 d0 c1 e0 09 48 39 f0 0f 82 92 00 00 00 31 d2 <48> f7 f6 85 d2 74 2d 49 c7 c4 70 f5 57 a0 66 90 48 89 e6 e8 28
34+
RIP [<ffffffffa0577200>] pool_io_hints+0xf0/0x1a0 [dm_thin_pool]
35+
RSP <ffff8801e47abbc8>
36+
---[ end trace fcce781faebae9ce ]---

demo/null_pointer

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
BUG: unable to handle kernel NULL pointer dereference at 0000000000000078
2+
IP: [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
3+
PGD 0
4+
Oops: 0000 [#1] SMP
5+
Modules linked in: binfmt_misc ipt_REJECT xt_nat xt_tcpudp xt_multiport veth xt_conntrack ipt_MASQUERADE iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 xt_addrtype iptable_filter ip_tables x_tables nf_nat nf_conntrack bridge stp llc aufs(C) cpufreq_conservative cpufreq_powersave cpufreq_stats cpufreq_userspace xfs libcrc32c x86_pkg_temp_thermal intel_powerclamp intel_rapl coretemp kvm_intel kvm crc32_pclmul iTCO_wdt iTCO_vendor_support aesni_intel aes_x86_64 lrw gf128mul ttm glue_helper evdev ablk_helper cryptd drm_kms_helper drm serio_raw shpchp i2c_i801 lpc_ich mfd_core wmi ipmi_msghandler processor tpm_infineon thermal_sys tpm_tis tpm acpi_pad button autofs4 ext4 crc16 mbcache jbd2 btrfs xor raid6_pq dm_mod raid1 md_mod sg sd_mod crc_t10dif crct10dif_generic crct10dif_pclmul crct10dif_common crc32c_intel ehci_pci ahci igb libahci xhci_hcd ehci_hcd i2c_algo_bit i2c_core libata dca ptp usbcore scsi_mod pps_core usb_common
6+
CPU: 11 PID: 63 Comm: ksoftirqd/11 Tainted: G C 3.16.0-4-amd64 #1 Debian 3.16.7-ckt20-1+deb8u3
7+
Hardware name: ASUSTeK COMPUTER INC. Z10PA-U8 Series/Z10PA-U8 Series, BIOS 0601 06/26/2015
8+
task: ffff881fe1d0cb60 ti: ffff881fe1d18000 task.ti: ffff881fe1d18000
9+
RIP: 0010:[<ffffffff810a2c38>] [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
10+
RSP: 0018:ffff881fe1d1bde0 EFLAGS: 00010046
11+
RAX: 0000000000000000 RBX: ffff881fa84a3580 RCX: 0000000000000000
12+
RDX: 0000000000000001 RSI: ffff881fb5837628 RDI: ffff881d1fab2308
13+
RBP: ffff881fb5837600 R08: 0000000000000000 R09: 000000000000b9dc
14+
R10: 0000000000000004 R11: 0000000000000005 R12: 0000000000000000
15+
R13: 0000000000000000 R14: 0000000000000000 R15: ffff88207fd72f00
16+
FS: 0000000000000000(0000) GS:ffff88207fd60000(0000) knlGS:0000000000000000
17+
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
18+
CR2: 0000000000000078 CR3: 0000000001813000 CR4: 00000000001407e0
19+
Stack:
20+
ffff881d1fab2290 000000018109ef94 ffff881fe1d0cb60 0000000000012f00
21+
ffff88207fd72f78 ffffffff8101b975 ffff881fe1d0cfb8 ffff881fe1d0cb60
22+
ffff88207fd72f00 000000000000000b 0000000000000000 0000000000000000
23+
Call Trace:
24+
[<ffffffff8101b975>] ? sched_clock+0x5/0x10
25+
[<ffffffff8150fed6>] ? __schedule+0x106/0x700
26+
[<ffffffff8108ea86>] ? smpboot_thread_fn+0xc6/0x190
27+
[<ffffffff8108e9c0>] ? SyS_setgroups+0x170/0x170
28+
[<ffffffff8108805d>] ? kthread+0xbd/0xe0
29+
[<ffffffff81087fa0>] ? kthread_create_on_node+0x180/0x180
30+
[<ffffffff81513c58>] ? ret_from_fork+0x58/0x90
31+
[<ffffffff81087fa0>] ? kthread_create_on_node+0x180/0x180
32+
Code: 49 8b 7c 24 78 48 39 fd 74 2f 44 8b 73 68 45 8b 6c 24 68 45 39 ee 0f 8e c7 00 00 00 48 89 ef 48 89 de e8 fc 91 ff ff 48 8b 5b 70 <49> 8b 7c 24 78 48 8b 6b 78 48 39 fd 75 d1 48 85 ed 74 cc 4c 89
33+
RIP [<ffffffff810a2c38>] pick_next_task_fair+0x6b8/0x820
34+
RSP <ffff881fe1d1bde0>
35+
CR2: 0000000000000078
36+
---[ end trace 61f6991dc9ee5be0 ]---

node_problem_detector.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
)
2525

2626
var (
27-
kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor", "The path to the kernel monitor config file")
27+
kernelMonitorConfigPath = flag.String("kernel-monitor", "/config/kernel_monitor.json", "The path to the kernel monitor config file")
2828
)
2929

3030
func main() {

pkg/kernelmonitor/kernel_log_watcher.go

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,17 @@ package kernelmonitor
1818

1919
import (
2020
"bufio"
21+
"fmt"
2122
"os"
23+
"time"
2224

2325
"k8s.io/node-problem-detector/pkg/kernelmonitor/translator"
2426
"k8s.io/node-problem-detector/pkg/kernelmonitor/types"
2527
"k8s.io/node-problem-detector/pkg/kernelmonitor/util"
2628

2729
"github.com/golang/glog"
2830
"github.com/hpcloud/tail"
31+
utilclock "github.com/pivotal-golang/clock"
2932
)
3033

3134
const (
@@ -34,7 +37,12 @@ const (
3437

3538
// WatcherConfig is the configuration of kernel log watcher.
3639
type WatcherConfig struct {
40+
// KernelLogPath is the path to the kernel log
3741
KernelLogPath string `json:"logPath, omitempty"`
42+
// StartPattern is the pattern of the start line
43+
StartPattern string `json:"startPattern, omitempty"`
44+
// Lookback is the time kernel watcher looks up
45+
Lookback string `json:"lookback, omitempty"`
3846
}
3947

4048
// KernelLogWatcher watches and translates the kernel log. Once there is new log line,
@@ -53,6 +61,7 @@ type kernelLogWatcher struct {
5361
tl *tail.Tail
5462
logCh chan *types.KernelLog
5563
tomb *util.Tomb
64+
clock utilclock.Clock
5665
}
5766

5867
// NewKernelLogWatcher creates a new kernel log watcher.
@@ -63,6 +72,7 @@ func NewKernelLogWatcher(cfg WatcherConfig) KernelLogWatcher {
6372
tomb: util.NewTomb(),
6473
// A capacity 1000 buffer should be enough
6574
logCh: make(chan *types.KernelLog, 1000),
75+
clock: utilclock.NewClock(),
6676
}
6777
}
6878

@@ -144,39 +154,46 @@ func (k *kernelLogWatcher) watchLoop() {
144154
}
145155
}
146156

147-
// getStartPoint parses the newest kernel log file and try to find the latest reboot point.
148-
// Currently we rely on the kernel log timestamp to find the reboot point. The basic idea
149-
// is straight forward: In the whole lifecycle of a node, the kernel log timestamp should
150-
// always increase, only when it is reboot, the timestamp will decrease. We just parse the
151-
// log and find the latest timestamp decreasing, then it should be the latest reboot point.
152-
// TODO(random-liu): A drawback is that if the node is started long time ago, we'll only get
153-
// logs in the newest kernel log file. We may want to improve this in the future.
157+
// getStartPoint finds the start point to parse the log. The start point is either
158+
// the line at (now - lookback) or the first line of kernel log.
159+
// Notice that, kernel log watcher doesn't look back to the rolled out logs.
154160
func (k *kernelLogWatcher) getStartPoint(path string) (int64, error) {
155161
f, err := os.Open(path)
156162
if err != nil {
157-
return -1, err
163+
return 0, fmt.Errorf("failed to open file %q: %v", path, err)
158164
}
159165
defer f.Close()
166+
lookback, err := parseDuration(k.cfg.Lookback)
167+
if err != nil {
168+
return 0, fmt.Errorf("failed to parse duration %q: %v", k.cfg.Lookback, err)
169+
}
160170
start := int64(0)
161-
total := 0
162-
lastTimestamp := int64(0)
163171
reader := bufio.NewReader(f)
164172
done := false
165173
for !done {
166174
line, err := reader.ReadString('\n')
167175
if err != nil {
176+
if len(line) == 0 {
177+
// No need to continue parsing if nothing is read
178+
break
179+
}
168180
done = true
169181
}
170-
total += len(line)
171182
log, err := k.trans.Translate(line)
172183
if err != nil {
173184
glog.Infof("unable to parse line: %q, %v", line, err)
174-
continue
175-
}
176-
if log.Timestamp < lastTimestamp {
177-
start = int64(total - len(line))
185+
} else if k.clock.Since(log.Timestamp) <= lookback {
186+
break
178187
}
179-
lastTimestamp = log.Timestamp
188+
start += int64(len(line))
180189
}
181190
return start, nil
182191
}
192+
193+
func parseDuration(s string) (time.Duration, error) {
194+
// If the duration is not configured, just return 0 by default
195+
if s == "" {
196+
return 0, nil
197+
}
198+
return time.ParseDuration(s)
199+
}

pkg/kernelmonitor/kernel_log_watcher_test.go

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -24,50 +24,73 @@ import (
2424
"time"
2525

2626
"k8s.io/node-problem-detector/pkg/kernelmonitor/types"
27+
28+
"github.com/pivotal-golang/clock/fakeclock"
2729
)
2830

2931
func TestGetStartPoint(t *testing.T) {
32+
// now is a fake time
33+
now := time.Date(time.Now().Year(), time.January, 2, 3, 4, 5, 0, time.Local)
34+
fakeClock := fakeclock.NewFakeClock(now)
3035
testCases := []struct {
31-
log string
32-
logs []types.KernelLog
33-
err bool
36+
log string
37+
logs []types.KernelLog
38+
lookback string
3439
}{
3540
{
3641
// The start point is at the head of the log file.
37-
log: `kernel: [1.000000] 1
38-
kernel: [2.000000] 2
39-
kernel: [3.000000] 3
42+
log: `Jan 2 03:04:05 kernel: [0.000000] 1
43+
Jan 2 03:04:06 kernel: [1.000000] 2
44+
Jan 2 03:04:07 kernel: [2.000000] 3
4045
`,
4146
logs: []types.KernelLog{
4247
{
43-
Timestamp: 1000000,
48+
Timestamp: now,
4449
Message: "1",
4550
},
4651
{
47-
Timestamp: 2000000,
52+
Timestamp: now.Add(time.Second),
4853
Message: "2",
4954
},
5055
{
51-
Timestamp: 3000000,
56+
Timestamp: now.Add(2 * time.Second),
5257
Message: "3",
5358
},
5459
},
5560
},
5661
{
5762
// The start point is in the middle of the log file.
58-
log: `kernel: [3.000000] 3
59-
kernel: [1.000000] 1
60-
kernel: [2.000000] 2
63+
log: `Jan 2 03:04:04 kernel: [0.000000] 1
64+
Jan 2 03:04:05 kernel: [1.000000] 2
65+
Jan 2 03:04:06 kernel: [2.000000] 3
6166
`,
6267
logs: []types.KernelLog{
6368
{
64-
Timestamp: 1000000,
65-
Message: "1",
69+
Timestamp: now,
70+
Message: "2",
6671
},
6772
{
68-
Timestamp: 2000000,
73+
Timestamp: now.Add(time.Second),
74+
Message: "3",
75+
},
76+
},
77+
},
78+
{
79+
// The start point is at the end of the log file, but we look back.
80+
log: `Jan 2 03:04:03 kernel: [0.000000] 1
81+
Jan 2 03:04:04 kernel: [1.000000] 2
82+
Jan 2 03:04:05 kernel: [2.000000] 3
83+
`,
84+
lookback: "1s",
85+
logs: []types.KernelLog{
86+
{
87+
Timestamp: now.Add(-time.Second),
6988
Message: "2",
7089
},
90+
{
91+
Timestamp: now,
92+
Message: "3",
93+
},
7194
},
7295
},
7396
}
@@ -84,7 +107,9 @@ func TestGetStartPoint(t *testing.T) {
84107
if err != nil {
85108
t.Fatal(err)
86109
}
87-
w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name()})
110+
w := NewKernelLogWatcher(WatcherConfig{KernelLogPath: f.Name(), Lookback: test.lookback})
111+
// Set the fake clock.
112+
w.(*kernelLogWatcher).clock = fakeClock
88113
logCh, err := w.Watch()
89114
if err != nil {
90115
t.Fatal(err)

0 commit comments

Comments
 (0)