Skip to content

Commit 160499f

Browse files
authored
Merge pull request #10 from ruijzhan/features/oom-pod-name-new-check-fd
Features/oom pod name new check fd
2 parents 4bedb3c + 9ac45e8 commit 160499f

File tree

6 files changed

+208
-18
lines changed

6 files changed

+208
-18
lines changed

Dockerfile.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ RUN test -h /etc/localtime && rm -f /etc/localtime && cp /usr/share/zoneinfo/UTC
2828

2929
ADD ./bin/node-problem-detector /node-problem-detector
3030
ADD ./bin/log-counter /home/kubernetes/bin/log-counter
31+
ADD ./bin/check-fd /home/kubernetes/bin/check-fd
3132
ADD config /config
3233
RUN chmod +x /config/plugin/*.sh
3334
ENTRYPOINT ["/node-problem-detector", "--system-log-monitors=/config/kernel-monitor.json"]

Makefile

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,14 @@ version:
102102
$(BUILD_TAGS) \
103103
./cmd/nodeproblemdetector
104104

105+
./bin/check-fd: $(PKG_SOURCES)
106+
CGO_ENABLED=$(CGO_ENABLED) GOOS=linux GO111MODULE=on go build \
107+
-mod vendor \
108+
-o bin/check-fd \
109+
-ldflags '-X $(PKG)/pkg/version.version=$(VERSION)' \
110+
$(BUILD_TAGS) \
111+
./cmd/checkfd
112+
105113
Dockerfile: Dockerfile.in
106114
sed -e 's|@BASEIMAGE@|$(BASEIMAGE)|g' $< >$@
107115

@@ -118,12 +126,12 @@ e2e-test: vet fmt build-tar
118126
-boskos-project-type=$(BOSKOS_PROJECT_TYPE) -job-name=$(JOB_NAME) \
119127
-artifacts-dir=$(ARTIFACTS)
120128

121-
build-binaries: ./bin/node-problem-detector ./bin/log-counter
129+
build-binaries: ./bin/node-problem-detector ./bin/log-counter ./bin/check-fd
122130

123131
build-container: build-binaries Dockerfile
124132
docker build -t $(IMAGE) .
125133

126-
build-tar: ./bin/node-problem-detector ./bin/log-counter
134+
build-tar: ./bin/node-problem-detector ./bin/log-counter ./bin/check-fd
127135
tar -zcvf $(TARBALL) bin/ config/ test/e2e-install.sh
128136
sha1sum $(TARBALL)
129137
md5sum $(TARBALL)
@@ -150,4 +158,5 @@ push: push-container push-tar
150158
clean:
151159
rm -f bin/log-counter
152160
rm -f bin/node-problem-detector
161+
rm -f bin/check-fd
153162
rm -f node-problem-detector-*.tar.gz

cmd/checkfd/check_fd.go

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
package main
2+
3+
import (
4+
"flag"
5+
"fmt"
6+
"io/ioutil"
7+
"os"
8+
"regexp"
9+
"strconv"
10+
"strings"
11+
"sync"
12+
)
13+
14+
// Process exit codes. They mirror the OK/NONOK/UNKNOWN values used by
// config/plugin/network_problem.sh.
const (
	OK      = 0 // fd usage is at or below the threshold
	NONOK   = 1 // fd usage is above the threshold
	UNKNOWN = 2 // usage could not be determined (bad flag or unreadable proc files)
)

// maxConcurrentReads bounds how many per-process fd directories are read at
// once. Every read opens a directory, so an unbounded fan-out would itself
// consume a descriptor and a blocked OS thread per running process.
const maxConcurrentReads = 64

var (
	// procPath is the location of the proc filesystem to inspect (-p).
	procPath string
	// printCount additionally prints the raw usage and limit (-c).
	printCount bool
	// percentThreshold is the usage percentage above which NONOK is reported (-t).
	percentThreshold int

	// pidDirPattern matches names made up solely of decimal digits.
	pidDirPattern = regexp.MustCompile(`^[0-9]+$`)
)

// init registers the command line flags.
func init() {
	flag.StringVar(&procPath, "p", "/proc", "actual path of /proc")
	flag.BoolVar(&printCount, "c", false, "print numbers of used fd and max")
	flag.IntVar(&percentThreshold, "t", 80, "Warning threshold of percentage of fd usage to max limitation")
}

// main compares the number of entries found under <procPath>/<pid>/fd, summed
// over all numeric directories, against the value in <procPath>/sys/fs/file-max.
// It exits with NONOK when usage exceeds percentThreshold percent of that
// value, OK otherwise, and UNKNOWN when the inputs cannot be read or parsed.
func main() {
	flag.Parse()

	if percentThreshold >= 100 || percentThreshold <= 0 {
		fmt.Fprintln(os.Stderr, "value of -t must be between 1 and 99")
		os.Exit(UNKNOWN)
	}

	fdMax, err := readFDMax(procPath + "/sys/fs/file-max")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(UNKNOWN)
	}

	fdTotal, err := totalFDs(procPath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(UNKNOWN)
	}

	if printCount {
		fmt.Fprintf(os.Stdout, "current fd usage is %d and limitaion is %d\n", fdTotal, fdMax)
	}
	if fdTotal > usageLimit(fdMax, percentThreshold) {
		fmt.Fprintf(os.Stdout, "current fd usage is %d and is over %d%% of limition %d, \n",
			fdTotal, percentThreshold, fdMax)
		os.Exit(NONOK)
	}
	fmt.Fprintf(os.Stdout, "node has no fd pressure\n")
	os.Exit(OK)
}

// readFDMax reads the file at path and parses its whitespace-trimmed content
// as a base-10 integer. A 64-bit parse is used so that very large limits are
// accepted regardless of the platform's int size.
func readFDMax(path string) (int64, error) {
	raw, err := ioutil.ReadFile(path)
	if err != nil {
		return 0, err
	}
	return strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
}

// isPIDDir reports whether name consists only of decimal digits.
func isPIDDir(name string) bool {
	return pidDirPattern.MatchString(name)
}

// usageLimit returns floor(fdMax*percent/100). The quotient and remainder of
// fdMax/100 are scaled separately: dividing first (fdMax/100*percent) throws
// away up to 99 descriptors' worth of limit before multiplying, while
// multiplying first (fdMax*percent) can overflow for large fdMax.
func usageLimit(fdMax int64, percent int) int64 {
	p := int64(percent)
	return fdMax/100*p + fdMax%100*p/100
}

// totalFDs sums the entry counts of <root>/<pid>/fd over every all-digit
// directory directly under root. Directories whose fd listing cannot be read
// contribute nothing to the sum. An error is returned only when root itself
// cannot be listed.
func totalFDs(root string) (int64, error) {
	entries, err := ioutil.ReadDir(root)
	if err != nil {
		return 0, err
	}

	// Buffered to the number of candidates so that workers never block on
	// send while the loop below is still waiting for a free slot.
	counts := make(chan int, len(entries))
	slots := make(chan struct{}, maxConcurrentReads)
	var wg sync.WaitGroup
	for _, e := range entries {
		if !e.IsDir() || !isPIDDir(e.Name()) {
			continue
		}
		wg.Add(1)
		slots <- struct{}{}
		go func(fdDir string) {
			defer func() { <-slots }()
			countFD(fdDir, &wg, counts)
		}(root + "/" + e.Name() + "/fd")
	}
	wg.Wait()
	close(counts)

	var total int64
	for c := range counts {
		total += int64(c)
	}
	return total, nil
}

// countFD sends the number of entries in the directory at path on fNum and
// marks one unit of work done on wg. If the directory cannot be read nothing
// is sent. A missing directory is skipped silently, since the process may
// simply have exited after the parent listing was taken; any other error is
// reported on stderr.
func countFD(path string, wg *sync.WaitGroup, fNum chan<- int) {
	defer wg.Done()
	files, err := ioutil.ReadDir(path)
	if err != nil {
		if !os.IsNotExist(err) {
			fmt.Fprintln(os.Stderr, err)
		}
		return
	}
	fNum <- len(files)
}

cmd/nodeproblemdetector/node_problem_detector.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ limitations under the License.
1717
package main
1818

1919
import (
20+
"k8s.io/node-problem-detector/pkg/systemlogmonitor"
2021
"os"
2122

2223
"github.com/golang/glog"
@@ -55,6 +56,11 @@ func main() {
5556
glog.Fatalf("No problem daemon is configured")
5657
}
5758

59+
if c := systemlogmonitor.InitK8sClientOrDie(npdo); c != nil {
60+
glog.Info("System Log Monitor K8S client initialized")
61+
} else {
62+
glog.Error("Failed to initialize System Log Monitor K8S client")
63+
}
5864
// Initialize exporters.
5965
defaultExporters := []types.Exporter{}
6066
if ke := k8sexporter.NewExporterOrDie(npdo); ke != nil {

config/plugin/network_problem.sh

Lines changed: 28 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,36 @@
11
#!/bin/bash
22

3-
# This plugin checks for common network issues. Currently, it only checks
4-
# if the conntrack table is full.
3+
# This plugin checks for common network issues.
4+
# Currently only checks if conntrack table is more than 90% used.
55

6-
OK=0
7-
NONOK=1
8-
UNKNOWN=2
6+
readonly OK=0
7+
readonly NONOK=1
8+
readonly UNKNOWN=2
99

10-
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_max ] || exit $UNKNOWN
11-
[ -f /proc/sys/net/ipv4/netfilter/ip_conntrack_count ] || exit $UNKNOWN
10+
# "nf_conntrack" replaces "ip_conntrack" - support both
11+
readonly NF_CT_COUNT_PATH='/proc/sys/net/netfilter/nf_conntrack_count'
12+
readonly NF_CT_MAX_PATH='/proc/sys/net/netfilter/nf_conntrack_max'
13+
readonly IP_CT_COUNT_PATH='/proc/sys/net/ipv4/netfilter/ip_conntrack_count'
14+
readonly IP_CT_MAX_PATH='/proc/sys/net/ipv4/netfilter/ip_conntrack_max'
1215

13-
conntrack_max=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_max)
14-
conntrack_count=$(cat /proc/sys/net/ipv4/netfilter/ip_conntrack_count)
15-
16-
if (( conntrack_count >= conntrack_max )); then
17-
echo "Conntrack table full"
18-
exit $NONOK
16+
if [[ -f $NF_CT_COUNT_PATH ]] && [[ -f $NF_CT_MAX_PATH ]]; then
17+
readonly CT_COUNT_PATH=$NF_CT_COUNT_PATH
18+
readonly CT_MAX_PATH=$NF_CT_MAX_PATH
19+
elif [[ -f $IP_CT_COUNT_PATH ]] && [[ -f $IP_CT_MAX_PATH ]]; then
20+
readonly CT_COUNT_PATH=$IP_CT_COUNT_PATH
21+
readonly CT_MAX_PATH=$IP_CT_MAX_PATH
22+
else
23+
exit $UNKNOWN
1924
fi
2025

21-
echo "Conntrack table available"
22-
exit $OK
26+
# Read the current and maximum conntrack entry counts.
# The assignments are kept separate from `readonly`: in
# `readonly var=$(cmd) || exit`, the status that is tested is the one of the
# `readonly` builtin (success), so a failed read would never reach `exit`.
conntrack_count=$(< "$CT_COUNT_PATH") || exit $UNKNOWN
conntrack_max=$(< "$CT_MAX_PATH") || exit $UNKNOWN
readonly conntrack_count conntrack_max

# Anything other than plain decimal digits cannot be compared arithmetically.
if ! [[ $conntrack_count =~ ^[0-9]+$ && $conntrack_max =~ ^[0-9]+$ ]]; then
  exit $UNKNOWN
fi

readonly conntrack_usage_msg="${conntrack_count} out of ${conntrack_max}"

# Report a problem once more than nine tenths of the table is in use.
if (( conntrack_count > conntrack_max * 9 / 10 )); then
  echo "Conntrack table usage over 90%: ${conntrack_usage_msg}"
  exit $NONOK
else
  echo "Conntrack table usage: ${conntrack_usage_msg}"
  exit $OK
fi

pkg/systemlogmonitor/log_monitor.go

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,16 @@ package systemlogmonitor
1919
import (
2020
"encoding/json"
2121
"io/ioutil"
22+
"k8s.io/heapster/common/kubernetes"
23+
clientset "k8s.io/client-go/kubernetes"
24+
"k8s.io/node-problem-detector/cmd/options"
25+
"net/url"
26+
"os"
27+
"path/filepath"
28+
"regexp"
29+
"strings"
2230
"time"
31+
"fmt"
2332

2433
"github.com/golang/glog"
2534

@@ -32,9 +41,19 @@ import (
3241
"k8s.io/node-problem-detector/pkg/types"
3342
"k8s.io/node-problem-detector/pkg/util"
3443
"k8s.io/node-problem-detector/pkg/util/tomb"
44+
"k8s.io/node-problem-detector/pkg/version"
45+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3546
)
3647

37-
const SystemLogMonitorName = "system-log-monitor"
48+
const (
49+
SystemLogMonitorName = "system-log-monitor"
50+
OOMREASON = "PodOOMKilling"
51+
)
52+
53+
var (
54+
uuidRegx *regexp.Regexp
55+
k8sClient *clientset.Clientset
56+
)
3857

3958
func init() {
4059
problemdaemon.Register(
@@ -44,6 +63,10 @@ func init() {
4463
CmdOptionDescription: "Set to config file paths."})
4564
}
4665

66+
func init() {
67+
uuidRegx = regexp.MustCompile("[0-9a-f]{8}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{4}_[0-9a-f]{12}")
68+
}
69+
4770
type logMonitor struct {
4871
configPath string
4972
watcher watchertypes.LogWatcher
@@ -55,6 +78,17 @@ type logMonitor struct {
5578
tomb *tomb.Tomb
5679
}
5780

81+
func InitK8sClientOrDie(options *options.NodeProblemDetectorOptions) *clientset.Clientset{
82+
uri, _ := url.Parse(options.ApiServerOverride)
83+
cfg, err := kubernetes.GetKubeClientConfig(uri)
84+
if err != nil {
85+
panic(err)
86+
}
87+
cfg.UserAgent = fmt.Sprintf("%s/%s", filepath.Base(os.Args[0]), version.Version())
88+
k8sClient = clientset.NewForConfigOrDie(cfg)
89+
return k8sClient
90+
}
91+
5892
// NewLogMonitorOrDie create a new LogMonitor, panic if error occurs.
5993
func NewLogMonitorOrDie(configPath string) types.Monitor {
6094
l := &logMonitor{
@@ -167,6 +201,22 @@ func (l *logMonitor) generateStatus(logs []*logtypes.Log, rule systemlogtypes.Ru
167201
// We use the timestamp of the first log line as the timestamp of the status.
168202
timestamp := logs[0].Timestamp
169203
message := generateMessage(logs)
204+
if rule.Reason == OOMREASON && k8sClient != nil{
205+
uuid := string(uuidRegx.Find([]byte(message)))
206+
uuid = strings.ReplaceAll(uuid,"_","-")
207+
pl, err := k8sClient.CoreV1().Pods("").List(metav1.ListOptions{})
208+
if err != nil {
209+
glog.Error("Error in getting pods: %v", err.Error())
210+
} else {
211+
for _, pod := range pl.Items {
212+
if string(pod.UID) == uuid {
213+
message = fmt.Sprintf("pod was OOM killed. node:%s pod:%s namespace:%s uuid:%s",
214+
pod.Spec.NodeName, pod.Name, pod.Namespace, uuid)
215+
break
216+
}
217+
}
218+
}
219+
}
170220
var events []types.Event
171221
var changedConditions []*types.Condition
172222
if rule.Type == types.Temp {

0 commit comments

Comments
 (0)