Skip to content

Commit 3296cc2

Browse files
authored
Merge pull request #28 from PERBILITY/master
fixes #27 and several other improvements
2 parents b2ed408 + b54b2b7 commit 3296cc2

File tree

2 files changed

+29
-88
lines changed

2 files changed

+29
-88
lines changed

account.yaml

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ metadata:
1313

1414
---
1515
kind: ClusterRole
16-
apiVersion: rbac.authorization.k8s.io/v1beta1
16+
apiVersion: rbac.authorization.k8s.io/v1
1717
metadata:
1818
name: monitoring
1919
namespace: monitoring
@@ -23,8 +23,7 @@ rules:
2323
resources:
2424
- pods
2525
- nodes
26-
- componentstatuses
27-
- secrets
26+
# - secrets
2827
- persistentvolumes
2928
verbs:
3029
- get
@@ -49,7 +48,7 @@ rules:
4948
- list
5049

5150
---
52-
apiVersion: rbac.authorization.k8s.io/v1beta1
51+
apiVersion: rbac.authorization.k8s.io/v1
5352
kind: ClusterRoleBinding
5453
metadata:
5554
name: monitoring

check_kubernetes.sh

Lines changed: 26 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -33,7 +33,6 @@ usage() {
3333
- Unbound Persistent Volumes in unboundpvs mode; default is 5
3434
- Job failed count in jobs mode; default is 2
3535
- Pvc storage utilization; default is 90%
36-
-b Brief mode (more suitable for Zabbix)
3736
-M EXIT_CODE Exit code when resource is missing; default is 2 (CRITICAL)
3837
-h Show this help and exit
3938
@@ -49,25 +48,20 @@ usage() {
4948
tls Check for tls secrets expiration dates
5049
pvc Check for pvc utilization
5150
unboundpvs Check for unbound persistent volumes
52-
components Check for health of k8s components (deprecated in K8s 1.19+)
5351
EOF
5452

5553
exit 2
5654
}
5755

58-
BRIEF=0
5956
TIMEOUT=15
57+
unset NAME
6058

6159
die() {
62-
if [ "$BRIEF" = 1 ]; then
63-
echo "-1"
64-
else
6560
echo "$1"
66-
fi
6761
exit "${2:-2}"
6862
}
6963

70-
while getopts ":m:M:H:T:t:K:N:n:o:c:w:bh" arg; do
64+
while getopts ":m:M:H:T:t:K:N:n:o:c:w:h" arg; do
7165
case $arg in
7266
h) usage ;;
7367
m) MODE="$OPTARG" ;;
@@ -81,7 +75,6 @@ while getopts ":m:M:H:T:t:K:N:n:o:c:w:bh" arg; do
8175
n) NAME="$OPTARG" ;;
8276
w) WARN="$OPTARG" ;;
8377
c) CRIT="$OPTARG" ;;
84-
b) BRIEF=1 ;;
8578
*) usage ;;
8679
esac
8780
done
@@ -151,7 +144,7 @@ mode_apiserver() {
151144
data=$(getJSON "" "healthz")
152145
[ $? -gt 0 ] && die "$data"
153146
if [ "$data" = ok ]; then
154-
OUTPUT="OK. Kubernetes apiserver health is OK"
147+
OUTPUT="OK. Kubernetes apiserver is healthy"
155148
EXITCODE=0
156149
else
157150
data=$(echo "$data" | grep "\[\-\]")
@@ -171,15 +164,15 @@ mode_nodes() {
171164
.status")"
172165
if [ "$ready" != True ]; then
173166
EXITCODE=2
174-
OUTPUT="${OUTPUT}Node $node not ready. "
167+
OUTPUT="ERROR. ${OUTPUT}Node $node not ready\n"
175168
fi
176169
for condition in OutOfDisk MemoryPressure DiskPressure; do
177170
state="$(echo "$data" | jq -r ".items[] | select(.metadata.name==\"$node\") | \
178171
.status.conditions[] | select(.type==\"$condition\") | \
179172
.status")"
180173
if [ "$state" = True ]; then
181174
[ $EXITCODE -lt 1 ] && EXITCODE=1
182-
OUTPUT="$OUTPUT $node $condition."
175+
OUTPUT="WARN. ${OUTPUT} $node $condition\n"
183176
fi
184177
done
185178
done
@@ -189,43 +182,8 @@ mode_nodes() {
189182
OUTPUT="No nodes found"
190183
EXITCODE="$MISSING_EXITCODE"
191184
else
192-
OUTPUT="OK. ${#nodes[@]} nodes are Ready"
193-
BRIEF_OUTPUT="${#nodes[@]}"
185+
OUTPUT="OK. ${#nodes[@]} nodes are ready"
194186
fi
195-
else
196-
BRIEF_OUTPUT="-1"
197-
fi
198-
}
199-
200-
mode_components() {
201-
healthy_comps=""
202-
unhealthy_comps=""
203-
data="$(getJSON "get cs" "api/v1/componentstatuses")"
204-
[ $? -gt 0 ] && die "$data"
205-
components=($(echo "$data" | jq -r ".items[].metadata.name"))
206-
207-
for comp in "${components[@]}"; do
208-
healthy=$(echo "$data" | jq -r ".items[] | select(.metadata.name==\"$comp\") | \
209-
.conditions[] | select(.type==\"Healthy\") | \
210-
.status")
211-
if [ "$healthy" != True ]; then
212-
EXITCODE=2
213-
unhealthy_comps="$unhealthy_comps $comp"
214-
else
215-
healthy_comps="$healthy_comps $comp"
216-
fi
217-
done
218-
219-
BRIEF_OUTPUT="$healthy_comps"
220-
if [ $EXITCODE = 0 ]; then
221-
if [ -z "${components[*]}" ]; then
222-
OUTPUT="No components found"
223-
EXITCODE="$MISSING_EXITCODE"
224-
else
225-
OUTPUT="OK. Healthy: $healthy_comps"
226-
fi
227-
else
228-
OUTPUT="CRITICAL. Unhealthy: $unhealthy_comps; Healthy: $healthy_comps"
229187
fi
230188
}
231189

@@ -246,9 +204,7 @@ mode_unboundpvs() {
246204
select(.status.phase!=\"Bound\") | \
247205
\"\(.metadata.name):\(.status.phase):\(.spec.claimRef.uid)\"")
248206

249-
BRIEF_OUTPUT="${#pvsArr[*]}"
250207
if [ ${#unboundPvsArr[*]} -gt 0 ]; then
251-
BRIEF_OUTPUT="-${#unboundPvsArr[*]}"
252208
if [ ${#unboundPvsArr[*]} -ge "$CRIT" ]; then
253209
OUTPUT="CRITICAL. Unbound persistentvolumes:\n$OUTPUT"
254210
EXITCODE=2
@@ -387,7 +343,6 @@ mode_tls() {
387343
done
388344
done
389345

390-
BRIEF_OUTPUT="$count_ok"
391346
if [ $EXITCODE = 0 ]; then
392347
if [ -z "$ns" ]; then
393348
OUTPUT="No TLS certs found"
@@ -465,28 +420,30 @@ mode_pods() {
465420
else
466421
((count_failed++))
467422
fi
423+
if [ "$restart_count" -ge "$WARN" ]; then
424+
OUTPUT="${OUTPUT}Container $bad_container: $restart_count restarts.\n"
425+
EXITCODE=1
426+
if [ "$restart_count" -ge "$CRIT" ]; then
427+
EXITCODE=2
428+
fi
429+
fi
468430
done
469431
done
470432

471-
if [ "$max_restart_count" -ge "$WARN" ]; then
472-
BRIEF_OUTPUT="-$max_restart_count"
473-
else
474-
BRIEF_OUTPUT="$count_ready"
475-
fi
476-
433+
if [ $EXITCODE = 0 ]; then
477434
if [ -z "$ns" ]; then
478435
OUTPUT="No pods found"
479436
EXITCODE="$MISSING_EXITCODE"
480437
else
481-
if [ "$max_restart_count" -ge "$WARN" ]; then
482-
OUTPUT="Container $bad_container: $max_restart_count restarts. "
483-
EXITCODE=1
484-
if [ "$max_restart_count" -ge "$CRIT" ]; then
485-
EXITCODE=2
438+
OUTPUT="OK. $count_ready pods ready, $count_succeeded pods succeeded, $count_failed pods not ready\n${OUTPUT}"
439+
fi
440+
else
441+
if [ $EXITCODE = 1 ]; then
442+
OUTPUT="WARNING. $count_ready pods ready, $count_succeeded pods succeeded, $count_failed pods not ready\n${OUTPUT}"
443+
else
444+
OUTPUT="ERROR. $count_ready pods ready, $count_succeeded pods succeeded, $count_failed pods not ready\n${OUTPUT}"
486445
fi
487446
fi
488-
OUTPUT="$OUTPUT$count_ready pods ready, $count_succeeded pods succeeded, $count_failed pods not ready"
489-
fi
490447
}
491448

492449
mode_deployments() {
@@ -525,7 +482,6 @@ mode_deployments() {
525482
done
526483
done
527484

528-
BRIEF_OUTPUT="$count_avail"
529485
if [ $EXITCODE = 0 ]; then
530486
if [ -z "$ns" ]; then
531487
OUTPUT="No deployments found"
@@ -586,7 +542,6 @@ mode_daemonsets() {
586542
done
587543
done
588544

589-
BRIEF_OUTPUT="$count_avail"
590545
if [ $EXITCODE = 0 ]; then
591546
if [ -z "$ns" ]; then
592547
OUTPUT="No daemonsets found"
@@ -648,7 +603,6 @@ mode_replicasets() {
648603
done
649604
done
650605

651-
BRIEF_OUTPUT="$count_avail"
652606
if [ $EXITCODE = 0 ]; then
653607
if [ -z "$ns" ]; then
654608
OUTPUT="No replicasets found"
@@ -701,7 +655,7 @@ mode_statefulsets() {
701655
done < <(echo "$data" | \
702656
jq -r ".items[] | select(.metadata.namespace==\"$ns\" and .metadata.name==\"$rs\") | \
703657
.status | to_entries | map(\"\(.key)=\(.value)\") | .[]")
704-
OUTPUT="Statefulset $ns/$rs ${statusArr[readyReplicas]}/${statusArr[currentReplicas]} ready"
658+
OUTPUT="${OUTPUT}Statefulset $ns/$rs ${statusArr[readyReplicas]}/${statusArr[currentReplicas]} ready\n"
705659
if [ "${statusArr[readyReplicas]}" != "${statusArr[currentReplicas]}" ]; then
706660
((count_failed++))
707661
EXITCODE=2
@@ -711,7 +665,6 @@ mode_statefulsets() {
711665
done
712666
done
713667

714-
BRIEF_OUTPUT="$count_avail"
715668
if [ $EXITCODE = 0 ]; then
716669
if [ -z "$ns" ]; then
717670
OUTPUT="No statefulsets found"
@@ -766,7 +719,7 @@ mode_jobs() {
766719
job_fail_count=$(echo "$data" | jq -r ".items[] | select(.status.failed and .metadata.name==\"$job\") | .status.failed")
767720
total_failed_count="$((total_failed_count+job_fail_count))"
768721
if [ "$job_fail_count" -ge "${WARN}" ]; then
769-
OUTPUT="${OUTPUT}Job $job has $job_fail_count failures. "
722+
OUTPUT="${OUTPUT}Job $job has $job_fail_count failures\n"
770723
EXITCODE=1
771724
elif [ "$job_fail_count" -ge "${CRIT}" ]; then
772725
EXITCODE=2
@@ -783,7 +736,7 @@ mode_jobs() {
783736
if [ -z "$ns" ]; then
784737
OUTPUT="No jobs found"
785738
else
786-
OUTPUT="OK. $total_jobs checked. ${total_failed_count} failed jobs is below threshold"
739+
OUTPUT="OK. $total_jobs checked. ${total_failed_count} failed jobs is below threshold\n"
787740
fi
788741
else
789742
if [ "$EXITCODE" -eq 1 ] ; then
@@ -792,14 +745,13 @@ mode_jobs() {
792745
OUTPUT="CRITICAL. ${OUTPUT}"
793746
fi
794747
if [ -z "$NAME" ] && [ "$EXITCODE" -ge 1 ] ; then
795-
OUTPUT="${OUTPUT}${total_failed_count} jobs in total have failed"
748+
OUTPUT="${OUTPUT}${total_failed_count} jobs have failed"
796749
fi
797750
fi
798751
}
799752

800753
case "$MODE" in
801754
(apiserver) mode_apiserver ;;
802-
(components) mode_components ;;
803755
(daemonsets) mode_daemonsets ;;
804756
(deployments) mode_deployments ;;
805757
(nodes) mode_nodes ;;
@@ -813,16 +765,6 @@ case "$MODE" in
813765
(*) usage ;;
814766
esac
815767

816-
if [ "$BRIEF" = 1 ]; then
817-
if [ "$EXITCODE" = 0 ]; then
818-
echo "${BRIEF_OUTPUT:-1}"
819-
elif [ -z "$BRIEF_FAIL_OUTPUT" ]; then
820-
echo "${BRIEF_OUTPUT:-0}"
821-
else
822-
echo "${BRIEF_FAIL_OUTPUT}"
823-
fi
824-
else
825-
echo "$OUTPUT"
826-
fi
768+
printf "$OUTPUT"
827769

828770
exit $EXITCODE

0 commit comments

Comments
 (0)