Skip to content

Commit d4f05fd

Browse files
committed
Introduce a metric to track kubelet admission failure.
1 parent aafcf4e commit d4f05fd

File tree

8 files changed

+343
-23
lines changed

8 files changed

+343
-23
lines changed

pkg/kubelet/kubelet.go

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
sysruntime "runtime"
3030
"slices"
3131
"sort"
32+
"strings"
3233
"sync"
3334
"sync/atomic"
3435
"time"
@@ -81,6 +82,7 @@ import (
8182
"k8s.io/kubernetes/pkg/kubelet/cloudresource"
8283
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
8384
"k8s.io/kubernetes/pkg/kubelet/cm"
85+
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
8486
"k8s.io/kubernetes/pkg/kubelet/config"
8587
"k8s.io/kubernetes/pkg/kubelet/configmap"
8688
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -120,6 +122,7 @@ import (
120122
"k8s.io/kubernetes/pkg/kubelet/volumemanager"
121123
"k8s.io/kubernetes/pkg/kubelet/watchdog"
122124
httpprobe "k8s.io/kubernetes/pkg/probe/http"
125+
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
123126
"k8s.io/kubernetes/pkg/security/apparmor"
124127
"k8s.io/kubernetes/pkg/util/oom"
125128
"k8s.io/kubernetes/pkg/volume"
@@ -220,6 +223,26 @@ var (
220223
// ContainerLogsDir can be overwritten for testing usage
221224
ContainerLogsDir = DefaultContainerLogsDir
222225
etcHostsPath = getContainerEtcHostsPath()
226+
227+
admissionRejectionReasons = sets.New[string](
228+
lifecycle.AppArmorNotAdmittedReason,
229+
lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
230+
lifecycle.PodOSNotSupported,
231+
lifecycle.InvalidNodeInfo,
232+
lifecycle.InitContainerRestartPolicyForbidden,
233+
lifecycle.UnexpectedAdmissionError,
234+
lifecycle.UnknownReason,
235+
lifecycle.UnexpectedPredicateFailureType,
236+
lifecycle.OutOfCPU,
237+
lifecycle.OutOfMemory,
238+
lifecycle.OutOfEphemeralStorage,
239+
lifecycle.OutOfPods,
240+
tainttoleration.ErrReasonNotMatch,
241+
eviction.Reason,
242+
sysctl.ForbiddenReason,
243+
topologymanager.ErrorTopologyAffinity,
244+
nodeshutdown.NodeShutdownNotAdmittedReason,
245+
)
223246
)
224247

225248
func getContainerEtcHostsPath() string {
@@ -2304,7 +2327,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
23042327
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
23052328
for _, podAdmitHandler := range kl.admitHandlers {
23062329
if result := podAdmitHandler.Admit(attrs); !result.Admit {
2307-
23082330
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
23092331

23102332
return false, result.Reason, result.Message
@@ -2314,6 +2336,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
23142336
return true, "", ""
23152337
}
23162338

2339+
func recordAdmissionRejection(reason string) {
2340+
// It is possible that the "reason" label can have high cardinality.
2341+
// To avoid this metric from exploding, we create an allowlist of known
2342+
// reasons, and only record reasons from this list. Use "Other" reason
2343+
// for the rest.
2344+
if admissionRejectionReasons.Has(reason) {
2345+
metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc()
2346+
} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) {
2347+
// non-extended resources (like cpu, memory, ephemeral-storage, pods)
2348+
// are already included in admissionRejectionReasons.
2349+
metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc()
2350+
} else {
2351+
metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc()
2352+
}
2353+
}
2354+
23172355
// syncLoop is the main loop for processing changes. It watches for changes from
23182356
// three channels (file, apiserver, and http) and creates a union of them. For
23192357
// any new change seen, will run a sync against desired state and running state. If
@@ -2584,6 +2622,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
25842622
// Check if we can admit the pod; if not, reject it.
25852623
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
25862624
kl.rejectPod(pod, reason, message)
2625+
// We avoid recording the metric in canAdmitPod because it's called
2626+
// repeatedly during a resize, which would inflate the metric.
2627+
// Instead, we record the metric here in HandlePodAdditions for new pods
2628+
// and capture resize events separately.
2629+
recordAdmissionRejection(reason)
25872630
continue
25882631
}
25892632
// For new pod, checkpoint the resource values at which the Pod has been admitted
@@ -2595,6 +2638,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
25952638
// Check if we can admit the pod; if not, reject it.
25962639
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
25972640
kl.rejectPod(pod, reason, message)
2641+
// We avoid recording the metric in canAdmitPod because it's called
2642+
// repeatedly during a resize, which would inflate the metric.
2643+
// Instead, we record the metric here in HandlePodAdditions for new pods
2644+
// and capture resize events separately.
2645+
recordAdmissionRejection(reason)
25982646
continue
25992647
}
26002648
}

pkg/kubelet/kubelet_test.go

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"k8s.io/client-go/tools/record"
5656
"k8s.io/client-go/util/flowcontrol"
5757
featuregatetesting "k8s.io/component-base/featuregate/testing"
58+
"k8s.io/component-base/metrics/testutil"
5859
internalapi "k8s.io/cri-api/pkg/apis"
5960
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
6061
remote "k8s.io/cri-client/pkg"
@@ -66,6 +67,7 @@ import (
6667
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
6768
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
6869
"k8s.io/kubernetes/pkg/kubelet/cm"
70+
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
6971
"k8s.io/kubernetes/pkg/kubelet/config"
7072
"k8s.io/kubernetes/pkg/kubelet/configmap"
7173
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -75,6 +77,7 @@ import (
7577
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
7678
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
7779
"k8s.io/kubernetes/pkg/kubelet/logs"
80+
"k8s.io/kubernetes/pkg/kubelet/metrics"
7881
"k8s.io/kubernetes/pkg/kubelet/network/dns"
7982
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
8083
"k8s.io/kubernetes/pkg/kubelet/pleg"
@@ -90,12 +93,14 @@ import (
9093
"k8s.io/kubernetes/pkg/kubelet/status"
9194
"k8s.io/kubernetes/pkg/kubelet/status/state"
9295
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
96+
"k8s.io/kubernetes/pkg/kubelet/sysctl"
9397
"k8s.io/kubernetes/pkg/kubelet/token"
9498
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
9599
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
96100
"k8s.io/kubernetes/pkg/kubelet/util/queue"
97101
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
98102
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
103+
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
99104
"k8s.io/kubernetes/pkg/util/oom"
100105
"k8s.io/kubernetes/pkg/volume"
101106
_ "k8s.io/kubernetes/pkg/volume/hostpath"
@@ -3460,3 +3465,200 @@ func TestIsPodResizeInProgress(t *testing.T) {
34603465
})
34613466
}
34623467
}
3468+
3469+
func TestRecordAdmissionRejection(t *testing.T) {
3470+
metrics.Register()
3471+
3472+
testCases := []struct {
3473+
name string
3474+
reason string
3475+
wants string
3476+
}{
3477+
{
3478+
name: "AppArmor",
3479+
reason: lifecycle.AppArmorNotAdmittedReason,
3480+
wants: `
3481+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3482+
# TYPE kubelet_admission_rejections_total counter
3483+
kubelet_admission_rejections_total{reason="AppArmor"} 1
3484+
`,
3485+
},
3486+
{
3487+
name: "PodOSSelectorNodeLabelDoesNotMatch",
3488+
reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
3489+
wants: `
3490+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3491+
# TYPE kubelet_admission_rejections_total counter
3492+
kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1
3493+
`,
3494+
},
3495+
{
3496+
name: "PodOSNotSupported",
3497+
reason: lifecycle.PodOSNotSupported,
3498+
wants: `
3499+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3500+
# TYPE kubelet_admission_rejections_total counter
3501+
kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1
3502+
`,
3503+
},
3504+
{
3505+
name: "InvalidNodeInfo",
3506+
reason: lifecycle.InvalidNodeInfo,
3507+
wants: `
3508+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3509+
# TYPE kubelet_admission_rejections_total counter
3510+
kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1
3511+
`,
3512+
},
3513+
{
3514+
name: "InitContainerRestartPolicyForbidden",
3515+
reason: lifecycle.InitContainerRestartPolicyForbidden,
3516+
wants: `
3517+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3518+
# TYPE kubelet_admission_rejections_total counter
3519+
kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1
3520+
`,
3521+
},
3522+
{
3523+
name: "UnexpectedAdmissionError",
3524+
reason: lifecycle.UnexpectedAdmissionError,
3525+
wants: `
3526+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3527+
# TYPE kubelet_admission_rejections_total counter
3528+
kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1
3529+
`,
3530+
},
3531+
{
3532+
name: "UnknownReason",
3533+
reason: lifecycle.UnknownReason,
3534+
wants: `
3535+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3536+
# TYPE kubelet_admission_rejections_total counter
3537+
kubelet_admission_rejections_total{reason="UnknownReason"} 1
3538+
`,
3539+
},
3540+
{
3541+
name: "UnexpectedPredicateFailureType",
3542+
reason: lifecycle.UnexpectedPredicateFailureType,
3543+
wants: `
3544+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3545+
# TYPE kubelet_admission_rejections_total counter
3546+
kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1
3547+
`,
3548+
},
3549+
{
3550+
name: "node(s) had taints that the pod didn't tolerate",
3551+
reason: tainttoleration.ErrReasonNotMatch,
3552+
wants: `
3553+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3554+
# TYPE kubelet_admission_rejections_total counter
3555+
kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1
3556+
`,
3557+
},
3558+
{
3559+
name: "Evicted",
3560+
reason: eviction.Reason,
3561+
wants: `
3562+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3563+
# TYPE kubelet_admission_rejections_total counter
3564+
kubelet_admission_rejections_total{reason="Evicted"} 1
3565+
`,
3566+
},
3567+
{
3568+
name: "SysctlForbidden",
3569+
reason: sysctl.ForbiddenReason,
3570+
wants: `
3571+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3572+
# TYPE kubelet_admission_rejections_total counter
3573+
kubelet_admission_rejections_total{reason="SysctlForbidden"} 1
3574+
`,
3575+
},
3576+
{
3577+
name: "TopologyAffinityError",
3578+
reason: topologymanager.ErrorTopologyAffinity,
3579+
wants: `
3580+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3581+
# TYPE kubelet_admission_rejections_total counter
3582+
kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1
3583+
`,
3584+
},
3585+
{
3586+
name: "NodeShutdown",
3587+
reason: nodeshutdown.NodeShutdownNotAdmittedReason,
3588+
wants: `
3589+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3590+
# TYPE kubelet_admission_rejections_total counter
3591+
kubelet_admission_rejections_total{reason="NodeShutdown"} 1
3592+
`,
3593+
},
3594+
{
3595+
name: "OutOfcpu",
3596+
reason: "OutOfcpu",
3597+
wants: `
3598+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3599+
# TYPE kubelet_admission_rejections_total counter
3600+
kubelet_admission_rejections_total{reason="OutOfcpu"} 1
3601+
`,
3602+
},
3603+
{
3604+
name: "OutOfmemory",
3605+
reason: "OutOfmemory",
3606+
wants: `
3607+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3608+
# TYPE kubelet_admission_rejections_total counter
3609+
kubelet_admission_rejections_total{reason="OutOfmemory"} 1
3610+
`,
3611+
},
3612+
{
3613+
name: "OutOfephemeral-storage",
3614+
reason: "OutOfephemeral-storage",
3615+
wants: `
3616+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3617+
# TYPE kubelet_admission_rejections_total counter
3618+
kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1
3619+
`,
3620+
},
3621+
{
3622+
name: "OutOfpods",
3623+
reason: "OutOfpods",
3624+
wants: `
3625+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3626+
# TYPE kubelet_admission_rejections_total counter
3627+
kubelet_admission_rejections_total{reason="OutOfpods"} 1
3628+
`,
3629+
},
3630+
{
3631+
name: "OutOfgpu",
3632+
reason: "OutOfgpu",
3633+
wants: `
3634+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3635+
# TYPE kubelet_admission_rejections_total counter
3636+
kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1
3637+
`,
3638+
},
3639+
{
3640+
name: "OtherReason",
3641+
reason: "OtherReason",
3642+
wants: `
3643+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3644+
# TYPE kubelet_admission_rejections_total counter
3645+
kubelet_admission_rejections_total{reason="Other"} 1
3646+
`,
3647+
},
3648+
}
3649+
3650+
// Run tests.
3651+
for _, tc := range testCases {
3652+
t.Run(tc.name, func(t *testing.T) {
3653+
// Clear the metrics after the test.
3654+
metrics.AdmissionRejectionsTotal.Reset()
3655+
3656+
// Call the function.
3657+
recordAdmissionRejection(tc.reason)
3658+
3659+
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil {
3660+
t.Error(err)
3661+
}
3662+
})
3663+
}
3664+
}

pkg/kubelet/lifecycle/handlers.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ import (
4444

4545
const (
4646
maxRespBodyLength = 10 * 1 << 10 // 10KB
47+
48+
// AppArmorNotAdmittedReason is the Reason set on the PodAdmitResult returned
// by the AppArmor admit handler when it refuses to admit a pod.
AppArmorNotAdmittedReason = "AppArmor"
4749
)
4850

4951
type handlerRunner struct {
@@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {
224226
}
225227
return PodAdmitResult{
226228
Admit: false,
227-
Reason: "AppArmor",
229+
Reason: AppArmorNotAdmittedReason,
228230
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
229231
}
230232
}

0 commit comments

Comments
 (0)