Skip to content

Commit 0994499

Browse files
authored
Merge pull request kubernetes#128556 from AnishShah/kubelet-reject-metric
Introduce a metric to track kubelet admission failure.
2 parents 7a1f8aa + d4f05fd commit 0994499

File tree

8 files changed

+343
-23
lines changed

8 files changed

+343
-23
lines changed

pkg/kubelet/kubelet.go

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
sysruntime "runtime"
3030
"slices"
3131
"sort"
32+
"strings"
3233
"sync"
3334
"sync/atomic"
3435
"time"
@@ -81,6 +82,7 @@ import (
8182
"k8s.io/kubernetes/pkg/kubelet/cloudresource"
8283
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
8384
"k8s.io/kubernetes/pkg/kubelet/cm"
85+
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
8486
"k8s.io/kubernetes/pkg/kubelet/config"
8587
"k8s.io/kubernetes/pkg/kubelet/configmap"
8688
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -120,6 +122,7 @@ import (
120122
"k8s.io/kubernetes/pkg/kubelet/volumemanager"
121123
"k8s.io/kubernetes/pkg/kubelet/watchdog"
122124
httpprobe "k8s.io/kubernetes/pkg/probe/http"
125+
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
123126
"k8s.io/kubernetes/pkg/security/apparmor"
124127
"k8s.io/kubernetes/pkg/util/oom"
125128
"k8s.io/kubernetes/pkg/volume"
@@ -220,6 +223,26 @@ var (
220223
// ContainerLogsDir can be overwritten for testing usage
221224
ContainerLogsDir = DefaultContainerLogsDir
222225
etcHostsPath = getContainerEtcHostsPath()
226+
227+
admissionRejectionReasons = sets.New[string](
228+
lifecycle.AppArmorNotAdmittedReason,
229+
lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
230+
lifecycle.PodOSNotSupported,
231+
lifecycle.InvalidNodeInfo,
232+
lifecycle.InitContainerRestartPolicyForbidden,
233+
lifecycle.UnexpectedAdmissionError,
234+
lifecycle.UnknownReason,
235+
lifecycle.UnexpectedPredicateFailureType,
236+
lifecycle.OutOfCPU,
237+
lifecycle.OutOfMemory,
238+
lifecycle.OutOfEphemeralStorage,
239+
lifecycle.OutOfPods,
240+
tainttoleration.ErrReasonNotMatch,
241+
eviction.Reason,
242+
sysctl.ForbiddenReason,
243+
topologymanager.ErrorTopologyAffinity,
244+
nodeshutdown.NodeShutdownNotAdmittedReason,
245+
)
223246
)
224247

225248
func getContainerEtcHostsPath() string {
@@ -2310,7 +2333,6 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
23102333
attrs := &lifecycle.PodAdmitAttributes{Pod: pod, OtherPods: allocatedPods}
23112334
for _, podAdmitHandler := range kl.admitHandlers {
23122335
if result := podAdmitHandler.Admit(attrs); !result.Admit {
2313-
23142336
klog.InfoS("Pod admission denied", "podUID", attrs.Pod.UID, "pod", klog.KObj(attrs.Pod), "reason", result.Reason, "message", result.Message)
23152337

23162338
return false, result.Reason, result.Message
@@ -2320,6 +2342,22 @@ func (kl *Kubelet) canAdmitPod(allocatedPods []*v1.Pod, pod *v1.Pod) (bool, stri
23202342
return true, "", ""
23212343
}
23222344

2345+
func recordAdmissionRejection(reason string) {
2346+
// It is possible that the "reason" label can have high cardinality.
2347+
// To avoid this metric from exploding, we create an allowlist of known
2348+
// reasons, and only record reasons from this list. Use "Other" reason
2349+
// for the rest.
2350+
if admissionRejectionReasons.Has(reason) {
2351+
metrics.AdmissionRejectionsTotal.WithLabelValues(reason).Inc()
2352+
} else if strings.HasPrefix(reason, lifecycle.InsufficientResourcePrefix) {
2353+
// non-extended resources (like cpu, memory, ephemeral-storage, pods)
2354+
// are already included in admissionRejectionReasons.
2355+
metrics.AdmissionRejectionsTotal.WithLabelValues("OutOfExtendedResources").Inc()
2356+
} else {
2357+
metrics.AdmissionRejectionsTotal.WithLabelValues("Other").Inc()
2358+
}
2359+
}
2360+
23232361
// syncLoop is the main loop for processing changes. It watches for changes from
23242362
// three channels (file, apiserver, and http) and creates a union of them. For
23252363
// any new change seen, will run a sync against desired state and running state. If
@@ -2590,6 +2628,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
25902628
// Check if we can admit the pod; if not, reject it.
25912629
if ok, reason, message := kl.canAdmitPod(allocatedPods, allocatedPod); !ok {
25922630
kl.rejectPod(pod, reason, message)
2631+
// We avoid recording the metric in canAdmitPod because it's called
2632+
// repeatedly during a resize, which would inflate the metric.
2633+
// Instead, we record the metric here in HandlePodAdditions for new pods
2634+
// and capture resize events separately.
2635+
recordAdmissionRejection(reason)
25932636
continue
25942637
}
25952638
// For new pod, checkpoint the resource values at which the Pod has been admitted
@@ -2601,6 +2644,11 @@ func (kl *Kubelet) HandlePodAdditions(pods []*v1.Pod) {
26012644
// Check if we can admit the pod; if not, reject it.
26022645
if ok, reason, message := kl.canAdmitPod(allocatedPods, pod); !ok {
26032646
kl.rejectPod(pod, reason, message)
2647+
// We avoid recording the metric in canAdmitPod because it's called
2648+
// repeatedly during a resize, which would inflate the metric.
2649+
// Instead, we record the metric here in HandlePodAdditions for new pods
2650+
// and capture resize events separately.
2651+
recordAdmissionRejection(reason)
26042652
continue
26052653
}
26062654
}

pkg/kubelet/kubelet_test.go

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ import (
5555
"k8s.io/client-go/tools/record"
5656
"k8s.io/client-go/util/flowcontrol"
5757
featuregatetesting "k8s.io/component-base/featuregate/testing"
58+
"k8s.io/component-base/metrics/testutil"
5859
internalapi "k8s.io/cri-api/pkg/apis"
5960
runtimeapi "k8s.io/cri-api/pkg/apis/runtime/v1"
6061
remote "k8s.io/cri-client/pkg"
@@ -65,6 +66,7 @@ import (
6566
cadvisortest "k8s.io/kubernetes/pkg/kubelet/cadvisor/testing"
6667
"k8s.io/kubernetes/pkg/kubelet/clustertrustbundle"
6768
"k8s.io/kubernetes/pkg/kubelet/cm"
69+
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
6870
"k8s.io/kubernetes/pkg/kubelet/config"
6971
"k8s.io/kubernetes/pkg/kubelet/configmap"
7072
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
@@ -74,6 +76,7 @@ import (
7476
"k8s.io/kubernetes/pkg/kubelet/kuberuntime"
7577
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
7678
"k8s.io/kubernetes/pkg/kubelet/logs"
79+
"k8s.io/kubernetes/pkg/kubelet/metrics"
7780
"k8s.io/kubernetes/pkg/kubelet/network/dns"
7881
"k8s.io/kubernetes/pkg/kubelet/nodeshutdown"
7982
"k8s.io/kubernetes/pkg/kubelet/pleg"
@@ -89,12 +92,14 @@ import (
8992
"k8s.io/kubernetes/pkg/kubelet/status"
9093
"k8s.io/kubernetes/pkg/kubelet/status/state"
9194
statustest "k8s.io/kubernetes/pkg/kubelet/status/testing"
95+
"k8s.io/kubernetes/pkg/kubelet/sysctl"
9296
"k8s.io/kubernetes/pkg/kubelet/token"
9397
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
9498
kubeletutil "k8s.io/kubernetes/pkg/kubelet/util"
9599
"k8s.io/kubernetes/pkg/kubelet/util/queue"
96100
kubeletvolume "k8s.io/kubernetes/pkg/kubelet/volumemanager"
97101
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
102+
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
98103
"k8s.io/kubernetes/pkg/util/oom"
99104
"k8s.io/kubernetes/pkg/volume"
100105
_ "k8s.io/kubernetes/pkg/volume/hostpath"
@@ -3461,3 +3466,200 @@ func TestIsPodResizeInProgress(t *testing.T) {
34613466
})
34623467
}
34633468
}
3469+
3470+
func TestRecordAdmissionRejection(t *testing.T) {
3471+
metrics.Register()
3472+
3473+
testCases := []struct {
3474+
name string
3475+
reason string
3476+
wants string
3477+
}{
3478+
{
3479+
name: "AppArmor",
3480+
reason: lifecycle.AppArmorNotAdmittedReason,
3481+
wants: `
3482+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3483+
# TYPE kubelet_admission_rejections_total counter
3484+
kubelet_admission_rejections_total{reason="AppArmor"} 1
3485+
`,
3486+
},
3487+
{
3488+
name: "PodOSSelectorNodeLabelDoesNotMatch",
3489+
reason: lifecycle.PodOSSelectorNodeLabelDoesNotMatch,
3490+
wants: `
3491+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3492+
# TYPE kubelet_admission_rejections_total counter
3493+
kubelet_admission_rejections_total{reason="PodOSSelectorNodeLabelDoesNotMatch"} 1
3494+
`,
3495+
},
3496+
{
3497+
name: "PodOSNotSupported",
3498+
reason: lifecycle.PodOSNotSupported,
3499+
wants: `
3500+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3501+
# TYPE kubelet_admission_rejections_total counter
3502+
kubelet_admission_rejections_total{reason="PodOSNotSupported"} 1
3503+
`,
3504+
},
3505+
{
3506+
name: "InvalidNodeInfo",
3507+
reason: lifecycle.InvalidNodeInfo,
3508+
wants: `
3509+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3510+
# TYPE kubelet_admission_rejections_total counter
3511+
kubelet_admission_rejections_total{reason="InvalidNodeInfo"} 1
3512+
`,
3513+
},
3514+
{
3515+
name: "InitContainerRestartPolicyForbidden",
3516+
reason: lifecycle.InitContainerRestartPolicyForbidden,
3517+
wants: `
3518+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3519+
# TYPE kubelet_admission_rejections_total counter
3520+
kubelet_admission_rejections_total{reason="InitContainerRestartPolicyForbidden"} 1
3521+
`,
3522+
},
3523+
{
3524+
name: "UnexpectedAdmissionError",
3525+
reason: lifecycle.UnexpectedAdmissionError,
3526+
wants: `
3527+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3528+
# TYPE kubelet_admission_rejections_total counter
3529+
kubelet_admission_rejections_total{reason="UnexpectedAdmissionError"} 1
3530+
`,
3531+
},
3532+
{
3533+
name: "UnknownReason",
3534+
reason: lifecycle.UnknownReason,
3535+
wants: `
3536+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3537+
# TYPE kubelet_admission_rejections_total counter
3538+
kubelet_admission_rejections_total{reason="UnknownReason"} 1
3539+
`,
3540+
},
3541+
{
3542+
name: "UnexpectedPredicateFailureType",
3543+
reason: lifecycle.UnexpectedPredicateFailureType,
3544+
wants: `
3545+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3546+
# TYPE kubelet_admission_rejections_total counter
3547+
kubelet_admission_rejections_total{reason="UnexpectedPredicateFailureType"} 1
3548+
`,
3549+
},
3550+
{
3551+
name: "node(s) had taints that the pod didn't tolerate",
3552+
reason: tainttoleration.ErrReasonNotMatch,
3553+
wants: `
3554+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3555+
# TYPE kubelet_admission_rejections_total counter
3556+
kubelet_admission_rejections_total{reason="node(s) had taints that the pod didn't tolerate"} 1
3557+
`,
3558+
},
3559+
{
3560+
name: "Evicted",
3561+
reason: eviction.Reason,
3562+
wants: `
3563+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3564+
# TYPE kubelet_admission_rejections_total counter
3565+
kubelet_admission_rejections_total{reason="Evicted"} 1
3566+
`,
3567+
},
3568+
{
3569+
name: "SysctlForbidden",
3570+
reason: sysctl.ForbiddenReason,
3571+
wants: `
3572+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3573+
# TYPE kubelet_admission_rejections_total counter
3574+
kubelet_admission_rejections_total{reason="SysctlForbidden"} 1
3575+
`,
3576+
},
3577+
{
3578+
name: "TopologyAffinityError",
3579+
reason: topologymanager.ErrorTopologyAffinity,
3580+
wants: `
3581+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3582+
# TYPE kubelet_admission_rejections_total counter
3583+
kubelet_admission_rejections_total{reason="TopologyAffinityError"} 1
3584+
`,
3585+
},
3586+
{
3587+
name: "NodeShutdown",
3588+
reason: nodeshutdown.NodeShutdownNotAdmittedReason,
3589+
wants: `
3590+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3591+
# TYPE kubelet_admission_rejections_total counter
3592+
kubelet_admission_rejections_total{reason="NodeShutdown"} 1
3593+
`,
3594+
},
3595+
{
3596+
name: "OutOfcpu",
3597+
reason: "OutOfcpu",
3598+
wants: `
3599+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3600+
# TYPE kubelet_admission_rejections_total counter
3601+
kubelet_admission_rejections_total{reason="OutOfcpu"} 1
3602+
`,
3603+
},
3604+
{
3605+
name: "OutOfmemory",
3606+
reason: "OutOfmemory",
3607+
wants: `
3608+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3609+
# TYPE kubelet_admission_rejections_total counter
3610+
kubelet_admission_rejections_total{reason="OutOfmemory"} 1
3611+
`,
3612+
},
3613+
{
3614+
name: "OutOfephemeral-storage",
3615+
reason: "OutOfephemeral-storage",
3616+
wants: `
3617+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3618+
# TYPE kubelet_admission_rejections_total counter
3619+
kubelet_admission_rejections_total{reason="OutOfephemeral-storage"} 1
3620+
`,
3621+
},
3622+
{
3623+
name: "OutOfpods",
3624+
reason: "OutOfpods",
3625+
wants: `
3626+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3627+
# TYPE kubelet_admission_rejections_total counter
3628+
kubelet_admission_rejections_total{reason="OutOfpods"} 1
3629+
`,
3630+
},
3631+
{
3632+
name: "OutOfgpu",
3633+
reason: "OutOfgpu",
3634+
wants: `
3635+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3636+
# TYPE kubelet_admission_rejections_total counter
3637+
kubelet_admission_rejections_total{reason="OutOfExtendedResources"} 1
3638+
`,
3639+
},
3640+
{
3641+
name: "OtherReason",
3642+
reason: "OtherReason",
3643+
wants: `
3644+
# HELP kubelet_admission_rejections_total [ALPHA] Cumulative number pod admission rejections by the Kubelet.
3645+
# TYPE kubelet_admission_rejections_total counter
3646+
kubelet_admission_rejections_total{reason="Other"} 1
3647+
`,
3648+
},
3649+
}
3650+
3651+
// Run tests.
3652+
for _, tc := range testCases {
3653+
t.Run(tc.name, func(t *testing.T) {
3654+
// Clear the metrics after the test.
3655+
metrics.AdmissionRejectionsTotal.Reset()
3656+
3657+
// Call the function.
3658+
recordAdmissionRejection(tc.reason)
3659+
3660+
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(tc.wants), "kubelet_admission_rejections_total"); err != nil {
3661+
t.Error(err)
3662+
}
3663+
})
3664+
}
3665+
}

pkg/kubelet/lifecycle/handlers.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ import (
4444

4545
const (
4646
maxRespBodyLength = 10 * 1 << 10 // 10KB
47+
48+
// AppArmorNotAdmittedReason is the PodAdmitResult.Reason returned by the
// AppArmor admit handler when it refuses to admit a pod.
AppArmorNotAdmittedReason = "AppArmor"
4749
)
4850

4951
type handlerRunner struct {
@@ -224,7 +226,7 @@ func (a *appArmorAdmitHandler) Admit(attrs *PodAdmitAttributes) PodAdmitResult {
224226
}
225227
return PodAdmitResult{
226228
Admit: false,
227-
Reason: "AppArmor",
229+
Reason: AppArmorNotAdmittedReason,
228230
Message: fmt.Sprintf("Cannot enforce AppArmor: %v", err),
229231
}
230232
}

0 commit comments

Comments
 (0)