Commit 04129d1

node: metrics for alignment failures

Add metrics to report alignment allocation failures.

See: kubernetes/enhancements#5108
Signed-off-by: Francesco Romani <[email protected]>

1 parent d66928b

7 files changed: +75 −7 lines

pkg/kubelet/cm/cpumanager/policy_static.go (+6 −3)

```diff
@@ -325,13 +325,15 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 	defer func() {
 		if rerr != nil {
 			metrics.CPUManagerPinningErrorsTotal.Inc()
+			if p.options.FullPhysicalCPUsOnly {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
+			}
 			return
 		}
-		if !p.options.FullPhysicalCPUsOnly {
+		if p.options.FullPhysicalCPUsOnly {
 			// increment only if we know we allocate aligned resources
-			return
+			metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 		}
-		metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
 	}()
 
 	if p.options.FullPhysicalCPUsOnly {
@@ -752,6 +754,7 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC
 func (p *staticPolicy) initializeMetrics(s state.State) {
 	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
 	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Add(0) // ensure the value exists
 }
 
 func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
```

pkg/kubelet/cm/topologymanager/scope_container.go (+3 −0)

```diff
@@ -50,6 +50,9 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 		klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod), "containerName", container.Name)
 
 		if !admit {
+			if IsAlignmentGuaranteed(s.policy) {
+				metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
+			}
 			metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 			return admission.GetPodAdmitResult(&TopologyAffinityError{})
 		}
```
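The IsAlignmentGuaranteed guard keeps the failure counter honest: a rejection only counts as an alignment failure when the configured policy actually promises NUMA alignment. The pod scope below applies the same guard. In the topologymanager package the helper amounts to a policy-name check, roughly as follows (a paraphrased sketch, not verbatim from this commit; Policy and PolicySingleNumaNode are the package's existing types):

```go
// IsAlignmentGuaranteed reports whether the active topology manager
// policy guarantees NUMA-aligned allocations. At present only the
// single-numa-node policy makes that guarantee.
func IsAlignmentGuaranteed(p Policy) bool {
	return p.Name() == PolicySingleNumaNode
}
```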

pkg/kubelet/cm/topologymanager/scope_pod.go (+4 −0)

```diff
@@ -48,6 +48,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
 	bestHint, admit := s.calculateAffinity(pod)
 	klog.InfoS("Best TopologyHint", "bestHint", bestHint, "pod", klog.KObj(pod))
 	if !admit {
+		if IsAlignmentGuaranteed(s.policy) {
+			// increment only if we know we allocate aligned resources.
+			metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
+		}
 		metrics.TopologyManagerAdmissionErrorsTotal.Inc()
 		return admission.GetPodAdmitResult(&TopologyAffinityError{})
 	}
```

pkg/kubelet/cm/topologymanager/topology_manager.go (+10 −0)

```diff
@@ -188,9 +188,19 @@ func NewManager(topology []cadvisorapi.Node, topologyPolicyName string, topology
 		scope: scope,
 	}
 
+	manager.initializeMetrics()
+
 	return manager, nil
 }
 
+func (m *manager) initializeMetrics() {
+	// ensure the values exist
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Add(0)
+	metrics.ContainerAlignedComputeResourcesFailure.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Add(0)
+}
+
 func (m *manager) GetAffinity(podUID string, containerName string) TopologyHint {
 	return m.scope.GetAffinity(podUID, containerName)
 }
```
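The Add(0) calls are not cosmetic: children of a labeled counter vector are created lazily, so a (scope, boundary) series is absent from /metrics until it is first touched. After a kubelet restart the series would be missing rather than zero, which breaks rate() queries and the "assert zero right after restart" e2e checks below. A small standalone demonstration of the lazy-creation behavior, using client_golang directly (the kubelet goes through the k8s.io/component-base/metrics wrappers, but the underlying semantics are the same):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{Name: "aligned_failures_total", Help: "demo counter"},
		[]string{"scope", "boundary"},
	)

	// Children of a CounterVec are created lazily: no series exist yet.
	fmt.Println(testutil.CollectAndCount(failures)) // prints: 0

	// Seeding with Add(0) materializes the series at value 0, so a
	// scrape taken right after startup already reports it.
	failures.WithLabelValues("pod", "numa_node").Add(0)
	fmt.Println(testutil.CollectAndCount(failures)) // prints: 1
}
```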

pkg/kubelet/metrics/metrics.go (+14 −1)

```diff
@@ -132,6 +132,7 @@ const (
 
 	// Metric for tracking aligment of compute resources
 	ContainerAlignedComputeResourcesNameKey          = "container_aligned_compute_resources_count"
+	ContainerAlignedComputeResourcesFailureNameKey   = "container_aligned_compute_resources_failure_count"
 	ContainerAlignedComputeResourcesScopeLabelKey    = "scope"
 	ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
 
@@ -818,7 +819,18 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)
-	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
+
+	// ContainerAlignedComputeResourcesFailure reports the count of resources allocation attempts which failed to align resources, per alignment boundary
+	ContainerAlignedComputeResourcesFailure = metrics.NewCounterVec(
+		&metrics.CounterOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           ContainerAlignedComputeResourcesFailureNameKey,
+			Help:           "Cumulative number of failures to allocate aligned compute resources to containers by alignment type.",
+			StabilityLevel: metrics.ALPHA,
+		},
+		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
+	)
+
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{
 			Subsystem: KubeletSubsystem,
@@ -1079,6 +1091,7 @@ func Register(collectors ...metrics.StableCollector) {
 	legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
 	legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 	legacyregistry.MustRegister(ContainerAlignedComputeResources)
+	legacyregistry.MustRegister(ContainerAlignedComputeResourcesFailure)
 	legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
 	legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)
 	legacyregistry.MustRegister(TopologyManagerAdmissionRequestsTotal)
```
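The exposed metric name is the kubelet subsystem plus the name key, i.e. kubelet_container_aligned_compute_resources_failure_count, with one series per (scope, boundary) label pair. A self-contained replica of the definition using client_golang (metric and label names are copied from the diff; the registry plumbing is illustrative, not the kubelet's own):

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	reg := prometheus.NewRegistry()
	failures := prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Subsystem: "kubelet",
			Name:      "container_aligned_compute_resources_failure_count",
			Help:      "Demo replica of the kubelet counter.",
		},
		[]string{"scope", "boundary"},
	)
	reg.MustRegister(failures)

	// One failed physical-CPU alignment at container scope.
	failures.WithLabelValues("container", "physical_cpu").Inc()

	// Gathering shows the fully composed series, equivalent to:
	// kubelet_container_aligned_compute_resources_failure_count{boundary="physical_cpu",scope="container"} 1
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			fmt.Println(mf.GetName(), m.GetLabel(), m.GetCounter().GetValue())
		}
	}
}
```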

test/e2e_node/cpu_manager_metrics_test.go (+15 −0)

```diff
@@ -104,13 +104,17 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with no pods running")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -127,13 +131,17 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod failed to admit")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(1),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -150,13 +158,17 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_cpu_manager_pinning_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
 			"kubelet_cpu_manager_pinning_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
@@ -178,6 +190,9 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 			"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
 				"container::physical_cpu": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::physical_cpu": timelessSample(0),
+			}),
 		})
 
 		ginkgo.By("Giving the Kubelet time to update the alignment metrics")
```

test/e2e_node/topology_manager_metrics_test.go (+23 −3)

```diff
@@ -28,7 +28,6 @@ import (
 	v1 "k8s.io/api/core/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
 	"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
-	"k8s.io/kubernetes/pkg/kubelet/metrics"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -84,13 +83,18 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with no pods running")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node":       timelessSample(0),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": timelessSample(0),
 			}),
@@ -110,13 +114,18 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod failed to admit")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node":       timelessSample(1),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": checkMetricValueGreaterThan(0),
 			}),
@@ -136,13 +145,18 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the topologymanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
 			"kubelet_topology_manager_admission_requests_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(1),
 			}),
 			"kubelet_topology_manager_admission_errors_total": gstruct.MatchAllElements(nodeID, gstruct.Elements{
 				"": timelessSample(0),
 			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node":       timelessSample(0),
+			}),
 			"kubelet_topology_manager_admission_duration_ms_count": gstruct.MatchElements(nodeID, gstruct.IgnoreExtras, gstruct.Elements{
 				"": checkMetricValueGreaterThan(0),
 			}),
@@ -162,9 +176,15 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
 		// being [Serial], we can also assume noone else but us is running pods.
 		ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
 
+		idFn := makeCustomPairID("scope", "boundary")
 		matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
-			"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
-				metrics.AlignedNUMANode: timelessSample(1),
+			"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(idFn, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node":       timelessSample(1),
+			}),
+			"kubelet_container_aligned_compute_resources_failure_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
+				"container::numa_node": timelessSample(0),
+				"pod::numa_node":       timelessSample(0),
 			}),
 		})
 
```