Skip to content

Commit c6669ea

Browse files
authored
Merge pull request kubernetes#127155 from ffromani/alignment-metrics
node: metrics: add resource alignment metrics
2 parents 403fcab + c025861 commit c6669ea

File tree

8 files changed

+96
-2
lines changed

8 files changed

+96
-2
lines changed

pkg/kubelet/cm/cpumanager/policy_static.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,13 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
312312
defer func() {
313313
if rerr != nil {
314314
metrics.CPUManagerPinningErrorsTotal.Inc()
315+
return
316+
}
317+
if !p.options.FullPhysicalCPUsOnly {
318+
// increment only if we know we allocate aligned resources
319+
return
315320
}
321+
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedPhysicalCPU).Inc()
316322
}()
317323

318324
if p.options.FullPhysicalCPUsOnly {

pkg/kubelet/cm/topologymanager/policy.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,14 @@ type Policy interface {
3030
Merge(providersHints []map[string][]TopologyHint) (TopologyHint, bool)
3131
}
3232

33+
// IsAlignmentGuaranteed returns true if the given policy guarantees that either
34+
// the compute resources will be allocated within a NUMA boundary, or the allocation will fail altogether.
35+
func IsAlignmentGuaranteed(p Policy) bool {
36+
// We are abusing the name, but at the moment this matches the policy name almost 1:1,
37+
// so we are not adding new fields for now.
38+
return p.Name() == PolicySingleNumaNode
39+
}
40+
3341
// Merge a TopologyHints permutation to a single hint by performing a bitwise-AND
3442
// of their affinity masks. The hint shall be preferred if all hints in the permutation
3543
// are preferred.

pkg/kubelet/cm/topologymanager/scope_container.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,10 @@ func (s *containerScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
6161
metrics.TopologyManagerAdmissionErrorsTotal.Inc()
6262
return admission.GetPodAdmitResult(err)
6363
}
64+
65+
if IsAlignmentGuaranteed(s.policy) {
66+
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopeContainer, metrics.AlignedNUMANode).Inc()
67+
}
6468
}
6569
return admission.GetPodAdmitResult(nil)
6670
}

pkg/kubelet/cm/topologymanager/scope_pod.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ func (s *podScope) Admit(pod *v1.Pod) lifecycle.PodAdmitResult {
6262
return admission.GetPodAdmitResult(err)
6363
}
6464
}
65+
if IsAlignmentGuaranteed(s.policy) {
66+
// increment only if we know we allocate aligned resources.
67+
metrics.ContainerAlignedComputeResources.WithLabelValues(metrics.AlignScopePod, metrics.AlignedNUMANode).Inc()
68+
}
6569
return admission.GetPodAdmitResult(nil)
6670
}
6771

pkg/kubelet/metrics/metrics.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,21 @@ const (
127127
// Metric for tracking garbage collected images
128128
ImageGarbageCollectedTotalKey = "image_garbage_collected_total"
129129

130+
// Metric for tracking alignment of compute resources
131+
ContainerAlignedComputeResourcesNameKey = "container_aligned_compute_resources_count"
132+
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
133+
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
134+
130135
// Values used in metric labels
131136
Container = "container"
132137
InitContainer = "init_container"
133138
EphemeralContainer = "ephemeral_container"
139+
140+
AlignScopePod = "pod"
141+
AlignScopeContainer = "container"
142+
143+
AlignedPhysicalCPU = "physical_cpu"
144+
AlignedNUMANode = "numa_node"
134145
)
135146

136147
type imageSizeBucket struct {
@@ -762,6 +773,16 @@ var (
762773
},
763774
)
764775

776+
ContainerAlignedComputeResources = metrics.NewCounterVec(
777+
&metrics.CounterOpts{
778+
Subsystem: KubeletSubsystem,
779+
Name: ContainerAlignedComputeResourcesNameKey,
780+
Help: "Cumulative number of aligned compute resources allocated to containers by alignment type.",
781+
StabilityLevel: metrics.ALPHA,
782+
},
783+
[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
784+
)
785+
765786
// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
766787
MemoryManagerPinningRequestTotal = metrics.NewCounter(
767788
&metrics.CounterOpts{
@@ -985,6 +1006,7 @@ func Register(collectors ...metrics.StableCollector) {
9851006
legacyregistry.MustRegister(RunPodSandboxErrors)
9861007
legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
9871008
legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
1009+
legacyregistry.MustRegister(ContainerAlignedComputeResources)
9881010
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
9891011
legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)
9901012
legacyregistry.MustRegister(MemoryManagerPinningErrorsTotal)

test/e2e_node/cpu_manager_metrics_test.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
9191
ginkgo.AfterEach(func(ctx context.Context) {
9292
if testPod != nil {
9393
deletePodSyncByName(ctx, f, testPod.Name)
94+
waitForContainerRemoval(ctx, testPod.Spec.Containers[0].Name, testPod.Name, testPod.Namespace)
9495
}
9596
updateKubeletConfig(ctx, f, oldCfg, true)
9697
})
@@ -160,11 +161,32 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
160161
ginkgo.By("Ensuring the metrics match the expectations a few more times")
161162
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchResourceMetrics)
162163
})
164+
165+
ginkgo.It("should return updated alignment counters when pod successfully run", func(ctx context.Context) {
166+
ginkgo.By("Creating the test pod")
167+
testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-smt-ok", smtLevel))
168+
169+
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
170+
// being [Serial], we can also assume no one else but us is running pods.
171+
ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
172+
173+
idFn := makeCustomPairID("scope", "boundary")
174+
matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
175+
"kubelet_container_aligned_compute_resources_count": gstruct.MatchElements(idFn, gstruct.IgnoreExtras, gstruct.Elements{
176+
"container::physical_cpu": timelessSample(1),
177+
}),
178+
})
179+
180+
ginkgo.By("Giving the Kubelet time to update the alignment metrics")
181+
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
182+
ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
183+
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
184+
})
163185
})
164186
})
165187

166188
func getKubeletMetrics(ctx context.Context) (e2emetrics.KubeletMetrics, error) {
167-
ginkgo.By("getting Kubelet metrics from the metrics API")
189+
ginkgo.By("Getting Kubelet metrics from the metrics API")
168190
return e2emetrics.GrabKubeletMetricsWithoutProxy(ctx, nodeNameOrIP()+":10255", "/metrics")
169191
}
170192

@@ -189,7 +211,7 @@ func makeGuaranteedCPUExclusiveSleeperPod(name string, cpus int) *v1.Pod {
189211
v1.ResourceMemory: resource.MustParse("64Mi"),
190212
},
191213
},
192-
Command: []string{"sh", "-c", "sleep", "1d"},
214+
Command: []string{"sh", "-c", "sleep 1d"},
193215
},
194216
},
195217
},

test/e2e_node/resource_metrics_test.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,13 @@ func containerID(element interface{}) string {
157157
return fmt.Sprintf("%s::%s::%s", el.Metric["namespace"], el.Metric["pod"], el.Metric["container"])
158158
}
159159

160+
func makeCustomPairID(pri, sec string) func(interface{}) string {
161+
return func(element interface{}) string {
162+
el := element.(*model.Sample)
163+
return fmt.Sprintf("%s::%s", el.Metric[model.LabelName(pri)], el.Metric[model.LabelName(sec)])
164+
}
165+
}
166+
160167
func boundedSample(lower, upper interface{}) types.GomegaMatcher {
161168
return gstruct.PointTo(gstruct.MatchAllFields(gstruct.Fields{
162169
// We already check Metric when matching the Id

test/e2e_node/topology_manager_metrics_test.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
v1 "k8s.io/api/core/v1"
2929
kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
3030
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
31+
"k8s.io/kubernetes/pkg/kubelet/metrics"
3132
"k8s.io/kubernetes/test/e2e/feature"
3233
"k8s.io/kubernetes/test/e2e/framework"
3334
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -152,6 +153,26 @@ var _ = SIGDescribe("Topology Manager Metrics", framework.WithSerial(), feature.
152153
ginkgo.By("Ensuring the metrics match the expectations a few more times")
153154
gomega.Consistently(ctx, getKubeletMetrics, 2*time.Minute, 10*time.Second).Should(matchResourceMetrics)
154155
})
156+
157+
ginkgo.It("[alignment] should return updated alignment counters when pod successfully run", func(ctx context.Context) {
158+
ginkgo.By("Creating the test pod")
159+
testPod = e2epod.NewPodClient(f).Create(ctx, makeGuaranteedCPUExclusiveSleeperPod("count-align-numa-ok", cpusNumPerNUMA))
160+
161+
// we updated the kubelet config in BeforeEach, so we can assume we start fresh.
162+
// being [Serial], we can also assume no one else but us is running pods.
163+
ginkgo.By("Checking the cpumanager metrics right after the kubelet restart, with pod should be admitted")
164+
165+
matchAlignmentMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
166+
"kubelet_container_aligned_compute_resources_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
167+
metrics.AlignedNUMANode: timelessSample(1),
168+
}),
169+
})
170+
171+
ginkgo.By("Giving the Kubelet time to update the alignment metrics")
172+
gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
173+
ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
174+
gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
175+
})
155176
})
156177
})
157178

0 commit comments

Comments
 (0)