
Commit 14ec0ed

node: metrics: add metrics about cpu pool sizes
Add metrics about the sizing of the CPU pools. Currently the cpumanager maintains two CPU pools:

- shared pool: this is where all pods with non-exclusive CPU allocation run
- exclusive pool: this is the union of the sets of exclusive CPUs allocated to containers, if any (requires the static policy to be in use)

By reporting the size of these pools, users (humans or machines) can get better insight and more feedback about how resources are actually allocated to the workloads and how the node resources are used.
1 parent 212c4c4 commit 14ec0ed
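To make the relationship between the two new gauges concrete, here is a minimal, self-contained sketch, not kubelet code: every exclusively allocated CPU removes 1000 millicores from the shared pool gauge, and releasing it adds them back. The poolMetrics type and the node size below are purely illustrative assumptions.

package main

import "fmt"

// poolMetrics mirrors the bookkeeping of the two gauges added by this commit:
// kubelet_cpu_manager_shared_pool_size_millicores and
// kubelet_cpu_manager_exclusive_cpu_allocation_count.
type poolMetrics struct {
	sharedPoolMilli   int // size of the shared pool, in millicores
	exclusiveCPUCount int // number of exclusively allocated CPUs
}

// allocate moves ncpus whole CPUs out of the shared pool.
func (m *poolMetrics) allocate(ncpus int) {
	m.exclusiveCPUCount += ncpus
	m.sharedPoolMilli -= ncpus * 1000
}

// release returns ncpus whole CPUs to the shared pool.
func (m *poolMetrics) release(ncpus int) {
	m.exclusiveCPUCount -= ncpus
	m.sharedPoolMilli += ncpus * 1000
}

func main() {
	// Hypothetical node: 8 allocatable CPUs, no exclusive assignments yet.
	m := poolMetrics{sharedPoolMilli: 8000, exclusiveCPUCount: 0}

	m.allocate(2) // a Guaranteed pod gets 2 exclusive CPUs
	fmt.Println(m.sharedPoolMilli, m.exclusiveCPUCount) // 6000 2

	m.release(2) // the pod is deleted, the CPUs return to the shared pool
	fmt.Println(m.sharedPoolMilli, m.exclusiveCPUCount) // 8000 0
}

In the kubelet itself, the gauges are initialized from the cpumanager state when the static policy starts and are then adjusted on every allocation and release, as the diffs below show.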

File tree

4 files changed: +168 -15 lines changed

pkg/kubelet/cm/cpumanager/policy_static.go

Lines changed: 31 additions & 0 deletions

@@ -194,6 +194,7 @@ func (p *staticPolicy) Start(s state.State) error {
 		klog.ErrorS(err, "Static policy invalid state, please drain node and remove policy state file")
 		return err
 	}
+	p.initializeMetrics(s)
 	return nil
 }

@@ -370,8 +371,10 @@ func (p *staticPolicy) Allocate(s state.State, pod *v1.Pod, container *v1.Contai
 		klog.ErrorS(err, "Unable to allocate CPUs", "pod", klog.KObj(pod), "containerName", container.Name, "numCPUs", numCPUs)
 		return err
 	}
+
 	s.SetCPUSet(string(pod.UID), container.Name, cpuset)
 	p.updateCPUsToReuse(pod, container, cpuset)
+	p.updateMetricsOnAllocate(cpuset)

 	return nil
 }

@@ -397,6 +400,7 @@ func (p *staticPolicy) RemoveContainer(s state.State, podUID string, containerNa
 		// Mutate the shared pool, adding released cpus.
 		toRelease = toRelease.Difference(cpusInUse)
 		s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(toRelease))
+		p.updateMetricsOnRelease(toRelease)
 	}
 	return nil
 }

@@ -720,3 +724,30 @@ func (p *staticPolicy) getAlignedCPUs(numaAffinity bitmask.BitMask, allocatableC

 	return alignedCPUs
 }
+
+func (p *staticPolicy) initializeMetrics(s state.State) {
+	metrics.CPUManagerSharedPoolSizeMilliCores.Set(float64(p.GetAvailableCPUs(s).Size() * 1000))
+	metrics.CPUManagerExclusiveCPUsAllocationCount.Set(float64(countExclusiveCPUs(s)))
+}
+
+func (p *staticPolicy) updateMetricsOnAllocate(cset cpuset.CPUSet) {
+	ncpus := cset.Size()
+	metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(ncpus))
+	metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(-ncpus * 1000))
+}
+
+func (p *staticPolicy) updateMetricsOnRelease(cset cpuset.CPUSet) {
+	ncpus := cset.Size()
+	metrics.CPUManagerExclusiveCPUsAllocationCount.Add(float64(-ncpus))
+	metrics.CPUManagerSharedPoolSizeMilliCores.Add(float64(ncpus * 1000))
+}
+
+func countExclusiveCPUs(s state.State) int {
+	exclusiveCPUs := 0
+	for _, cpuAssign := range s.GetCPUAssignments() {
+		for _, cset := range cpuAssign {
+			exclusiveCPUs += cset.Size()
+		}
+	}
+	return exclusiveCPUs
+}

pkg/kubelet/metrics/metrics.go

Lines changed: 27 additions & 3 deletions

@@ -108,8 +108,10 @@ const (
 	ManagedEphemeralContainersKey = "managed_ephemeral_containers"

 	// Metrics to track the CPU manager behavior
-	CPUManagerPinningRequestsTotalKey = "cpu_manager_pinning_requests_total"
-	CPUManagerPinningErrorsTotalKey   = "cpu_manager_pinning_errors_total"
+	CPUManagerPinningRequestsTotalKey         = "cpu_manager_pinning_requests_total"
+	CPUManagerPinningErrorsTotalKey           = "cpu_manager_pinning_errors_total"
+	CPUManagerSharedPoolSizeMilliCoresKey     = "cpu_manager_shared_pool_size_millicores"
+	CPUManagerExclusiveCPUsAllocationCountKey = "cpu_manager_exclusive_cpu_allocation_count"

 	// Metrics to track the Memory manager behavior
 	MemoryManagerPinningRequestsTotalKey = "memory_manager_pinning_requests_total"

@@ -773,6 +775,27 @@ var (
 		},
 	)

+	// CPUManagerSharedPoolSizeMilliCores reports the current size of the shared CPU pool for non-guaranteed pods
+	CPUManagerSharedPoolSizeMilliCores = metrics.NewGauge(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           CPUManagerSharedPoolSizeMilliCoresKey,
+			Help:           "The size of the shared CPU pool for non-guaranteed QoS pods, in millicores.",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
+
+	// CPUManagerExclusiveCPUsAllocationCount reports the total number of CPUs exclusively allocated to containers running on this node
+	CPUManagerExclusiveCPUsAllocationCount = metrics.NewGauge(
+		&metrics.GaugeOpts{
+			Subsystem:      KubeletSubsystem,
+			Name:           CPUManagerExclusiveCPUsAllocationCountKey,
+			Help:           "The total number of CPUs exclusively allocated to containers running on this node",
+			StabilityLevel: metrics.ALPHA,
+		},
+	)
+
+	// ContainerAlignedComputeResources reports the count of resources allocation which granted aligned resources, per alignment boundary
 	ContainerAlignedComputeResources = metrics.NewCounterVec(
 		&metrics.CounterOpts{
 			Subsystem: KubeletSubsystem,

@@ -782,7 +805,6 @@ var (
 		},
 		[]string{ContainerAlignedComputeResourcesScopeLabelKey, ContainerAlignedComputeResourcesBoundaryLabelKey},
 	)
-
 	// MemoryManagerPinningRequestTotal tracks the number of times the pod spec required the memory manager to pin memory pages
 	MemoryManagerPinningRequestTotal = metrics.NewCounter(
 		&metrics.CounterOpts{

@@ -1006,6 +1028,8 @@ func Register(collectors ...metrics.StableCollector) {
 	legacyregistry.MustRegister(RunPodSandboxErrors)
 	legacyregistry.MustRegister(CPUManagerPinningRequestsTotal)
 	legacyregistry.MustRegister(CPUManagerPinningErrorsTotal)
+	legacyregistry.MustRegister(CPUManagerSharedPoolSizeMilliCores)
+	legacyregistry.MustRegister(CPUManagerExclusiveCPUsAllocationCount)
 	legacyregistry.MustRegister(ContainerAlignedComputeResources)
 	if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
 		legacyregistry.MustRegister(MemoryManagerPinningRequestTotal)

test/e2e_node/cpu_manager_metrics_test.go

Lines changed: 92 additions & 0 deletions

@@ -29,8 +29,11 @@ import (
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kubeletpodresourcesv1 "k8s.io/kubelet/pkg/apis/podresources/v1"
 	kubeletconfig "k8s.io/kubernetes/pkg/kubelet/apis/config"
+	"k8s.io/kubernetes/pkg/kubelet/apis/podresources"
 	"k8s.io/kubernetes/pkg/kubelet/cm/cpumanager"
+	"k8s.io/kubernetes/pkg/kubelet/util"
 	"k8s.io/kubernetes/test/e2e/feature"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2emetrics "k8s.io/kubernetes/test/e2e/framework/metrics"

@@ -182,6 +185,95 @@ var _ = SIGDescribe("CPU Manager Metrics", framework.WithSerial(), feature.CPUMa
 		ginkgo.By("Ensuring the metrics match the expectations about alignment metrics a few more times")
 		gomega.Consistently(ctx, getKubeletMetrics, 1*time.Minute, 15*time.Second).Should(matchAlignmentMetrics)
 	})
+
+	ginkgo.It("should report the default idle cpu pool size", func(ctx context.Context) {
+		ginkgo.By("Querying the podresources endpoint to get the baseline")
+		endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
+		framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
+
+		cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
+		framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
+		defer func() {
+			framework.ExpectNoError(conn.Close())
+		}()
+
+		ginkgo.By("Checking the pool allocatable resources from the kubelet")
+		resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
+		framework.ExpectNoError(err, "failed to get the kubelet allocatable resources")
+		allocatableCPUs, _ := demuxCPUsAndDevicesFromGetAllocatableResources(resp)
+
+		matchResourceMetrics := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+			"kubelet_cpu_manager_shared_pool_size_millicores": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(int(allocatableCPUs.Size() * 1000)),
+			}),
+			"kubelet_cpu_manager_exclusive_cpu_allocation_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(0),
+			}),
+		})
+
+		ginkgo.By("Giving the Kubelet time to start up and produce metrics about idle pool size")
+		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 10*time.Second).Should(matchResourceMetrics)
+		ginkgo.By("Ensuring the metrics match the expectations about idle pool size a few more times")
+		gomega.Consistently(ctx, getKubeletMetrics, 30*time.Second, 10*time.Second).Should(matchResourceMetrics)
+	})
+
+	ginkgo.It("should report mutating cpu pool size when handling guaranteed pods", func(ctx context.Context) {
+		ginkgo.By("Querying the podresources endpoint to get the baseline")
+		endpoint, err := util.LocalEndpoint(defaultPodResourcesPath, podresources.Socket)
+		framework.ExpectNoError(err, "LocalEndpoint() failed err: %v", err)
+
+		cli, conn, err := podresources.GetV1Client(endpoint, defaultPodResourcesTimeout, defaultPodResourcesMaxSize)
+		framework.ExpectNoError(err, "GetV1Client() failed err: %v", err)
+		defer func() {
+			framework.ExpectNoError(conn.Close())
+		}()

+		ginkgo.By("Checking the pool allocatable resources from the kubelet")
+		resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
+		framework.ExpectNoError(err, "failed to get the kubelet allocatable resources")
+		allocatableCPUs, _ := demuxCPUsAndDevicesFromGetAllocatableResources(resp)
+
+		allocatableCPUsIdleMillis := int(allocatableCPUs.Size() * 1000)
+
+		matchResourceMetricsIdle := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+			"kubelet_cpu_manager_shared_pool_size_millicores": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(allocatableCPUsIdleMillis),
+			}),
+			"kubelet_cpu_manager_exclusive_cpu_allocation_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(0),
+			}),
+		})
+		ginkgo.By(fmt.Sprintf("Pool allocatable resources from the kubelet: shared pool %d cpus %d millis", allocatableCPUs.Size(), allocatableCPUsIdleMillis))
+
+		ginkgo.By("Giving the Kubelet time to start up and produce metrics about idle pool size")
+		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 10*time.Second).Should(matchResourceMetricsIdle)
+		ginkgo.By("Ensuring the metrics match the expectations about idle pool size a few more times")
+		gomega.Consistently(ctx, getKubeletMetrics, 30*time.Second, 10*time.Second).Should(matchResourceMetricsIdle)
+
+		ginkgo.By("Creating the test pod to consume exclusive cpus from the pool")
+		testPod = e2epod.NewPodClient(f).CreateSync(ctx, makeGuaranteedCPUExclusiveSleeperPod("smt-cpupool", smtLevel))
+
+		matchResourceMetricsBusy := gstruct.MatchKeys(gstruct.IgnoreExtras, gstruct.Keys{
+			"kubelet_cpu_manager_shared_pool_size_millicores": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(allocatableCPUsIdleMillis - (smtLevel * 1000)),
+			}),
+			"kubelet_cpu_manager_exclusive_cpu_allocation_count": gstruct.MatchAllElements(nodeID, gstruct.Elements{
+				"": timelessSample(smtLevel),
+			}),
+		})
+
+		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
+		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 10*time.Second).Should(matchResourceMetricsBusy)
+		ginkgo.By("Ensuring the metrics match the expectations a few more times")
+		gomega.Consistently(ctx, getKubeletMetrics, 30*time.Second, 10*time.Second).Should(matchResourceMetricsBusy)
+
+		deletePodSyncByName(ctx, f, testPod.Name)
+
+		ginkgo.By("Giving the Kubelet time to start up and produce metrics")
+		gomega.Eventually(ctx, getKubeletMetrics, 1*time.Minute, 10*time.Second).Should(matchResourceMetricsIdle)
+		ginkgo.By("Ensuring the metrics match the expectations a few more times")
+		gomega.Consistently(ctx, getKubeletMetrics, 30*time.Second, 10*time.Second).Should(matchResourceMetricsIdle)
+	})
 	})
 })

test/e2e_node/podresources_test.go

Lines changed: 18 additions & 12 deletions

@@ -719,19 +719,16 @@ func podresourcesListTests(ctx context.Context, f *framework.Framework, cli kube
 }

 func podresourcesGetAllocatableResourcesTests(ctx context.Context, cli kubeletpodresourcesv1.PodResourcesListerClient, sd *sriovData, onlineCPUs, reservedSystemCPUs cpuset.CPUSet) {
+	ginkgo.GinkgoHelper()
+
 	ginkgo.By("checking the devices known to the kubelet")
 	resp, err := cli.GetAllocatableResources(ctx, &kubeletpodresourcesv1.AllocatableResourcesRequest{})
-	framework.ExpectNoErrorWithOffset(1, err)
-	devs := resp.GetDevices()
-	var cpus []int
-	for _, cpuid := range resp.GetCpuIds() {
-		cpus = append(cpus, int(cpuid))
-	}
-	allocatableCPUs := cpuset.New(cpus...)
+	framework.ExpectNoError(err, "cannot get allocatable CPUs from podresources")
+	allocatableCPUs, devs := demuxCPUsAndDevicesFromGetAllocatableResources(resp)

 	if onlineCPUs.Size() == 0 {
 		ginkgo.By("expecting no CPUs reported")
-		gomega.ExpectWithOffset(1, onlineCPUs.Size()).To(gomega.Equal(reservedSystemCPUs.Size()), "with no online CPUs, no CPUs should be reserved")
+		gomega.Expect(onlineCPUs.Size()).To(gomega.Equal(reservedSystemCPUs.Size()), "with no online CPUs, no CPUs should be reserved")
 	} else {
 		ginkgo.By(fmt.Sprintf("expecting online CPUs reported - online=%v (%d) reserved=%v (%d)", onlineCPUs, onlineCPUs.Size(), reservedSystemCPUs, reservedSystemCPUs.Size()))
 		if reservedSystemCPUs.Size() > onlineCPUs.Size() {

@@ -740,21 +737,30 @@ func podresourcesGetAllocatableResourcesTests(ctx context.Context, cli kubeletpo
 		expectedCPUs := onlineCPUs.Difference(reservedSystemCPUs)

 		ginkgo.By(fmt.Sprintf("expecting CPUs '%v'='%v'", allocatableCPUs, expectedCPUs))
-		gomega.ExpectWithOffset(1, allocatableCPUs.Equals(expectedCPUs)).To(gomega.BeTrueBecause("mismatch expecting CPUs"))
+		gomega.Expect(allocatableCPUs.Equals(expectedCPUs)).To(gomega.BeTrueBecause("mismatch expecting CPUs"))
 	}

 	if sd == nil { // no devices in the environment, so expect no devices
 		ginkgo.By("expecting no devices reported")
-		gomega.ExpectWithOffset(1, devs).To(gomega.BeEmpty(), fmt.Sprintf("got unexpected devices %#v", devs))
+		gomega.Expect(devs).To(gomega.BeEmpty(), fmt.Sprintf("got unexpected devices %#v", devs))
 		return
 	}

 	ginkgo.By(fmt.Sprintf("expecting some %q devices reported", sd.resourceName))
-	gomega.ExpectWithOffset(1, devs).ToNot(gomega.BeEmpty())
+	gomega.Expect(devs).ToNot(gomega.BeEmpty())
 	for _, dev := range devs {
 		gomega.Expect(dev.ResourceName).To(gomega.Equal(sd.resourceName))
-		gomega.ExpectWithOffset(1, dev.DeviceIds).ToNot(gomega.BeEmpty())
+		gomega.Expect(dev.DeviceIds).ToNot(gomega.BeEmpty())
+	}
+}
+
+func demuxCPUsAndDevicesFromGetAllocatableResources(resp *kubeletpodresourcesv1.AllocatableResourcesResponse) (cpuset.CPUSet, []*kubeletpodresourcesv1.ContainerDevices) {
+	devs := resp.GetDevices()
+	var cpus []int
+	for _, cpuid := range resp.GetCpuIds() {
+		cpus = append(cpus, int(cpuid))
 	}
+	return cpuset.New(cpus...), devs
 }

 func podresourcesGetTests(ctx context.Context, f *framework.Framework, cli kubeletpodresourcesv1.PodResourcesListerClient, sidecarContainersEnabled bool) {
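As a usage note, other tests in the e2e_node package could reuse the new helper in the same way; the wrapper below is a hypothetical sketch, with resp coming from cli.GetAllocatableResources as in the tests above.

func logAllocatableResources(resp *kubeletpodresourcesv1.AllocatableResourcesResponse) {
	// Split the response into the allocatable CPU set and the device list.
	allocatableCPUs, devs := demuxCPUsAndDevicesFromGetAllocatableResources(resp)
	framework.Logf("allocatable CPUs: %v (%d), devices: %d", allocatableCPUs, allocatableCPUs.Size(), len(devs))
}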
