Commit c2524cb

DRA resourceclaims: maintain metric of total and allocated claims
These metrics can provide insights into ResourceClaim usage. The total count is redundant because the apiserver already provides object counts, but having it in the same sub-system, next to the count of allocated claims, may be more discoverable and helps with monitoring the controller itself.
1 parent 98e5a70 commit c2524cb

File tree: 3 files changed, 197 additions and 29 deletions

pkg/controller/resourceclaim/controller.go

Lines changed: 41 additions & 8 deletions
@@ -157,15 +157,15 @@ func NewController(
     if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
         AddFunc: func(obj interface{}) {
             logger.V(6).Info("new claim", "claimDump", obj)
-            ec.enqueueResourceClaim(logger, obj, false)
+            ec.enqueueResourceClaim(logger, nil, obj)
         },
         UpdateFunc: func(old, updated interface{}) {
             logger.V(6).Info("updated claim", "claimDump", updated)
-            ec.enqueueResourceClaim(logger, updated, false)
+            ec.enqueueResourceClaim(logger, old, updated)
         },
         DeleteFunc: func(obj interface{}) {
             logger.V(6).Info("deleted claim", "claimDump", obj)
-            ec.enqueueResourceClaim(logger, obj, true)
+            ec.enqueueResourceClaim(logger, obj, nil)
         },
     }); err != nil {
         return nil, err
@@ -326,15 +326,48 @@ func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
     return false, "nothing to do"
 }
 
-func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
-    if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
-        obj = d.Obj
+func (ec *Controller) enqueueResourceClaim(logger klog.Logger, oldObj, newObj interface{}) {
+    deleted := newObj == nil
+    if d, ok := oldObj.(cache.DeletedFinalStateUnknown); ok {
+        oldObj = d.Obj
     }
-    claim, ok := obj.(*resourceapi.ResourceClaim)
-    if !ok {
+    oldClaim, ok := oldObj.(*resourceapi.ResourceClaim)
+    if oldObj != nil && !ok {
         return
     }
+    newClaim, ok := newObj.(*resourceapi.ResourceClaim)
+    if newObj != nil && !ok {
+        return
+    }
+
+    // Maintain metrics based on what was observed.
+    switch {
+    case oldClaim == nil:
+        // Added.
+        metrics.NumResourceClaims.Inc()
+        if newClaim.Status.Allocation != nil {
+            metrics.NumAllocatedResourceClaims.Inc()
+        }
+    case newClaim == nil:
+        // Deleted.
+        metrics.NumResourceClaims.Dec()
+        if oldClaim.Status.Allocation != nil {
+            metrics.NumAllocatedResourceClaims.Dec()
+        }
+    default:
+        // Updated.
+        switch {
+        case oldClaim.Status.Allocation == nil && newClaim.Status.Allocation != nil:
+            metrics.NumAllocatedResourceClaims.Inc()
+        case oldClaim.Status.Allocation != nil && newClaim.Status.Allocation == nil:
+            metrics.NumAllocatedResourceClaims.Dec()
+        }
+    }
 
+    claim := newClaim
+    if claim == nil {
+        claim = oldClaim
+    }
     if !deleted {
         // When starting up, we have to check all claims to find those with
         // stale pods in ReservedFor. During an update, a pod might get added
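The metric bookkeeping above is transition-based: the total gauge moves only on add and delete, and the allocated gauge moves only when the allocation status actually flips, so no-op updates leave both gauges untouched. A minimal, self-contained sketch of that rule using plain integers instead of the component-base gauges (all names below are illustrative and not part of the commit):

package main

import "fmt"

// claimView captures the only fact the bookkeeping cares about:
// whether the claim (if present) currently has an allocation.
type claimView struct{ allocated bool }

// gauges mirrors the two new metrics with plain integers.
type gauges struct{ total, allocated int }

// observe applies one informer event: nil oldC means "added",
// nil newC means "deleted", both non-nil means "updated".
func (g *gauges) observe(oldC, newC *claimView) {
    switch {
    case oldC == nil: // added
        g.total++
        if newC.allocated {
            g.allocated++
        }
    case newC == nil: // deleted
        g.total--
        if oldC.allocated {
            g.allocated--
        }
    default: // updated: only allocation flips move the gauge
        if !oldC.allocated && newC.allocated {
            g.allocated++
        } else if oldC.allocated && !newC.allocated {
            g.allocated--
        }
    }
}

func main() {
    var g gauges
    g.observe(nil, &claimView{})                         // create unallocated claim
    g.observe(&claimView{}, &claimView{allocated: true}) // allocate it
    g.observe(&claimView{allocated: true}, nil)          // delete it
    fmt.Println(g.total, g.allocated)                    // prints: 0 0
}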

pkg/controller/resourceclaim/controller_test.go

Lines changed: 138 additions & 21 deletions
@@ -17,13 +17,14 @@ limitations under the License.
 package resourceclaim
 
 import (
-    "context"
     "errors"
     "fmt"
     "sort"
     "sync"
     "testing"
+    "time"
 
+    "github.com/onsi/gomega"
     "github.com/stretchr/testify/assert"
 
     v1 "k8s.io/api/core/v1"
@@ -38,7 +39,8 @@ import (
     "k8s.io/component-base/metrics/testutil"
     "k8s.io/klog/v2"
     "k8s.io/kubernetes/pkg/controller"
-    ephemeralvolumemetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
+    "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
+    "k8s.io/kubernetes/test/utils/ktesting"
 )
 
 var (
@@ -79,10 +81,6 @@ var (
     }()
 )
 
-func init() {
-    klog.InitFlags(nil)
-}
-
 func TestSyncHandler(t *testing.T) {
     tests := []struct {
         name string
@@ -366,8 +364,8 @@ func TestSyncHandler(t *testing.T) {
     for _, tc := range tests {
         // Run sequentially because of global logging and global metrics.
         t.Run(tc.name, func(t *testing.T) {
-            ctx, cancel := context.WithCancel(context.Background())
-            defer cancel()
+            tCtx := ktesting.Init(t)
+            tCtx = ktesting.WithCancel(tCtx)
 
             var objects []runtime.Object
             for _, pod := range tc.pods {
@@ -392,19 +390,19 @@ func TestSyncHandler(t *testing.T) {
             claimInformer := informerFactory.Resource().V1alpha3().ResourceClaims()
             templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
 
-            ec, err := NewController(klog.FromContext(ctx), fakeKubeClient, podInformer, claimInformer, templateInformer)
+            ec, err := NewController(klog.FromContext(tCtx), fakeKubeClient, podInformer, claimInformer, templateInformer)
             if err != nil {
                 t.Fatalf("error creating ephemeral controller : %v", err)
             }
 
             // Ensure informers are up-to-date.
-            informerFactory.Start(ctx.Done())
+            informerFactory.Start(tCtx.Done())
             stopInformers := func() {
-                cancel()
+                tCtx.Cancel("stopping informers")
                 informerFactory.Shutdown()
             }
             defer stopInformers()
-            informerFactory.WaitForCacheSync(ctx.Done())
+            informerFactory.WaitForCacheSync(tCtx.Done())
 
             // Add claims that only exist in the mutation cache.
             for _, claim := range tc.claimsInCache {
@@ -414,27 +412,27 @@ func TestSyncHandler(t *testing.T) {
             // Simulate race: stop informers, add more pods that the controller doesn't know about.
             stopInformers()
             for _, pod := range tc.podsLater {
-                _, err := fakeKubeClient.CoreV1().Pods(pod.Namespace).Create(ctx, pod, metav1.CreateOptions{})
+                _, err := fakeKubeClient.CoreV1().Pods(pod.Namespace).Create(tCtx, pod, metav1.CreateOptions{})
                 if err != nil {
                     t.Fatalf("unexpected error while creating pod: %v", err)
                 }
             }
 
-            err = ec.syncHandler(ctx, tc.key)
+            err = ec.syncHandler(tCtx, tc.key)
             if err != nil && !tc.expectedError {
                 t.Fatalf("unexpected error while running handler: %v", err)
             }
             if err == nil && tc.expectedError {
                 t.Fatalf("unexpected success")
             }
 
-            claims, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims("").List(ctx, metav1.ListOptions{})
+            claims, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims("").List(tCtx, metav1.ListOptions{})
             if err != nil {
                 t.Fatalf("unexpected error while listing claims: %v", err)
             }
             assert.Equal(t, normalizeClaims(tc.expectedClaims), normalizeClaims(claims.Items))
 
-            pods, err := fakeKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
+            pods, err := fakeKubeClient.CoreV1().Pods("").List(tCtx, metav1.ListOptions{})
             if err != nil {
                 t.Fatalf("unexpected error while listing pods: %v", err)
             }
@@ -455,6 +453,95 @@ func TestSyncHandler(t *testing.T) {
     }
 }
 
+func TestResourceClaimEventHandler(t *testing.T) {
+    tCtx := ktesting.Init(t)
+    tCtx = ktesting.WithCancel(tCtx)
+
+    fakeKubeClient := createTestClient()
+    setupMetrics()
+    informerFactory := informers.NewSharedInformerFactory(fakeKubeClient, controller.NoResyncPeriodFunc())
+    podInformer := informerFactory.Core().V1().Pods()
+    claimInformer := informerFactory.Resource().V1alpha3().ResourceClaims()
+    templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
+    claimClient := fakeKubeClient.ResourceV1alpha3().ResourceClaims(testNamespace)
+
+    _, err := NewController(tCtx.Logger(), fakeKubeClient, podInformer, claimInformer, templateInformer)
+    tCtx.ExpectNoError(err, "creating ephemeral controller")
+
+    informerFactory.Start(tCtx.Done())
+    stopInformers := func() {
+        tCtx.Cancel("stopping informers")
+        informerFactory.Shutdown()
+    }
+    defer stopInformers()
+
+    var em numMetrics
+
+    _, err = claimClient.Create(tCtx, testClaim, metav1.CreateOptions{})
+    em.claims++
+    ktesting.Step(tCtx, "create claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    modifiedClaim := testClaim.DeepCopy()
+    modifiedClaim.Labels = map[string]string{"foo": "bar"}
+    _, err = claimClient.Update(tCtx, modifiedClaim, metav1.UpdateOptions{})
+    ktesting.Step(tCtx, "modify claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Consistently(tCtx)
+    })
+
+    _, err = claimClient.Update(tCtx, testClaimAllocated, metav1.UpdateOptions{})
+    em.allocated++
+    ktesting.Step(tCtx, "allocate claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    modifiedClaim = testClaimAllocated.DeepCopy()
+    modifiedClaim.Labels = map[string]string{"foo": "bar2"}
+    _, err = claimClient.Update(tCtx, modifiedClaim, metav1.UpdateOptions{})
+    ktesting.Step(tCtx, "modify claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Consistently(tCtx)
+    })
+
+    otherClaimAllocated := testClaimAllocated.DeepCopy()
+    otherClaimAllocated.Name += "2"
+    _, err = claimClient.Create(tCtx, otherClaimAllocated, metav1.CreateOptions{})
+    em.claims++
+    em.allocated++
+    ktesting.Step(tCtx, "create allocated claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    _, err = claimClient.Update(tCtx, testClaim, metav1.UpdateOptions{})
+    em.allocated--
+    ktesting.Step(tCtx, "deallocate claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    err = claimClient.Delete(tCtx, testClaim.Name, metav1.DeleteOptions{})
+    em.claims--
+    ktesting.Step(tCtx, "delete deallocated claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    err = claimClient.Delete(tCtx, otherClaimAllocated.Name, metav1.DeleteOptions{})
+    em.claims--
+    em.allocated--
+    ktesting.Step(tCtx, "delete allocated claim", func(tCtx ktesting.TContext) {
+        tCtx.ExpectNoError(err)
+        em.Eventually(tCtx)
+    })
+
+    em.Consistently(tCtx)
+}
+
 func makeClaim(name, namespace, classname string, owner *metav1.OwnerReference) *resourceapi.ResourceClaim {
     claim := &resourceapi.ResourceClaim{
         ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
@@ -596,6 +683,34 @@ func createResourceClaimReactor() func(action k8stesting.Action) (handled bool,
 
 // Metrics helpers
 
+type numMetrics struct {
+    claims    float64
+    allocated float64
+}
+
+func getNumMetric() (em numMetrics, err error) {
+    em.claims, err = testutil.GetGaugeMetricValue(metrics.NumResourceClaims)
+    if err != nil {
+        return
+    }
+    em.allocated, err = testutil.GetGaugeMetricValue(metrics.NumAllocatedResourceClaims)
+    return
+}
+
+func (em numMetrics) Eventually(tCtx ktesting.TContext) {
+    g := gomega.NewWithT(tCtx)
+    tCtx.Helper()
+
+    g.Eventually(getNumMetric).WithTimeout(5 * time.Second).Should(gomega.Equal(em))
+}
+
+func (em numMetrics) Consistently(tCtx ktesting.TContext) {
+    g := gomega.NewWithT(tCtx)
+    tCtx.Helper()
+
+    g.Consistently(getNumMetric).WithTimeout(time.Second).Should(gomega.Equal(em))
+}
+
 type expectedMetrics struct {
     numCreated  int
     numFailures int
@@ -604,12 +719,12 @@ type expectedMetrics struct {
 func expectMetrics(t *testing.T, em expectedMetrics) {
     t.Helper()
 
-    actualCreated, err := testutil.GetCounterMetricValue(ephemeralvolumemetrics.ResourceClaimCreateAttempts)
+    actualCreated, err := testutil.GetCounterMetricValue(metrics.ResourceClaimCreateAttempts)
     handleErr(t, err, "ResourceClaimCreate")
     if actualCreated != float64(em.numCreated) {
         t.Errorf("Expected claims to be created %d, got %v", em.numCreated, actualCreated)
     }
-    actualConflicts, err := testutil.GetCounterMetricValue(ephemeralvolumemetrics.ResourceClaimCreateFailures)
+    actualConflicts, err := testutil.GetCounterMetricValue(metrics.ResourceClaimCreateFailures)
     handleErr(t, err, "ResourceClaimCreate/Conflict")
     if actualConflicts != float64(em.numFailures) {
         t.Errorf("Expected claims to have conflicts %d, got %v", em.numFailures, actualConflicts)
@@ -623,7 +738,9 @@ func handleErr(t *testing.T, err error, metricName string) {
 }
 
 func setupMetrics() {
-    ephemeralvolumemetrics.RegisterMetrics()
-    ephemeralvolumemetrics.ResourceClaimCreateAttempts.Reset()
-    ephemeralvolumemetrics.ResourceClaimCreateFailures.Reset()
+    metrics.RegisterMetrics()
+    metrics.ResourceClaimCreateAttempts.Reset()
+    metrics.ResourceClaimCreateFailures.Reset()
+    metrics.NumResourceClaims.Set(0)
+    metrics.NumAllocatedResourceClaims.Set(0)
 }
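Because the controller handles informer events asynchronously, the new test never reads the gauges just once: numMetrics.Eventually polls until the expected values appear after an event that should change them, and numMetrics.Consistently verifies that they hold steady after an event that should not. A self-contained sketch of that poll-then-hold pattern (the counter and test name below are illustrative stand-ins, not part of the commit):

package example

import (
    "sync/atomic"
    "testing"
    "time"

    "github.com/onsi/gomega"
)

// TestPollThenHold mimics the Eventually/Consistently pairing used in
// TestResourceClaimEventHandler, with an atomic counter standing in for
// the gauges read via testutil.GetGaugeMetricValue.
func TestPollThenHold(t *testing.T) {
    var counter atomic.Int64
    go func() {
        time.Sleep(100 * time.Millisecond) // simulate asynchronous event handling
        counter.Add(1)
    }()

    g := gomega.NewWithT(t)
    get := func() int64 { return counter.Load() }
    // Wait for the change to land ...
    g.Eventually(get).WithTimeout(5 * time.Second).Should(gomega.Equal(int64(1)))
    // ... then make sure nothing moves it afterwards.
    g.Consistently(get).WithTimeout(time.Second).Should(gomega.Equal(int64(1)))
}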

pkg/controller/resourceclaim/metrics/metrics.go

Lines changed: 18 additions & 0 deletions
@@ -45,6 +45,22 @@ var (
         Help:           "Number of ResourceClaims creation request failures",
         StabilityLevel: metrics.ALPHA,
     })
+    // NumResourceClaims tracks the current number of ResourceClaims.
+    NumResourceClaims = metrics.NewGauge(
+        &metrics.GaugeOpts{
+            Subsystem:      ResourceClaimSubsystem,
+            Name:           "resource_claims",
+            Help:           "Number of ResourceClaims",
+            StabilityLevel: metrics.ALPHA,
+        })
+    // NumAllocatedResourceClaims tracks the current number of allocated ResourceClaims.
+    NumAllocatedResourceClaims = metrics.NewGauge(
+        &metrics.GaugeOpts{
+            Subsystem:      ResourceClaimSubsystem,
+            Name:           "allocated_resource_claims",
+            Help:           "Number of allocated ResourceClaims",
+            StabilityLevel: metrics.ALPHA,
+        })
 )
 
 var registerMetrics sync.Once
@@ -54,5 +70,7 @@ func RegisterMetrics() {
     registerMetrics.Do(func() {
         legacyregistry.MustRegister(ResourceClaimCreateAttempts)
         legacyregistry.MustRegister(ResourceClaimCreateFailures)
+        legacyregistry.MustRegister(NumResourceClaims)
+        legacyregistry.MustRegister(NumAllocatedResourceClaims)
     })
 }
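Both gauges are ALPHA and registered with the legacy registry next to the existing create-attempt counters, so they are exposed under the ResourceClaimSubsystem prefix wherever that registry is served. A minimal round-trip sketch (not part of the commit) that drives the gauges and reads them back with the same component-base testutil helper the controller tests use:

package metrics_test

import (
    "testing"

    "k8s.io/component-base/metrics/testutil"

    "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
)

// TestGaugeRoundTrip registers the metrics, moves the gauges the way the
// controller does, and reads the values back. Test name and expectations
// are illustrative only.
func TestGaugeRoundTrip(t *testing.T) {
    metrics.RegisterMetrics()
    metrics.NumResourceClaims.Set(0)
    metrics.NumAllocatedResourceClaims.Set(0)

    metrics.NumResourceClaims.Inc()
    metrics.NumResourceClaims.Inc()
    metrics.NumAllocatedResourceClaims.Inc()

    total, err := testutil.GetGaugeMetricValue(metrics.NumResourceClaims)
    if err != nil {
        t.Fatal(err)
    }
    allocated, err := testutil.GetGaugeMetricValue(metrics.NumAllocatedResourceClaims)
    if err != nil {
        t.Fatal(err)
    }
    if total != 2 || allocated != 1 {
        t.Errorf("expected total=2 allocated=1, got total=%v allocated=%v", total, allocated)
    }
}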
