Skip to content

Commit 8aae9aa

Browse files
authored
Merge pull request kubernetes#127661 from pohly/dra-resourceclaim-metrics
DRA resourceclaims: maintain metric of total and allocated claims
2 parents 1e7f752 + c2524cb commit 8aae9aa

File tree

3 files changed

+197
-29
lines changed

3 files changed

+197
-29
lines changed

pkg/controller/resourceclaim/controller.go

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -157,15 +157,15 @@ func NewController(
157157
if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
158158
AddFunc: func(obj interface{}) {
159159
logger.V(6).Info("new claim", "claimDump", obj)
160-
ec.enqueueResourceClaim(logger, obj, false)
160+
ec.enqueueResourceClaim(logger, nil, obj)
161161
},
162162
UpdateFunc: func(old, updated interface{}) {
163163
logger.V(6).Info("updated claim", "claimDump", updated)
164-
ec.enqueueResourceClaim(logger, updated, false)
164+
ec.enqueueResourceClaim(logger, old, updated)
165165
},
166166
DeleteFunc: func(obj interface{}) {
167167
logger.V(6).Info("deleted claim", "claimDump", obj)
168-
ec.enqueueResourceClaim(logger, obj, true)
168+
ec.enqueueResourceClaim(logger, obj, nil)
169169
},
170170
}); err != nil {
171171
return nil, err
@@ -326,15 +326,48 @@ func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
326326
return false, "nothing to do"
327327
}
328328

329-
func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
330-
if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
331-
obj = d.Obj
329+
func (ec *Controller) enqueueResourceClaim(logger klog.Logger, oldObj, newObj interface{}) {
330+
deleted := newObj != nil
331+
if d, ok := oldObj.(cache.DeletedFinalStateUnknown); ok {
332+
oldObj = d.Obj
332333
}
333-
claim, ok := obj.(*resourceapi.ResourceClaim)
334-
if !ok {
334+
oldClaim, ok := oldObj.(*resourceapi.ResourceClaim)
335+
if oldObj != nil && !ok {
335336
return
336337
}
338+
newClaim, ok := newObj.(*resourceapi.ResourceClaim)
339+
if newObj != nil && !ok {
340+
return
341+
}
342+
343+
// Maintain metrics based on what was observed.
344+
switch {
345+
case oldClaim == nil:
346+
// Added.
347+
metrics.NumResourceClaims.Inc()
348+
if newClaim.Status.Allocation != nil {
349+
metrics.NumAllocatedResourceClaims.Inc()
350+
}
351+
case newClaim == nil:
352+
// Deleted.
353+
metrics.NumResourceClaims.Dec()
354+
if oldClaim.Status.Allocation != nil {
355+
metrics.NumAllocatedResourceClaims.Dec()
356+
}
357+
default:
358+
// Updated.
359+
switch {
360+
case oldClaim.Status.Allocation == nil && newClaim.Status.Allocation != nil:
361+
metrics.NumAllocatedResourceClaims.Inc()
362+
case oldClaim.Status.Allocation != nil && newClaim.Status.Allocation == nil:
363+
metrics.NumAllocatedResourceClaims.Dec()
364+
}
365+
}
337366

367+
claim := newClaim
368+
if claim == nil {
369+
claim = oldClaim
370+
}
338371
if !deleted {
339372
// When starting up, we have to check all claims to find those with
340373
// stale pods in ReservedFor. During an update, a pod might get added

pkg/controller/resourceclaim/controller_test.go

Lines changed: 138 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,14 @@ limitations under the License.
1717
package resourceclaim
1818

1919
import (
20-
"context"
2120
"errors"
2221
"fmt"
2322
"sort"
2423
"sync"
2524
"testing"
25+
"time"
2626

27+
"github.com/onsi/gomega"
2728
"github.com/stretchr/testify/assert"
2829

2930
v1 "k8s.io/api/core/v1"
@@ -38,7 +39,8 @@ import (
3839
"k8s.io/component-base/metrics/testutil"
3940
"k8s.io/klog/v2"
4041
"k8s.io/kubernetes/pkg/controller"
41-
ephemeralvolumemetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
42+
"k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
43+
"k8s.io/kubernetes/test/utils/ktesting"
4244
)
4345

4446
var (
@@ -79,10 +81,6 @@ var (
7981
}()
8082
)
8183

82-
func init() {
83-
klog.InitFlags(nil)
84-
}
85-
8684
func TestSyncHandler(t *testing.T) {
8785
tests := []struct {
8886
name string
@@ -366,8 +364,8 @@ func TestSyncHandler(t *testing.T) {
366364
for _, tc := range tests {
367365
// Run sequentially because of global logging and global metrics.
368366
t.Run(tc.name, func(t *testing.T) {
369-
ctx, cancel := context.WithCancel(context.Background())
370-
defer cancel()
367+
tCtx := ktesting.Init(t)
368+
tCtx = ktesting.WithCancel(tCtx)
371369

372370
var objects []runtime.Object
373371
for _, pod := range tc.pods {
@@ -392,19 +390,19 @@ func TestSyncHandler(t *testing.T) {
392390
claimInformer := informerFactory.Resource().V1alpha3().ResourceClaims()
393391
templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
394392

395-
ec, err := NewController(klog.FromContext(ctx), fakeKubeClient, podInformer, claimInformer, templateInformer)
393+
ec, err := NewController(klog.FromContext(tCtx), fakeKubeClient, podInformer, claimInformer, templateInformer)
396394
if err != nil {
397395
t.Fatalf("error creating ephemeral controller : %v", err)
398396
}
399397

400398
// Ensure informers are up-to-date.
401-
informerFactory.Start(ctx.Done())
399+
informerFactory.Start(tCtx.Done())
402400
stopInformers := func() {
403-
cancel()
401+
tCtx.Cancel("stopping informers")
404402
informerFactory.Shutdown()
405403
}
406404
defer stopInformers()
407-
informerFactory.WaitForCacheSync(ctx.Done())
405+
informerFactory.WaitForCacheSync(tCtx.Done())
408406

409407
// Add claims that only exist in the mutation cache.
410408
for _, claim := range tc.claimsInCache {
@@ -414,27 +412,27 @@ func TestSyncHandler(t *testing.T) {
414412
// Simulate race: stop informers, add more pods that the controller doesn't know about.
415413
stopInformers()
416414
for _, pod := range tc.podsLater {
417-
_, err := fakeKubeClient.CoreV1().Pods(pod.Namespace).Create(ctx, pod, metav1.CreateOptions{})
415+
_, err := fakeKubeClient.CoreV1().Pods(pod.Namespace).Create(tCtx, pod, metav1.CreateOptions{})
418416
if err != nil {
419417
t.Fatalf("unexpected error while creating pod: %v", err)
420418
}
421419
}
422420

423-
err = ec.syncHandler(ctx, tc.key)
421+
err = ec.syncHandler(tCtx, tc.key)
424422
if err != nil && !tc.expectedError {
425423
t.Fatalf("unexpected error while running handler: %v", err)
426424
}
427425
if err == nil && tc.expectedError {
428426
t.Fatalf("unexpected success")
429427
}
430428

431-
claims, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims("").List(ctx, metav1.ListOptions{})
429+
claims, err := fakeKubeClient.ResourceV1alpha3().ResourceClaims("").List(tCtx, metav1.ListOptions{})
432430
if err != nil {
433431
t.Fatalf("unexpected error while listing claims: %v", err)
434432
}
435433
assert.Equal(t, normalizeClaims(tc.expectedClaims), normalizeClaims(claims.Items))
436434

437-
pods, err := fakeKubeClient.CoreV1().Pods("").List(ctx, metav1.ListOptions{})
435+
pods, err := fakeKubeClient.CoreV1().Pods("").List(tCtx, metav1.ListOptions{})
438436
if err != nil {
439437
t.Fatalf("unexpected error while listing pods: %v", err)
440438
}
@@ -455,6 +453,95 @@ func TestSyncHandler(t *testing.T) {
455453
}
456454
}
457455

456+
func TestResourceClaimEventHandler(t *testing.T) {
457+
tCtx := ktesting.Init(t)
458+
tCtx = ktesting.WithCancel(tCtx)
459+
460+
fakeKubeClient := createTestClient()
461+
setupMetrics()
462+
informerFactory := informers.NewSharedInformerFactory(fakeKubeClient, controller.NoResyncPeriodFunc())
463+
podInformer := informerFactory.Core().V1().Pods()
464+
claimInformer := informerFactory.Resource().V1alpha3().ResourceClaims()
465+
templateInformer := informerFactory.Resource().V1alpha3().ResourceClaimTemplates()
466+
claimClient := fakeKubeClient.ResourceV1alpha3().ResourceClaims(testNamespace)
467+
468+
_, err := NewController(tCtx.Logger(), fakeKubeClient, podInformer, claimInformer, templateInformer)
469+
tCtx.ExpectNoError(err, "creating ephemeral controller")
470+
471+
informerFactory.Start(tCtx.Done())
472+
stopInformers := func() {
473+
tCtx.Cancel("stopping informers")
474+
informerFactory.Shutdown()
475+
}
476+
defer stopInformers()
477+
478+
var em numMetrics
479+
480+
_, err = claimClient.Create(tCtx, testClaim, metav1.CreateOptions{})
481+
em.claims++
482+
ktesting.Step(tCtx, "create claim", func(tCtx ktesting.TContext) {
483+
tCtx.ExpectNoError(err)
484+
em.Eventually(tCtx)
485+
})
486+
487+
modifiedClaim := testClaim.DeepCopy()
488+
modifiedClaim.Labels = map[string]string{"foo": "bar"}
489+
_, err = claimClient.Update(tCtx, modifiedClaim, metav1.UpdateOptions{})
490+
ktesting.Step(tCtx, "modify claim", func(tCtx ktesting.TContext) {
491+
tCtx.ExpectNoError(err)
492+
em.Consistently(tCtx)
493+
})
494+
495+
_, err = claimClient.Update(tCtx, testClaimAllocated, metav1.UpdateOptions{})
496+
em.allocated++
497+
ktesting.Step(tCtx, "allocate claim", func(tCtx ktesting.TContext) {
498+
tCtx.ExpectNoError(err)
499+
em.Eventually(tCtx)
500+
})
501+
502+
modifiedClaim = testClaimAllocated.DeepCopy()
503+
modifiedClaim.Labels = map[string]string{"foo": "bar2"}
504+
_, err = claimClient.Update(tCtx, modifiedClaim, metav1.UpdateOptions{})
505+
ktesting.Step(tCtx, "modify claim", func(tCtx ktesting.TContext) {
506+
tCtx.ExpectNoError(err)
507+
em.Consistently(tCtx)
508+
})
509+
510+
otherClaimAllocated := testClaimAllocated.DeepCopy()
511+
otherClaimAllocated.Name += "2"
512+
_, err = claimClient.Create(tCtx, otherClaimAllocated, metav1.CreateOptions{})
513+
em.claims++
514+
em.allocated++
515+
ktesting.Step(tCtx, "create allocated claim", func(tCtx ktesting.TContext) {
516+
tCtx.ExpectNoError(err)
517+
em.Eventually(tCtx)
518+
})
519+
520+
_, err = claimClient.Update(tCtx, testClaim, metav1.UpdateOptions{})
521+
em.allocated--
522+
ktesting.Step(tCtx, "deallocate claim", func(tCtx ktesting.TContext) {
523+
tCtx.ExpectNoError(err)
524+
em.Eventually(tCtx)
525+
})
526+
527+
err = claimClient.Delete(tCtx, testClaim.Name, metav1.DeleteOptions{})
528+
em.claims--
529+
ktesting.Step(tCtx, "delete deallocated claim", func(tCtx ktesting.TContext) {
530+
tCtx.ExpectNoError(err)
531+
em.Eventually(tCtx)
532+
})
533+
534+
err = claimClient.Delete(tCtx, otherClaimAllocated.Name, metav1.DeleteOptions{})
535+
em.claims--
536+
em.allocated--
537+
ktesting.Step(tCtx, "delete allocated claim", func(tCtx ktesting.TContext) {
538+
tCtx.ExpectNoError(err)
539+
em.Eventually(tCtx)
540+
})
541+
542+
em.Consistently(tCtx)
543+
}
544+
458545
func makeClaim(name, namespace, classname string, owner *metav1.OwnerReference) *resourceapi.ResourceClaim {
459546
claim := &resourceapi.ResourceClaim{
460547
ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace},
@@ -596,6 +683,34 @@ func createResourceClaimReactor() func(action k8stesting.Action) (handled bool,
596683

597684
// Metrics helpers
598685

686+
type numMetrics struct {
687+
claims float64
688+
allocated float64
689+
}
690+
691+
func getNumMetric() (em numMetrics, err error) {
692+
em.claims, err = testutil.GetGaugeMetricValue(metrics.NumResourceClaims)
693+
if err != nil {
694+
return
695+
}
696+
em.allocated, err = testutil.GetGaugeMetricValue(metrics.NumAllocatedResourceClaims)
697+
return
698+
}
699+
700+
func (em numMetrics) Eventually(tCtx ktesting.TContext) {
701+
g := gomega.NewWithT(tCtx)
702+
tCtx.Helper()
703+
704+
g.Eventually(getNumMetric).WithTimeout(5 * time.Second).Should(gomega.Equal(em))
705+
}
706+
707+
func (em numMetrics) Consistently(tCtx ktesting.TContext) {
708+
g := gomega.NewWithT(tCtx)
709+
tCtx.Helper()
710+
711+
g.Consistently(getNumMetric).WithTimeout(time.Second).Should(gomega.Equal(em))
712+
}
713+
599714
type expectedMetrics struct {
600715
numCreated int
601716
numFailures int
@@ -604,12 +719,12 @@ type expectedMetrics struct {
604719
func expectMetrics(t *testing.T, em expectedMetrics) {
605720
t.Helper()
606721

607-
actualCreated, err := testutil.GetCounterMetricValue(ephemeralvolumemetrics.ResourceClaimCreateAttempts)
722+
actualCreated, err := testutil.GetCounterMetricValue(metrics.ResourceClaimCreateAttempts)
608723
handleErr(t, err, "ResourceClaimCreate")
609724
if actualCreated != float64(em.numCreated) {
610725
t.Errorf("Expected claims to be created %d, got %v", em.numCreated, actualCreated)
611726
}
612-
actualConflicts, err := testutil.GetCounterMetricValue(ephemeralvolumemetrics.ResourceClaimCreateFailures)
727+
actualConflicts, err := testutil.GetCounterMetricValue(metrics.ResourceClaimCreateFailures)
613728
handleErr(t, err, "ResourceClaimCreate/Conflict")
614729
if actualConflicts != float64(em.numFailures) {
615730
t.Errorf("Expected claims to have conflicts %d, got %v", em.numFailures, actualConflicts)
@@ -623,7 +738,9 @@ func handleErr(t *testing.T, err error, metricName string) {
623738
}
624739

625740
func setupMetrics() {
626-
ephemeralvolumemetrics.RegisterMetrics()
627-
ephemeralvolumemetrics.ResourceClaimCreateAttempts.Reset()
628-
ephemeralvolumemetrics.ResourceClaimCreateFailures.Reset()
741+
metrics.RegisterMetrics()
742+
metrics.ResourceClaimCreateAttempts.Reset()
743+
metrics.ResourceClaimCreateFailures.Reset()
744+
metrics.NumResourceClaims.Set(0)
745+
metrics.NumAllocatedResourceClaims.Set(0)
629746
}

pkg/controller/resourceclaim/metrics/metrics.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,22 @@ var (
4545
Help: "Number of ResourceClaims creation request failures",
4646
StabilityLevel: metrics.ALPHA,
4747
})
48+
// NumResourceClaims tracks the current number of ResourceClaims.
49+
NumResourceClaims = metrics.NewGauge(
50+
&metrics.GaugeOpts{
51+
Subsystem: ResourceClaimSubsystem,
52+
Name: "resource_claims",
53+
Help: "Number of ResourceClaims",
54+
StabilityLevel: metrics.ALPHA,
55+
})
56+
// NumAllocatedResourceClaims tracks the current number of allocated ResourceClaims.
57+
NumAllocatedResourceClaims = metrics.NewGauge(
58+
&metrics.GaugeOpts{
59+
Subsystem: ResourceClaimSubsystem,
60+
Name: "allocated_resource_claims",
61+
Help: "Number of allocated ResourceClaims",
62+
StabilityLevel: metrics.ALPHA,
63+
})
4864
)
4965

5066
var registerMetrics sync.Once
@@ -54,5 +70,7 @@ func RegisterMetrics() {
5470
registerMetrics.Do(func() {
5571
legacyregistry.MustRegister(ResourceClaimCreateAttempts)
5672
legacyregistry.MustRegister(ResourceClaimCreateFailures)
73+
legacyregistry.MustRegister(NumResourceClaims)
74+
legacyregistry.MustRegister(NumAllocatedResourceClaims)
5775
})
5876
}

0 commit comments

Comments
 (0)