Skip to content

Commit a21f3f0

Browse files
committed
kubelet: add DRAOperationsDuration metric
1 parent c6669ea commit a21f3f0

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

pkg/kubelet/cm/dra/manager.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package dra
1919
import (
2020
"context"
2121
"fmt"
22+
"strconv"
2223
"time"
2324

2425
v1 "k8s.io/api/core/v1"
@@ -35,6 +36,7 @@ import (
3536
"k8s.io/kubernetes/pkg/kubelet/cm/dra/state"
3637
"k8s.io/kubernetes/pkg/kubelet/config"
3738
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
39+
"k8s.io/kubernetes/pkg/kubelet/metrics"
3840
)
3941

4042
// draManagerStateFileName is the file name where dra manager stores its state
@@ -150,6 +152,13 @@ func (m *ManagerImpl) reconcileLoop(ctx context.Context) {
150152
// for each new resource requirement, process their responses and update the cached
151153
// containerResources on success.
152154
func (m *ManagerImpl) PrepareResources(ctx context.Context, pod *v1.Pod) error {
155+
startTime := time.Now()
156+
err := m.prepareResources(ctx, pod)
157+
metrics.DRAOperationsDuration.WithLabelValues("PrepareResources", strconv.FormatBool(err == nil)).Observe(time.Since(startTime).Seconds())
158+
return err
159+
}
160+
161+
func (m *ManagerImpl) prepareResources(ctx context.Context, pod *v1.Pod) error {
153162
logger := klog.FromContext(ctx)
154163
batches := make(map[string][]*drapb.Claim)
155164
resourceClaims := make(map[types.UID]*resourceapi.ResourceClaim)
@@ -369,6 +378,10 @@ func (m *ManagerImpl) GetResources(pod *v1.Pod, container *v1.Container) (*Conta
369378
// As such, calls to the underlying NodeUnprepareResource API are skipped for claims that have
370379
// already been successfully unprepared.
371380
func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error {
381+
var err error = nil
382+
defer func(startTime time.Time) {
383+
metrics.DRAOperationsDuration.WithLabelValues("UnprepareResources", strconv.FormatBool(err != nil)).Observe(time.Since(startTime).Seconds())
384+
}(time.Now())
372385
var claimNames []string
373386
for i := range pod.Spec.ResourceClaims {
374387
claimName, _, err := resourceclaim.Name(pod, &pod.Spec.ResourceClaims[i])
@@ -383,7 +396,8 @@ func (m *ManagerImpl) UnprepareResources(ctx context.Context, pod *v1.Pod) error
383396
}
384397
claimNames = append(claimNames, *claimName)
385398
}
386-
return m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
399+
err = m.unprepareResources(ctx, pod.UID, pod.Namespace, claimNames)
400+
return err
387401
}
388402

389403
func (m *ManagerImpl) unprepareResources(ctx context.Context, podUID types.UID, namespace string, claimNames []string) error {

pkg/kubelet/metrics/metrics.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
const (
3333
FirstNetworkPodStartSLIDurationKey = "first_network_pod_start_sli_duration_seconds"
3434
KubeletSubsystem = "kubelet"
35+
DRASubsystem = "dra"
3536
NodeNameKey = "node_name"
3637
NodeLabelKey = "node"
3738
NodeStartupPreKubeletKey = "node_startup_pre_kubelet_duration_seconds"
@@ -132,6 +133,9 @@ const (
132133
ContainerAlignedComputeResourcesScopeLabelKey = "scope"
133134
ContainerAlignedComputeResourcesBoundaryLabelKey = "boundary"
134135

136+
// Metric keys for DRA operations
137+
DRAOperationsDurationKey = "operations_duration_seconds"
138+
135139
// Values used in metric labels
136140
Container = "container"
137141
InitContainer = "init_container"
@@ -938,6 +942,18 @@ var (
938942
StabilityLevel: metrics.ALPHA,
939943
},
940944
)
945+
946+
// DRAOperationsDuration tracks the duration of the DRA PrepareResources and UnprepareResources requests.
// It is a histogram vector with two labels: "operation_name" (the callers in
// the DRA manager pass "PrepareResources" or "UnprepareResources") and
// "is_error" (a boolean rendered as a string). It uses metrics.DefBuckets and
// is registered only when the DynamicResourceAllocation feature gate is enabled.
947+
DRAOperationsDuration = metrics.NewHistogramVec(
948+
&metrics.HistogramOpts{
949+
Subsystem: DRASubsystem,
950+
Name: DRAOperationsDurationKey,
951+
Help: "Latency histogram in seconds for the duration of handling all ResourceClaims referenced by a pod when the pod starts or stops. Identified by the name of the operation (PrepareResources or UnprepareResources) and separated by the success of the operation. The number of failed operations is provided through the histogram's overall count.",
952+
Buckets: metrics.DefBuckets,
953+
StabilityLevel: metrics.ALPHA,
954+
},
955+
[]string{"operation_name", "is_error"},
956+
)
941957
)
942958

943959
var registerMetrics sync.Once
@@ -1030,6 +1046,10 @@ func Register(collectors ...metrics.StableCollector) {
10301046
legacyregistry.MustRegister(LifecycleHandlerHTTPFallbacks)
10311047
legacyregistry.MustRegister(LifecycleHandlerSleepTerminated)
10321048
legacyregistry.MustRegister(CgroupVersion)
1049+
1050+
if utilfeature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation) {
1051+
legacyregistry.MustRegister(DRAOperationsDuration)
1052+
}
10331053
})
10341054
}
10351055

0 commit comments

Comments
 (0)