Skip to content

Commit a50e4b8

Browse files
authored
Merge pull request kubernetes-csi#1107 from yati1998/metrics
Add group snapshot controller metrics
2 parents d505406 + 4a2f955 commit a50e4b8

File tree

4 files changed

+331
-3
lines changed

4 files changed

+331
-3
lines changed

pkg/common-controller/groupsnapshot_controller_helper.go

Lines changed: 95 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232

3333
crdv1alpha1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumegroupsnapshot/v1alpha1"
3434
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
35+
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/metrics"
3536
"github.com/kubernetes-csi/external-snapshotter/v8/pkg/utils"
3637
)
3738

@@ -263,6 +264,14 @@ func (ctrl *csiSnapshotCommonController) deleteGroupSnapshot(groupSnapshot *crdv
263264
_ = ctrl.snapshotStore.Delete(groupSnapshot)
264265
klog.V(4).Infof("group snapshot %q deleted", utils.GroupSnapshotKey(groupSnapshot))
265266

267+
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
268+
if err != nil {
269+
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
270+
} else {
271+
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
272+
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
273+
}
274+
266275
groupSnapshotContentName := ""
267276
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
268277
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
@@ -376,9 +385,28 @@ func (ctrl *csiSnapshotCommonController) getGroupSnapshotContentFromStore(conten
376385
func (ctrl *csiSnapshotCommonController) syncUnreadyGroupSnapshot(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
377386
uniqueGroupSnapshotName := utils.GroupSnapshotKey(groupSnapshot)
378387
klog.V(5).Infof("syncUnreadyGroupSnapshot %s", uniqueGroupSnapshotName)
379-
/*
380-
TODO: Add metrics
381-
*/
388+
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
389+
if err != nil {
390+
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for groupsnapshot %q: %s", utils.GroupSnapshotKey(groupSnapshot), err)
391+
}
392+
393+
groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
394+
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
395+
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
396+
}
397+
398+
// Start metrics operations for volumegroupsnapshot
399+
if !utils.IsGroupSnapshotCreated(groupSnapshot) {
400+
// Only start CreateGroupSnapshot operation if the groupsnapshot has not been cut
401+
ctrl.metricsManager.OperationStart(
402+
metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID),
403+
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
404+
)
405+
}
406+
ctrl.metricsManager.OperationStart(
407+
metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID),
408+
metrics.NewOperationValue(driverName, groupSnapshotProvisionType),
409+
)
382410

383411
// Pre-provisioned snapshot
384412
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
@@ -664,12 +692,20 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
664692
groupSnapshotClone := groupSnapshotObj.DeepCopy()
665693
groupSnapshotClone.Status = newStatus
666694

695+
// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
696+
// Must meet the following criteria to emit a successful CreateGroupSnapshot status
697+
// 1. Previous status was nil OR Previous status had a nil CreationTime
698+
// 2. New status must be non-nil with a non-nil CreationTime
699+
driverName := groupSnapshotContent.Spec.Driver
700+
createOperationKey := metrics.NewOperationKey(metrics.CreateGroupSnapshotOperationName, groupSnapshot.UID)
701+
667702
// Must meet the following criteria to emit a successful CreateGroupSnapshot status
668703
// 1. Previous status was nil OR Previous status had a nil CreationTime
669704
// 2. New status must be non-nil with a non-nil CreationTime
670705
if !utils.IsGroupSnapshotCreated(groupSnapshotObj) && utils.IsGroupSnapshotCreated(groupSnapshotClone) {
671706
msg := fmt.Sprintf("GroupSnapshot %s was successfully created by the CSI driver.", utils.GroupSnapshotKey(groupSnapshot))
672707
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotCreated", msg)
708+
ctrl.metricsManager.RecordVolumeGroupSnapshotMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
673709
}
674710

675711
// Must meet the following criteria to emit a successful CreateGroupSnapshotAndReady status
@@ -678,6 +714,8 @@ func (ctrl *csiSnapshotCommonController) updateGroupSnapshotStatus(groupSnapshot
678714
if !utils.IsGroupSnapshotReady(groupSnapshotObj) && utils.IsGroupSnapshotReady(groupSnapshotClone) {
679715
msg := fmt.Sprintf("GroupSnapshot %s is ready to use.", utils.GroupSnapshotKey(groupSnapshot))
680716
ctrl.eventRecorder.Event(groupSnapshot, v1.EventTypeNormal, "GroupSnapshotReady", msg)
717+
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateGroupSnapshotAndReadyOperationName, groupSnapshot.UID)
718+
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
681719
}
682720

683721
newGroupSnapshotObj, err := ctrl.clientset.GroupsnapshotV1alpha1().VolumeGroupSnapshots(groupSnapshotClone.Namespace).UpdateStatus(context.TODO(), groupSnapshotClone, metav1.UpdateOptions{})
@@ -1126,6 +1164,21 @@ func (ctrl *csiSnapshotCommonController) addGroupSnapshotFinalizer(groupSnapshot
11261164
func (ctrl *csiSnapshotCommonController) processGroupSnapshotWithDeletionTimestamp(groupSnapshot *crdv1alpha1.VolumeGroupSnapshot) error {
11271165
klog.V(5).Infof("processGroupSnapshotWithDeletionTimestamp VolumeGroupSnapshot[%s]: %s", utils.GroupSnapshotKey(groupSnapshot), utils.GetGroupSnapshotStatusForLogging(groupSnapshot))
11281166

1167+
driverName, err := ctrl.getGroupSnapshotDriverName(groupSnapshot)
1168+
if err != nil {
1169+
klog.Errorf("failed to getGroupSnapshotDriverName while recording metrics for group snapshot %q: %v", utils.GroupSnapshotKey(groupSnapshot), err)
1170+
}
1171+
1172+
groupSnapshotProvisionType := metrics.DynamicGroupSnapshotType
1173+
if groupSnapshot.Spec.Source.VolumeGroupSnapshotContentName != nil {
1174+
groupSnapshotProvisionType = metrics.PreProvisionedGroupSnapshotType
1175+
}
1176+
1177+
// Processing delete, start operation metric
1178+
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteGroupSnapshotOperationName, groupSnapshot.UID)
1179+
deleteOperationValue := metrics.NewOperationValue(driverName, groupSnapshotProvisionType)
1180+
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)
1181+
11291182
var groupSnapshotContentName string
11301183
if groupSnapshot.Status != nil && groupSnapshot.Status.BoundVolumeGroupSnapshotContentName != nil {
11311184
groupSnapshotContentName = *groupSnapshot.Status.BoundVolumeGroupSnapshotContentName
@@ -1297,3 +1350,42 @@ func (ctrl *csiSnapshotCommonController) removeGroupSnapshotFinalizer(groupSnaps
12971350
klog.V(5).Infof("Removed protection finalizer from volume group snapshot %s", utils.GroupSnapshotKey(groupSnapshot))
12981351
return nil
12991352
}
1353+
1354+
// getGroupSnapshotDriverName is a helper function to get driver from the VolumeGroupSnapshot.
1355+
// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName.
1356+
func (ctrl *csiSnapshotCommonController) getGroupSnapshotDriverName(vgs *crdv1alpha1.VolumeGroupSnapshot) (string, error) {
1357+
klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vgs.Name)
1358+
var driverName string
1359+
1360+
// Pre-Provisioned groupsnapshots have contentName as source
1361+
var contentName string
1362+
if vgs.Spec.Source.VolumeGroupSnapshotContentName != nil {
1363+
contentName = *vgs.Spec.Source.VolumeGroupSnapshotContentName
1364+
}
1365+
1366+
// Get Driver name from GroupSnapshotContent if we found a contentName
1367+
if contentName != "" {
1368+
content, err := ctrl.groupSnapshotContentLister.Get(contentName)
1369+
if err != nil {
1370+
klog.Errorf("getGroupSnapshotDriverName: failed to get groupSnapshotContent: %v", contentName)
1371+
} else {
1372+
driverName = content.Spec.Driver
1373+
}
1374+
1375+
if driverName != "" {
1376+
return driverName, nil
1377+
}
1378+
}
1379+
1380+
// Dynamic groupsnapshots will have a groupsnapshotclass with a driver
1381+
if vgs.Spec.VolumeGroupSnapshotClassName != nil {
1382+
class, err := ctrl.getSnapshotClass(*vgs.Spec.VolumeGroupSnapshotClassName)
1383+
if err != nil {
1384+
klog.Errorf("getGroupSnapshotDriverName: failed to get groupsnapshotClass: %v", *vgs.Spec.VolumeGroupSnapshotClassName)
1385+
} else {
1386+
driverName = class.Driver
1387+
}
1388+
}
1389+
1390+
return driverName, nil
1391+
}

pkg/metrics/metrics.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ type MetricsManager interface {
108108
// "Unknown" status of the passed-in operation is assumed.
109109
RecordMetrics(op OperationKey, status OperationStatus, driverName string)
110110

111+
// RecordVolumeGroupSnapshotMetrics records a metric for operations related to
112+
// VolumeGroupSnapshot
113+
RecordVolumeGroupSnapshotMetrics(op OperationKey, status OperationStatus, driverName string)
114+
111115
// GetRegistry() returns the metrics.KubeRegistry used by this metrics manager.
112116
GetRegistry() k8smetrics.KubeRegistry
113117
}

pkg/metrics/metrics_group.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
Copyright 2024 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package metrics
18+
19+
import (
20+
"time"
21+
)
22+
23+
const (
24+
// CreateGroupSnapshotOperationName is the operation that tracks how long the controller takes to create a groupsnapshot.
25+
// Specifically, the operation metric is emitted based on the following timestamps:
26+
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR to dynamically provision a groupsnapshot
27+
// - End_time: controller notices that the CR has a status with CreationTime field set to be non-nil
28+
CreateGroupSnapshotOperationName = "CreateGroupSnapshot"
29+
30+
// CreateGroupSnapshotAndReadyOperationName is the operation that tracks how long the controller takes to create a groupsnapshot and for it to be ready.
31+
// Specifically, the operation metric is emitted based on the following timestamps:
32+
// - Start_time: controller notices the first time that there is a new VolumeGroupSnapshot CR(both dynamic and pre-provisioned cases)
33+
// - End_time: controller notices that the CR has a status with Ready To Use field set to be true
34+
CreateGroupSnapshotAndReadyOperationName = "CreateGroupSnapshotAndReady"
35+
36+
// DeleteGroupSnapshotOperationName is the operation that tracks how long a groupsnapshot deletion takes.
37+
// Specifically, the operation metric is emitted based on the following timestamps:
38+
// - Start_time: controller notices the first time that there is a deletion timestamp placed on the VolumeGroupSnapshot CR and the CR is ready to be deleted.
39+
// Note that if the CR is being used by a PVC for rehydration, the controller should *NOT* set the start_time.
40+
// - End_time: controller removed all finalizers on the VolumeGroupSnapshot CR such that the CR is ready to be removed in the API server.
41+
DeleteGroupSnapshotOperationName = "DeleteGroupSnapshot"
42+
// DynamicGroupSnapshotType represents a groupsnapshot that is being dynamically provisioned
43+
DynamicGroupSnapshotType = snapshotProvisionType("dynamic")
44+
// PreProvisionedGroupSnapshotType represents a groupsnapshot that is pre-provisioned
45+
PreProvisionedGroupSnapshotType = snapshotProvisionType("pre-provisioned")
46+
)
47+
48+
// RecordVolumeGroupMetrics emits operation metrics
49+
func (opMgr *operationMetricsManager) RecordVolumeGroupSnapshotMetrics(opKey OperationKey, opStatus OperationStatus, driverName string) {
50+
opMgr.mu.Lock()
51+
defer opMgr.mu.Unlock()
52+
opVal, exists := opMgr.cache[opKey]
53+
if !exists {
54+
// the operation has not been cached, return directly
55+
return
56+
}
57+
status := string(SnapshotStatusTypeUnknown)
58+
if opStatus != nil {
59+
status = opStatus.String()
60+
}
61+
62+
// if we do not know the driverName while recording metrics,
63+
// refer to the cached version instead.
64+
if driverName == "" || driverName == unknownDriverName {
65+
driverName = opVal.Driver
66+
}
67+
68+
operationDuration := time.Since(opVal.startTime).Seconds()
69+
opMgr.opLatencyMetrics.WithLabelValues(driverName, opKey.Name, opVal.SnapshotType, status).Observe(operationDuration)
70+
71+
// Report cancel metrics if we are deleting an unfinished VolumeGroupSnapshot
72+
if opKey.Name == DeleteGroupSnapshotOperationName {
73+
// check if we have a CreateGroupSnapshot operation pending for this
74+
createKey := NewOperationKey(CreateGroupSnapshotOperationName, opKey.ResourceID)
75+
obj, exists := opMgr.cache[createKey]
76+
if exists {
77+
// record a cancel metric if found
78+
opMgr.recordCancelMetricLocked(obj, createKey, operationDuration)
79+
}
80+
81+
// check if we have a CreateGroupSnapshotAndReady operation pending for this
82+
createAndReadyKey := NewOperationKey(CreateGroupSnapshotAndReadyOperationName, opKey.ResourceID)
83+
obj, exists = opMgr.cache[createAndReadyKey]
84+
if exists {
85+
// record a cancel metric if found
86+
opMgr.recordCancelMetricLocked(obj, createAndReadyKey, operationDuration)
87+
}
88+
}
89+
90+
delete(opMgr.cache, opKey)
91+
opMgr.opInFlight.Set(float64(len(opMgr.cache)))
92+
}

0 commit comments

Comments
 (0)