Skip to content

Commit 941821b

Browse files
committed
Add snapshot controller metrics
Signed-off-by: Grant Griffiths <[email protected]>
1 parent 684a1d3 commit 941821b

File tree

7 files changed

+655
-282
lines changed

7 files changed

+655
-282
lines changed

cmd/snapshot-controller/main.go

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"os"
2424
"os/signal"
25+
"sync"
2526
"time"
2627

2728
"k8s.io/client-go/kubernetes"
@@ -32,6 +33,7 @@ import (
3233

3334
"github.com/kubernetes-csi/csi-lib-utils/leaderelection"
3435
controller "github.com/kubernetes-csi/external-snapshotter/v3/pkg/common-controller"
36+
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
3537

3638
clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned"
3739
snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme"
@@ -48,6 +50,9 @@ var (
4850

4951
leaderElection = flag.Bool("leader-election", false, "Enables leader election.")
5052
leaderElectionNamespace = flag.String("leader-election-namespace", "", "The namespace where the leader election resource exists. Defaults to the pod namespace if not set.")
53+
54+
httpEndpoint = flag.String("http-endpoint", "", "The TCP network address where the HTTP server for diagnostics, including metrics, will listen (example: :8080). The default is empty string, which means the server is disabled.")
55+
metricsPath = flag.String("metrics-path", "/metrics", "The HTTP path where prometheus metrics will be exposed. Default is `/metrics`.")
5156
)
5257

5358
var (
@@ -87,6 +92,28 @@ func main() {
8792
factory := informers.NewSharedInformerFactory(snapClient, *resyncPeriod)
8893
coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, *resyncPeriod)
8994

95+
// Create and register metrics manager
96+
metricsManager := metrics.NewMetricsManager()
97+
wg := &sync.WaitGroup{}
98+
wg.Add(1)
99+
if *httpEndpoint != "" {
100+
srv, err := metricsManager.StartMetricsEndpoint(*metricsPath, *httpEndpoint, promklog{}, wg)
101+
if err != nil {
102+
klog.Errorf("Failed to start metrics server: %s", err.Error())
103+
os.Exit(1)
104+
}
105+
defer func() {
106+
err := srv.Shutdown(context.Background())
107+
if err != nil {
108+
klog.Errorf("Failed to shutdown metrics server: %s", err.Error())
109+
}
110+
111+
klog.Infof("Metrics server successfully shutdown")
112+
wg.Done()
113+
}()
114+
klog.Infof("Metrics server successfully started on %s, %s", *httpEndpoint, *metricsPath)
115+
}
116+
90117
// Add Snapshot types to the default Kubernetes so events can be logged for them
91118
snapshotscheme.AddToScheme(scheme.Scheme)
92119

@@ -99,6 +126,7 @@ func main() {
99126
factory.Snapshot().V1beta1().VolumeSnapshotContents(),
100127
factory.Snapshot().V1beta1().VolumeSnapshotClasses(),
101128
coreFactory.Core().V1().PersistentVolumeClaims(),
129+
metricsManager,
102130
*resyncPeriod,
103131
)
104132

@@ -142,3 +170,9 @@ func buildConfig(kubeconfig string) (*rest.Config, error) {
142170
}
143171
return rest.InClusterConfig()
144172
}
173+
174+
type promklog struct{}
175+
176+
func (pl promklog) Println(v ...interface{}) {
177+
klog.Error(v...)
178+
}

pkg/common-controller/framework_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434
snapshotscheme "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned/scheme"
3535
informers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions"
3636
storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1"
37+
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
3738
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
3839
v1 "k8s.io/api/core/v1"
3940
"k8s.io/apimachinery/pkg/api/resource"
@@ -734,6 +735,10 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
734735
}
735736

736737
coreFactory := coreinformers.NewSharedInformerFactory(kubeClient, utils.NoResyncPeriodFunc())
738+
metricsManager := metrics.NewMetricsManager()
739+
wg := &sync.WaitGroup{}
740+
wg.Add(1)
741+
metricsManager.StartMetricsEndpoint("/metrics", "localhost:0", nil, wg)
737742

738743
ctrl := NewCSISnapshotCommonController(
739744
clientset,
@@ -742,6 +747,7 @@ func newTestController(kubeClient kubernetes.Interface, clientset clientset.Inte
742747
informerFactory.Snapshot().V1beta1().VolumeSnapshotContents(),
743748
informerFactory.Snapshot().V1beta1().VolumeSnapshotClasses(),
744749
coreFactory.Core().V1().PersistentVolumeClaims(),
750+
metricsManager,
745751
60*time.Second,
746752
)
747753

pkg/common-controller/snapshot_controller.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
klog "k8s.io/klog/v2"
3333

3434
crdv1 "github.com/kubernetes-csi/external-snapshotter/client/v3/apis/volumesnapshot/v1beta1"
35+
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
3536
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
3637
)
3738

@@ -235,6 +236,20 @@ func (ctrl *csiSnapshotCommonController) syncSnapshot(snapshot *crdv1.VolumeSnap
235236
// 2. Call checkandRemoveSnapshotFinalizersAndCheckandDeleteContent() with information obtained from step 1. This function name is very long but the name suggests what it does. It determines whether to remove finalizers on snapshot and whether to delete content.
236237
func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(snapshot *crdv1.VolumeSnapshot) error {
237238
klog.V(5).Infof("processSnapshotWithDeletionTimestamp VolumeSnapshot[%s]: %s", utils.SnapshotKey(snapshot), utils.GetSnapshotStatusForLogging(snapshot))
239+
driverName, err := ctrl.getSnapshotDriverName(snapshot)
240+
if err != nil {
241+
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %v", utils.SnapshotKey(snapshot), err)
242+
}
243+
244+
snapshotProvisionType := metrics.DynamicSnapshotType
245+
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
246+
snapshotProvisionType = metrics.PreProvisionedSnapshotType
247+
}
248+
249+
// Processing delete, start operation metric
250+
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID)
251+
deleteOperationValue := metrics.NewOperationValue(driverName, snapshotProvisionType)
252+
ctrl.metricsManager.OperationStart(deleteOperationKey, deleteOperationValue)
238253

239254
var contentName string
240255
if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil {
@@ -269,6 +284,7 @@ func (ctrl *csiSnapshotCommonController) processSnapshotWithDeletionTimestamp(sn
269284
}
270285

271286
klog.V(5).Infof("processSnapshotWithDeletionTimestamp[%s]: delete snapshot content and remove finalizer from snapshot if needed", utils.SnapshotKey(snapshot))
287+
272288
return ctrl.checkandRemoveSnapshotFinalizersAndCheckandDeleteContent(snapshot, content, deleteContent)
273289
}
274290

@@ -388,6 +404,7 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum
388404
// snapshot is bound but content is not pointing to the snapshot
389405
return ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotMisbound", "VolumeSnapshotContent is not bound to the VolumeSnapshot correctly")
390406
}
407+
391408
// everything is verified, return
392409
return nil
393410
}
@@ -396,20 +413,45 @@ func (ctrl *csiSnapshotCommonController) syncReadySnapshot(snapshot *crdv1.Volum
396413
func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.VolumeSnapshot) error {
397414
uniqueSnapshotName := utils.SnapshotKey(snapshot)
398415
klog.V(5).Infof("syncUnreadySnapshot %s", uniqueSnapshotName)
416+
driverName, err := ctrl.getSnapshotDriverName(snapshot)
417+
if err != nil {
418+
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err)
419+
}
420+
421+
snapshotProvisionType := metrics.DynamicSnapshotType
422+
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
423+
snapshotProvisionType = metrics.PreProvisionedSnapshotType
424+
}
425+
426+
// Start metrics operations
427+
if !utils.IsSnapshotCreated(snapshot) {
428+
// Only start CreateSnapshot operation if the snapshot has not been cut
429+
ctrl.metricsManager.OperationStart(
430+
metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID),
431+
metrics.NewOperationValue(driverName, snapshotProvisionType),
432+
)
433+
}
434+
ctrl.metricsManager.OperationStart(
435+
metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID),
436+
metrics.NewOperationValue(driverName, snapshotProvisionType),
437+
)
399438

400439
// Pre-provisioned snapshot
401440
if snapshot.Spec.Source.VolumeSnapshotContentName != nil {
402441
content, err := ctrl.getPreprovisionedContentFromStore(snapshot)
403442
if err != nil {
404443
return err
405444
}
445+
406446
// if no content found yet, update status and return
407447
if content == nil {
408448
// can not find the desired VolumeSnapshotContent from cache store
409449
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotContentMissing", "VolumeSnapshotContent is missing")
410450
klog.V(4).Infof("syncUnreadySnapshot[%s]: snapshot content %q requested but not found, will try again", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName)
451+
411452
return fmt.Errorf("snapshot %s requests an non-existing content %s", utils.SnapshotKey(snapshot), *snapshot.Spec.Source.VolumeSnapshotContentName)
412453
}
454+
413455
// Set VolumeSnapshotRef UID
414456
newContent, err := ctrl.checkandBindSnapshotContent(snapshot, content)
415457
if err != nil {
@@ -426,8 +468,10 @@ func (ctrl *csiSnapshotCommonController) syncUnreadySnapshot(snapshot *crdv1.Vol
426468
ctrl.updateSnapshotErrorStatusWithEvent(snapshot, v1.EventTypeWarning, "SnapshotStatusUpdateFailed", fmt.Sprintf("Snapshot status update failed, %v", err))
427469
return err
428470
}
471+
429472
return nil
430473
}
474+
431475
// snapshot.Spec.Source.VolumeSnapshotContentName == nil - dynamically creating snapshot
432476
klog.V(5).Infof("getDynamicallyProvisionedContentFromStore for snapshot %s", uniqueSnapshotName)
433477
contentObj, err := ctrl.getDynamicallyProvisionedContentFromStore(snapshot)
@@ -1094,10 +1138,30 @@ func (ctrl *csiSnapshotCommonController) updateSnapshotStatus(snapshot *crdv1.Vo
10941138
if updated {
10951139
snapshotClone := snapshotObj.DeepCopy()
10961140
snapshotClone.Status = newStatus
1141+
1142+
// We need to record metrics before updating the status due to a bug causing cache entries after a failed UpdateStatus call.
1143+
// Must meet the following criteria to emit a successful CreateSnapshot status
1144+
// 1. Previous status was nil OR Previous status had a nil CreationTime
1145+
// 2. New status must be non-nil with a non-nil CreationTime
1146+
driverName := content.Spec.Driver
1147+
createOperationKey := metrics.NewOperationKey(metrics.CreateSnapshotOperationName, snapshot.UID)
1148+
if !utils.IsSnapshotCreated(snapshotObj) && utils.IsSnapshotCreated(snapshotClone) {
1149+
ctrl.metricsManager.RecordMetrics(createOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
1150+
}
1151+
1152+
// Must meet the following criteria to emit a successful CreateSnapshotAndReady status
1153+
// 1. Previous status was nil OR Previous status had a nil ReadyToUse OR Previous status had a false ReadyToUse
1154+
// 2. New status must be non-nil with a ReadyToUse as true
1155+
if !utils.IsSnapshotReady(snapshotObj) && utils.IsSnapshotReady(snapshotClone) {
1156+
createAndReadyOperation := metrics.NewOperationKey(metrics.CreateSnapshotAndReadyOperationName, snapshot.UID)
1157+
ctrl.metricsManager.RecordMetrics(createAndReadyOperation, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
1158+
}
1159+
10971160
newSnapshotObj, err := ctrl.clientset.SnapshotV1beta1().VolumeSnapshots(snapshotClone.Namespace).UpdateStatus(context.TODO(), snapshotClone, metav1.UpdateOptions{})
10981161
if err != nil {
10991162
return nil, newControllerUpdateError(utils.SnapshotKey(snapshot), err.Error())
11001163
}
1164+
11011165
return newSnapshotObj, nil
11021166
}
11031167

@@ -1177,6 +1241,45 @@ func (ctrl *csiSnapshotCommonController) getSnapshotClass(className string) (*cr
11771241
return class, nil
11781242
}
11791243

1244+
// getSnapshotDriverName is a helper function to get snapshot driver from the VolumeSnapshot.
1245+
// We try to get the driverName in multiple ways, as snapshot controller metrics depend on the correct driverName.
1246+
func (ctrl *csiSnapshotCommonController) getSnapshotDriverName(vs *crdv1.VolumeSnapshot) (string, error) {
1247+
klog.V(5).Infof("getSnapshotDriverName: VolumeSnapshot[%s]", vs.Name)
1248+
var driverName string
1249+
1250+
// Pre-Provisioned snapshots have contentName as source
1251+
var contentName string
1252+
if vs.Spec.Source.VolumeSnapshotContentName != nil {
1253+
contentName = *vs.Spec.Source.VolumeSnapshotContentName
1254+
}
1255+
1256+
// Get Driver name from SnapshotContent if we found a contentName
1257+
if contentName != "" {
1258+
content, err := ctrl.contentLister.Get(contentName)
1259+
if err != nil {
1260+
klog.Errorf("getSnapshotDriverName: failed to get snapshotContent: %v", contentName)
1261+
} else {
1262+
driverName = content.Spec.Driver
1263+
}
1264+
1265+
if driverName != "" {
1266+
return driverName, nil
1267+
}
1268+
}
1269+
1270+
// Dynamic snapshots will have a snapshotclass with a driver
1271+
if vs.Spec.VolumeSnapshotClassName != nil {
1272+
class, err := ctrl.getSnapshotClass(*vs.Spec.VolumeSnapshotClassName)
1273+
if err != nil {
1274+
klog.Errorf("getSnapshotDriverName: failed to get snapshotClass: %v", *vs.Spec.VolumeSnapshotClassName)
1275+
} else {
1276+
driverName = class.Driver
1277+
}
1278+
}
1279+
1280+
return driverName, nil
1281+
}
1282+
11801283
// SetDefaultSnapshotClass is a helper function to figure out the default snapshot class.
11811284
// For pre-provisioned case, it's an no-op.
11821285
// For dynamic provisioning, it gets the default SnapshotClasses in the system if there is any(could be multiple),

pkg/common-controller/snapshot_controller_base.go

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
clientset "github.com/kubernetes-csi/external-snapshotter/client/v3/clientset/versioned"
2525
storageinformers "github.com/kubernetes-csi/external-snapshotter/client/v3/informers/externalversions/volumesnapshot/v1beta1"
2626
storagelisters "github.com/kubernetes-csi/external-snapshotter/client/v3/listers/volumesnapshot/v1beta1"
27+
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/metrics"
2728
"github.com/kubernetes-csi/external-snapshotter/v3/pkg/utils"
2829

2930
v1 "k8s.io/api/core/v1"
@@ -60,6 +61,8 @@ type csiSnapshotCommonController struct {
6061
snapshotStore cache.Store
6162
contentStore cache.Store
6263

64+
metricsManager metrics.MetricsManager
65+
6366
resyncPeriod time.Duration
6467
}
6568

@@ -71,6 +74,7 @@ func NewCSISnapshotCommonController(
7174
volumeSnapshotContentInformer storageinformers.VolumeSnapshotContentInformer,
7275
volumeSnapshotClassInformer storageinformers.VolumeSnapshotClassInformer,
7376
pvcInformer coreinformers.PersistentVolumeClaimInformer,
77+
metricsManager metrics.MetricsManager,
7478
resyncPeriod time.Duration,
7579
) *csiSnapshotCommonController {
7680
broadcaster := record.NewBroadcaster()
@@ -80,14 +84,15 @@ func NewCSISnapshotCommonController(
8084
eventRecorder = broadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: fmt.Sprintf("snapshot-controller")})
8185

8286
ctrl := &csiSnapshotCommonController{
83-
clientset: clientset,
84-
client: client,
85-
eventRecorder: eventRecorder,
86-
resyncPeriod: resyncPeriod,
87-
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
88-
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
89-
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
90-
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
87+
clientset: clientset,
88+
client: client,
89+
eventRecorder: eventRecorder,
90+
resyncPeriod: resyncPeriod,
91+
snapshotStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
92+
contentStore: cache.NewStore(cache.DeletionHandlingMetaNamespaceKeyFunc),
93+
snapshotQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-snapshot"),
94+
contentQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "snapshot-controller-content"),
95+
metricsManager: metricsManager,
9196
}
9297

9398
ctrl.pvcLister = pvcInformer.Lister()
@@ -363,6 +368,7 @@ func (ctrl *csiSnapshotCommonController) updateSnapshot(snapshot *crdv1.VolumeSn
363368
if !newSnapshot {
364369
return nil
365370
}
371+
366372
err = ctrl.syncSnapshot(snapshot)
367373
if err != nil {
368374
if errors.IsConflict(err) {
@@ -407,6 +413,13 @@ func (ctrl *csiSnapshotCommonController) updateContent(content *crdv1.VolumeSnap
407413
func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSnapshot) {
408414
_ = ctrl.snapshotStore.Delete(snapshot)
409415
klog.V(4).Infof("snapshot %q deleted", utils.SnapshotKey(snapshot))
416+
driverName, err := ctrl.getSnapshotDriverName(snapshot)
417+
if err != nil {
418+
klog.Errorf("failed to getSnapshotDriverName while recording metrics for snapshot %q: %s", utils.SnapshotKey(snapshot), err)
419+
} else {
420+
deleteOperationKey := metrics.NewOperationKey(metrics.DeleteSnapshotOperationName, snapshot.UID)
421+
ctrl.metricsManager.RecordMetrics(deleteOperationKey, metrics.NewSnapshotOperationStatus(metrics.SnapshotStatusTypeSuccess), driverName)
422+
}
410423

411424
snapshotContentName := ""
412425
if snapshot.Status != nil && snapshot.Status.BoundVolumeSnapshotContentName != nil {
@@ -416,6 +429,7 @@ func (ctrl *csiSnapshotCommonController) deleteSnapshot(snapshot *crdv1.VolumeSn
416429
klog.V(5).Infof("deleteSnapshot[%q]: content not bound", utils.SnapshotKey(snapshot))
417430
return
418431
}
432+
419433
// sync the content when its snapshot is deleted. Explicitly sync'ing the
420434
// content here in response to snapshot deletion prevents the content from
421435
// waiting until the next sync period for its Release.

0 commit comments

Comments
 (0)