Skip to content

Commit 986171d

Browse files
authored
Merge pull request kubernetes#119185 from xing-yang/metrics_attach
Add reason to force detach metric
2 parents 42e1e72 + cca6601 commit 986171d

File tree

3 files changed

+61
-12
lines changed

3 files changed

+61
-12
lines changed

pkg/controller/volume/attachdetach/metrics/metrics.go

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,14 @@ import (
3535

3636
const pluginNameNotAvailable = "N/A"
3737

38+
const (
39+
// ForceDetachReasonTimeout is the force detach reason recorded when the wait for unmount timed out.
40+
ForceDetachReasonTimeout = "timeout"
41+
// ForceDetachReasonOutOfService is the force detach reason recorded when the node has an out-of-service taint.
42+
ForceDetachReasonOutOfService = "out-of-service"
43+
attachDetachController = "attach_detach_controller"
44+
)
45+
3846
var (
3947
inUseVolumeMetricDesc = metrics.NewDesc(
4048
metrics.BuildFQName("", "storage_count", "attachable_volumes_in_use"),
@@ -48,12 +56,15 @@ var (
4856
[]string{"plugin_name", "state"}, nil,
4957
metrics.ALPHA, "")
5058

51-
forcedDetachMetricCounter = metrics.NewCounter(
59+
ForceDetachMetricCounter = metrics.NewCounterVec(
5260
&metrics.CounterOpts{
61+
Subsystem: attachDetachController,
5362
Name: "attachdetach_controller_forced_detaches",
5463
Help: "Number of times the A/D Controller performed a forced detach",
5564
StabilityLevel: metrics.ALPHA,
56-
})
65+
},
66+
[]string{"reason"},
67+
)
5768
)
5869
var registerMetrics sync.Once
5970

@@ -75,7 +86,7 @@ func Register(pvcLister corelisters.PersistentVolumeClaimLister,
7586
pluginMgr,
7687
csiMigratedPluginManager,
7788
intreeToCSITranslator))
78-
legacyregistry.MustRegister(forcedDetachMetricCounter)
89+
legacyregistry.MustRegister(ForceDetachMetricCounter)
7990
})
8091
}
8192

@@ -209,6 +220,6 @@ func (collector *attachDetachStateCollector) getTotalVolumesCount() volumeCount
209220
}
210221

211222
// RecordForcedDetachMetric records a forced detach metric.
212-
func RecordForcedDetachMetric() {
213-
forcedDetachMetricCounter.Inc()
223+
func RecordForcedDetachMetric(forceDetachReason string) {
224+
ForceDetachMetricCounter.WithLabelValues(forceDetachReason).Inc()
214225
}

pkg/controller/volume/attachdetach/reconciler/reconciler.go

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -269,14 +269,21 @@ func (rc *reconciler) reconcile(ctx context.Context) {
269269
verifySafeToDetach := !(timeout || hasOutOfServiceTaint)
270270
err = rc.attacherDetacher.DetachVolume(logger, attachedVolume.AttachedVolume, verifySafeToDetach, rc.actualStateOfWorld)
271271
if err == nil {
272-
if !timeout {
272+
if verifySafeToDetach { // normal detach
273273
logger.Info("attacherDetacher.DetachVolume started", "node", klog.KRef("", string(attachedVolume.NodeName)), "volumeName", attachedVolume.VolumeName)
274-
} else {
275-
metrics.RecordForcedDetachMetric()
276-
logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching",
277-
"duration", rc.maxWaitForUnmountDuration,
278-
"node", klog.KRef("", string(attachedVolume.NodeName)),
279-
"volumeName", attachedVolume.VolumeName)
274+
} else { // force detach
275+
if timeout {
276+
metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonTimeout)
277+
logger.Info("attacherDetacher.DetachVolume started: this volume is not safe to detach, but maxWaitForUnmountDuration expired, force detaching",
278+
"duration", rc.maxWaitForUnmountDuration,
279+
"node", klog.KRef("", string(attachedVolume.NodeName)),
280+
"volumeName", attachedVolume.VolumeName)
281+
} else {
282+
metrics.RecordForcedDetachMetric(metrics.ForceDetachReasonOutOfService)
283+
logger.Info("attacherDetacher.DetachVolume started: node has out-of-service taint, force detaching",
284+
"node", klog.KRef("", string(attachedVolume.NodeName)),
285+
"volumeName", attachedVolume.VolumeName)
286+
}
280287
}
281288
}
282289
if err != nil {

pkg/controller/volume/attachdetach/reconciler/reconciler_test.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package reconciler
1919
import (
2020
"context"
2121
"fmt"
22+
"sync"
2223
"testing"
2324
"time"
2425

@@ -30,10 +31,13 @@ import (
3031
"k8s.io/client-go/informers"
3132
"k8s.io/client-go/tools/record"
3233
featuregatetesting "k8s.io/component-base/featuregate/testing"
34+
"k8s.io/component-base/metrics/legacyregistry"
35+
metricstestutil "k8s.io/component-base/metrics/testutil"
3336
"k8s.io/klog/v2"
3437
"k8s.io/klog/v2/ktesting"
3538
"k8s.io/kubernetes/pkg/controller"
3639
"k8s.io/kubernetes/pkg/controller/volume/attachdetach/cache"
40+
"k8s.io/kubernetes/pkg/controller/volume/attachdetach/metrics"
3741
"k8s.io/kubernetes/pkg/controller/volume/attachdetach/statusupdater"
3842
controllervolumetesting "k8s.io/kubernetes/pkg/controller/volume/attachdetach/testing"
3943
"k8s.io/kubernetes/pkg/features"
@@ -51,6 +55,8 @@ const (
5155
volumeAttachedCheckTimeout = 5 * time.Second
5256
)
5357

58+
var registerMetrics sync.Once
59+
5460
// Calls Run()
5561
// Verifies there are no calls to attach or detach.
5662
func Test_Run_Positive_DoNothing(t *testing.T) {
@@ -221,6 +227,9 @@ func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithUnmountedVolume(t *te
221227
// Deletes the node/volume/pod tuple from desiredStateOfWorld cache without first marking the node/volume as unmounted.
222228
// Verifies there is one detach call and no (new) attach calls.
223229
func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithMountedVolume(t *testing.T) {
230+
registerMetrics.Do(func() {
231+
legacyregistry.MustRegister(metrics.ForceDetachMetricCounter)
232+
})
224233
// Arrange
225234
volumePluginMgr, fakePlugin := volumetesting.GetTestVolumePluginMgr(t)
226235
dsw := cache.NewDesiredStateOfWorld(volumePluginMgr)
@@ -287,6 +296,9 @@ func Test_Run_Positive_OneDesiredVolumeAttachThenDetachWithMountedVolume(t *test
287296
waitForAttachCallCount(t, 1 /* expectedAttachCallCount */, fakePlugin)
288297
verifyNewDetacherCallCount(t, false /* expectZeroNewDetacherCallCount */, fakePlugin)
289298
waitForDetachCallCount(t, 1 /* expectedDetachCallCount */, fakePlugin)
299+
300+
// Force detach metric due to timeout
301+
testForceDetachMetric(t, 1, metrics.ForceDetachReasonTimeout)
290302
}
291303

292304
// Populates desiredStateOfWorld cache with one node/volume/pod tuple.
@@ -852,6 +864,9 @@ func Test_Run_OneVolumeAttachAndDetachTimeoutNodesWithReadWriteOnce(t *testing.T
852864
// Verifies there is one detach call and no (new) attach calls.
853865
func Test_Run_OneVolumeDetachOnOutOfServiceTaintedNode(t *testing.T) {
854866
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.NodeOutOfServiceVolumeDetach, true)()
867+
registerMetrics.Do(func() {
868+
legacyregistry.MustRegister(metrics.ForceDetachMetricCounter)
869+
})
855870
// Arrange
856871
volumePluginMgr, fakePlugin := volumetesting.GetTestVolumePluginMgr(t)
857872
dsw := cache.NewDesiredStateOfWorld(volumePluginMgr)
@@ -920,6 +935,9 @@ func Test_Run_OneVolumeDetachOnOutOfServiceTaintedNode(t *testing.T) {
920935
waitForAttachCallCount(t, 1 /* expectedAttachCallCount */, fakePlugin)
921936
verifyNewDetacherCallCount(t, false /* expectZeroNewDetacherCallCount */, fakePlugin)
922937
waitForDetachCallCount(t, 1 /* expectedDetachCallCount */, fakePlugin)
938+
939+
// Force detach metric due to out-of-service taint
940+
testForceDetachMetric(t, 1, metrics.ForceDetachReasonOutOfService)
923941
}
924942

925943
// Populates desiredStateOfWorld cache with one node/volume/pod tuple.
@@ -1666,3 +1684,16 @@ func retryWithExponentialBackOff(initialDuration time.Duration, fn wait.Conditio
16661684
}
16671685
return wait.ExponentialBackoff(backoff, fn)
16681686
}
1687+
1688+
// testForceDetachMetric verifies that the force detach metric for the given reason has the expected value.
1689+
func testForceDetachMetric(t *testing.T, inputForceDetachMetricCounter int, reason string) {
1690+
t.Helper()
1691+
1692+
actualForceDetachMericCounter, err := metricstestutil.GetCounterMetricValue(metrics.ForceDetachMetricCounter.WithLabelValues(reason))
1693+
if err != nil {
1694+
t.Errorf("Error getting actualForceDetachMericCounter")
1695+
}
1696+
if actualForceDetachMericCounter != float64(inputForceDetachMetricCounter) {
1697+
t.Errorf("Expected desiredForceDetachMericCounter to be %d, got %v", inputForceDetachMetricCounter, actualForceDetachMericCounter)
1698+
}
1699+
}

0 commit comments

Comments
 (0)