Commit 1a510a1
e2e storage: avoid flaky test failure when watch dies
As discussed in kubernetes#93658, relying on a watch to deliver all events is not acceptable, not even for a test, because it can, and did (at least for OpenShift testing), lead to test flakes. The solution in kubernetes#93658 was to replace log flooding with a clear test failure, but that didn't solve the flakiness. A better solution is to use a RetryWatcher, which "under normal circumstances" (kubernetes#93777 (comment)) should always deliver all changes.
1 parent 97c5f1f
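The pattern the commit adopts boils down to: list once to pin a starting resource version, wrap the watch call in a cache.ListWatch, and hand both to watchtools.NewRetryWatcher, which re-opens the underlying watch from that version whenever it drops. A minimal standalone sketch of that construction (the newPVCRetryWatcher helper name and the bare client parameter are illustrative, not part of the commit):

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/client-go/kubernetes"
	cachetools "k8s.io/client-go/tools/cache"
	watchtools "k8s.io/client-go/tools/watch"
)

// newPVCRetryWatcher is a hypothetical helper mirroring the commit: list PVCs
// once to obtain a consistent starting resource version, then start a
// RetryWatcher that resumes from that version whenever the raw watch drops.
func newPVCRetryWatcher(ctx context.Context, cs kubernetes.Interface, namespace string) (*watchtools.RetryWatcher, error) {
	// The initial LIST pins the resource version the watch begins at.
	list, err := cs.CoreV1().PersistentVolumeClaims(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return nil, err
	}
	// RetryWatcher only needs WatchFunc; it sets the ResourceVersion in the
	// options itself when it re-opens the watch after a dropped connection.
	lw := &cachetools.ListWatch{
		WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
			return cs.CoreV1().PersistentVolumeClaims(namespace).Watch(ctx, options)
		},
	}
	return watchtools.NewRetryWatcher(list.ResourceVersion, lw)
}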

File tree: 2 files changed, +31 -8 lines

test/e2e/storage/BUILD

Lines changed: 2 additions & 0 deletions
@@ -66,6 +66,8 @@ go_library(
         "//staging/src/k8s.io/apiserver/pkg/authentication/serviceaccount:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes:go_default_library",
         "//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
+        "//staging/src/k8s.io/client-go/tools/cache:go_default_library",
+        "//staging/src/k8s.io/client-go/tools/watch:go_default_library",
         "//staging/src/k8s.io/cloud-provider/volume/helpers:go_default_library",
         "//staging/src/k8s.io/component-base/metrics/testutil:go_default_library",
         "//test/e2e/framework:go_default_library",

test/e2e/storage/csi_mock_volume.go

Lines changed: 29 additions & 8 deletions
@@ -39,6 +39,8 @@ import (
     "k8s.io/apimachinery/pkg/util/wait"
     "k8s.io/apimachinery/pkg/watch"
     clientset "k8s.io/client-go/kubernetes"
+    cachetools "k8s.io/client-go/tools/cache"
+    watchtools "k8s.io/client-go/tools/watch"
     "k8s.io/kubernetes/pkg/controller/volume/scheduling"
     "k8s.io/kubernetes/test/e2e/framework"
     e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -821,7 +823,20 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {

             ctx, cancel := context.WithTimeout(context.Background(), csiPodRunningTimeout)
             defer cancel()
-            pvcWatch, err := f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Watch(ctx, metav1.ListOptions{})
+
+            // In contrast to the raw watch, RetryWatcher is expected to deliver all events even
+            // when the underlying raw watch gets closed prematurely
+            // (https://github.com/kubernetes/kubernetes/pull/93777#discussion_r467932080).
+            // This is important because below the test is going to make assertions about the
+            // PVC state changes.
+            initResource, err := f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).List(ctx, metav1.ListOptions{})
+            framework.ExpectNoError(err, "Failed to fetch initial PVC resource")
+            listWatcher := &cachetools.ListWatch{
+                WatchFunc: func(listOptions metav1.ListOptions) (watch.Interface, error) {
+                    return f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Watch(ctx, listOptions)
+                },
+            }
+            pvcWatch, err := watchtools.NewRetryWatcher(initResource.GetResourceVersion(), listWatcher)
             framework.ExpectNoError(err, "create PVC watch")
             defer pvcWatch.Stop()
@@ -889,12 +904,14 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
             ginkgo.By("Checking PVC events")
             nodeAnnotationSet := false
             nodeAnnotationReset := false
+            watchFailed := false
         loop:
             for {
                 select {
                 case event, ok := <-pvcWatch.ResultChan():
                     if !ok {
-                        framework.Failf("PVC watch ended prematurely")
+                        watchFailed = true
+                        break loop
                     }

                     framework.Logf("PVC event %s: %#v", event.Type, event.Object)
@@ -913,10 +930,8 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
                     case watch.Deleted:
                         break loop
                     case watch.Error:
-                        // Can this occur when the apiserver is under heavy load?
-                        // If yes, then we should bail out of the test here early and
-                        // skip further checks instead of treating it as a test failure.
-                        framework.Failf("PVC watch failed prematurely: %v", event.Object)
+                        watchFailed = true
+                        break
                     }
                 case <-ctx.Done():
                     framework.Failf("Timeout while waiting to observe PVC list")
@@ -932,7 +947,13 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
                 }
             }

-            if test.lateBinding {
+            switch {
+            case watchFailed:
+                // If the watch failed or stopped prematurely (which can happen at any time), then we cannot
+                // verify whether the annotation was set as expected. This is still considered a successful
+                // test.
+                framework.Logf("PVC watch delivered incomplete data, cannot check annotation")
+            case test.lateBinding:
                 gomega.Expect(nodeAnnotationSet).To(gomega.BeTrue(), "selected-node should have been set")
                 // Whether it gets reset depends on whether we have topology enabled. Without
                 // it, rescheduling is unnecessary.
@@ -941,7 +962,7 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
             } else {
                 gomega.Expect(nodeAnnotationReset).To(gomega.BeFalse(), "selected-node should not have been reset")
             }
-            } else {
+            default:
                 gomega.Expect(nodeAnnotationSet).To(gomega.BeFalse(), "selected-node should not have been set")
                 gomega.Expect(nodeAnnotationReset).To(gomega.BeFalse(), "selected-node should not have been reset")
             }
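On the consumer side, the commit deliberately downgrades a dead or errored watch from framework.Failf to a watchFailed flag: even a RetryWatcher can stop under abnormal conditions, and in that case the test skips the annotation assertions rather than failing. Condensed into a standalone sketch (the consumePVCEvents helper name is illustrative, and the real loop additionally tracks the selected-node annotation on each event):

import (
	"context"

	"k8s.io/apimachinery/pkg/watch"
)

// consumePVCEvents drains events until the PVC is deleted, the watch dies,
// or the context expires. A dead watch is reported, not treated as failure.
func consumePVCEvents(ctx context.Context, pvcWatch watch.Interface) (watchFailed bool) {
loop:
	for {
		select {
		case event, ok := <-pvcWatch.ResultChan():
			if !ok {
				// Channel closed prematurely: record it and stop.
				watchFailed = true
				break loop
			}
			switch event.Type {
			case watch.Deleted:
				break loop
			case watch.Error:
				watchFailed = true
				break loop
			}
		case <-ctx.Done():
			break loop // the real test calls framework.Failf on timeout
		}
	}
	return watchFailed
}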
