Skip to content

Commit 3352c44

Browse files
authored
Merge pull request kubernetes#93777 from pohly/capacity-test-flake-fix
e2e storage: avoid flaky test failure when watch dies
2 parents 3f579d8 + 1a510a1 commit 3352c44

File tree

2 files changed

+31
-8
lines changed

2 files changed

+31
-8
lines changed

test/e2e/storage/BUILD

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ go_library(
6666
"//staging/src/k8s.io/apiserver/pkg/authentication/serviceaccount:go_default_library",
6767
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
6868
"//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library",
69+
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
70+
"//staging/src/k8s.io/client-go/tools/watch:go_default_library",
6971
"//staging/src/k8s.io/cloud-provider/volume/helpers:go_default_library",
7072
"//staging/src/k8s.io/component-base/metrics/testutil:go_default_library",
7173
"//test/e2e/framework:go_default_library",

test/e2e/storage/csi_mock_volume.go

Lines changed: 29 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ import (
3939
"k8s.io/apimachinery/pkg/util/wait"
4040
"k8s.io/apimachinery/pkg/watch"
4141
clientset "k8s.io/client-go/kubernetes"
42+
cachetools "k8s.io/client-go/tools/cache"
43+
watchtools "k8s.io/client-go/tools/watch"
4244
"k8s.io/kubernetes/pkg/controller/volume/scheduling"
4345
"k8s.io/kubernetes/test/e2e/framework"
4446
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
@@ -826,7 +828,20 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
826828

827829
ctx, cancel := context.WithTimeout(context.Background(), csiPodRunningTimeout)
828830
defer cancel()
829-
pvcWatch, err := f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Watch(ctx, metav1.ListOptions{})
831+
832+
// In contrast to the raw watch, RetryWatcher is expected to deliver all events even
833+
// when the underlying raw watch gets closed prematurely
834+
// (https://github.com/kubernetes/kubernetes/pull/93777#discussion_r467932080).
835+
// This is important because below the test is going to make assertions about the
836+
// PVC state changes.
837+
initResource, err := f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).List(ctx, metav1.ListOptions{})
838+
framework.ExpectNoError(err, "Failed to fetch initial PVC resource")
839+
listWatcher := &cachetools.ListWatch{
840+
WatchFunc: func(listOptions metav1.ListOptions) (watch.Interface, error) {
841+
return f.ClientSet.CoreV1().PersistentVolumeClaims(f.Namespace.Name).Watch(ctx, listOptions)
842+
},
843+
}
844+
pvcWatch, err := watchtools.NewRetryWatcher(initResource.GetResourceVersion(), listWatcher)
830845
framework.ExpectNoError(err, "create PVC watch")
831846
defer pvcWatch.Stop()
832847

@@ -894,12 +909,14 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
894909
ginkgo.By("Checking PVC events")
895910
nodeAnnotationSet := false
896911
nodeAnnotationReset := false
912+
watchFailed := false
897913
loop:
898914
for {
899915
select {
900916
case event, ok := <-pvcWatch.ResultChan():
901917
if !ok {
902-
framework.Failf("PVC watch ended prematurely")
918+
watchFailed = true
919+
break loop
903920
}
904921

905922
framework.Logf("PVC event %s: %#v", event.Type, event.Object)
@@ -918,10 +935,8 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
918935
case watch.Deleted:
919936
break loop
920937
case watch.Error:
921-
// Can this occur when the apiserver is under heavy load?
922-
// If yes, then we should bail out of the test here early and
923-
// skip further checks instead of treating it as a test failure.
924-
framework.Failf("PVC watch failed prematurely: %v", event.Object)
938+
watchFailed = true
939+
break
925940
}
926941
case <-ctx.Done():
927942
framework.Failf("Timeout while waiting to observe PVC list")
@@ -937,7 +952,13 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
937952
}
938953
}
939954

940-
if test.lateBinding {
955+
switch {
956+
case watchFailed:
957+
// If the watch failed or stopped prematurely (which can happen at any time), then we cannot
958+
// verify whether the annotation was set as expected. This is still considered a successful
959+
// test.
960+
framework.Logf("PVC watch delivered incomplete data, cannot check annotation")
961+
case test.lateBinding:
941962
gomega.Expect(nodeAnnotationSet).To(gomega.BeTrue(), "selected-node should have been set")
942963
// Whether it gets reset depends on whether we have topology enabled. Without
943964
// it, rescheduling is unnecessary.
@@ -946,7 +967,7 @@ var _ = utils.SIGDescribe("CSI mock volume", func() {
946967
} else {
947968
gomega.Expect(nodeAnnotationReset).To(gomega.BeFalse(), "selected-node should not have been reset")
948969
}
949-
} else {
970+
default:
950971
gomega.Expect(nodeAnnotationSet).To(gomega.BeFalse(), "selected-node should not have been set")
951972
gomega.Expect(nodeAnnotationReset).To(gomega.BeFalse(), "selected-node should not have been reset")
952973
}

0 commit comments

Comments
 (0)