Commit d4fdfaf

test/integration/garbagecollector: add test TestCascadingDeleteOnCRDConversionFailure, which tests that the GC controller cannot be blocked by a bad conversion webhook
Signed-off-by: haorenfsa <[email protected]>
Co-authored-by: Andrew Sy Kim <[email protected]>
1 parent e8b1d7d commit d4fdfaf

2 files changed (+219 −47 lines)

pkg/controller/garbagecollector/garbagecollector_test.go

Lines changed: 79 additions & 47 deletions
@@ -24,6 +24,7 @@ import (
 	"reflect"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 
@@ -49,6 +50,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/json"
 	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/strategicpatch"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/discovery"
 	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
@@ -60,9 +62,11 @@ import (
 	clientgotesting "k8s.io/client-go/testing"
 	"k8s.io/client-go/tools/record"
 	"k8s.io/client-go/util/workqueue"
+	metricsutil "k8s.io/component-base/metrics/testutil"
 	"k8s.io/controller-manager/pkg/informerfactory"
 	"k8s.io/kubernetes/pkg/api/legacyscheme"
 	c "k8s.io/kubernetes/pkg/controller"
+	"k8s.io/kubernetes/pkg/controller/garbagecollector/metrics"
 	"k8s.io/kubernetes/test/utils/ktesting"
 )
 
@@ -846,7 +850,6 @@ func TestGarbageCollectorSync(t *testing.T) {
 		PreferredResources: serverResources,
 		Error:              nil,
 		Lock:               sync.Mutex{},
-		InterfaceUsedCount: 0,
 	}
 
 	testHandler := &fakeActionHandler{
@@ -865,7 +868,24 @@ func TestGarbageCollectorSync(t *testing.T) {
 			},
 		},
 	}
-	srv, clientConfig := testServerAndClientConfig(testHandler.ServeHTTP)
+
+	testHandler2 := &fakeActionHandler{
+		response: map[string]FakeResponse{
+			"GET" + "/api/v1/secrets": {
+				200,
+				[]byte("{}"),
+			},
+		},
+	}
+	var secretSyncOK atomic.Bool
+	var alternativeTestHandler = func(response http.ResponseWriter, request *http.Request) {
+		if request.URL.Path == "/api/v1/secrets" && secretSyncOK.Load() {
+			testHandler2.ServeHTTP(response, request)
+			return
+		}
+		testHandler.ServeHTTP(response, request)
+	}
+	srv, clientConfig := testServerAndClientConfig(alternativeTestHandler)
 	defer srv.Close()
 	clientConfig.ContentConfig.NegotiatedSerializer = nil
 	client, err := kubernetes.NewForConfig(clientConfig)
@@ -885,7 +905,7 @@ func TestGarbageCollectorSync(t *testing.T) {
 
 	sharedInformers := informers.NewSharedInformerFactory(client, 0)
 
-	tCtx := ktesting.Init(t)
+	logger, tCtx := ktesting.NewTestContext(t)
 	defer tCtx.Cancel("test has completed")
 	alwaysStarted := make(chan struct{})
 	close(alwaysStarted)
@@ -913,30 +933,49 @@ func TestGarbageCollectorSync(t *testing.T) {
 
 	// Wait until the sync discovers the initial resources
 	time.Sleep(1 * time.Second)
+
+	err = expectSyncNotBlocked(fakeDiscoveryClient)
+	if err != nil {
+		t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
+	}
 	assertMonitors(t, gc, "pods", "deployments")
 
 	// Simulate the discovery client returning an error
 	fakeDiscoveryClient.setPreferredResources(nil, fmt.Errorf("error calling discoveryClient.ServerPreferredResources()"))
+
+	// Wait until sync discovers the change
 	time.Sleep(1 * time.Second)
+	// No monitor changes
 	assertMonitors(t, gc, "pods", "deployments")
 
	// Remove the error from being returned and see if the garbage collector sync is still working
 	fakeDiscoveryClient.setPreferredResources(serverResources, nil)
-	time.Sleep(1 * time.Second)
+
+	err = expectSyncNotBlocked(fakeDiscoveryClient)
+	if err != nil {
+		t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
+	}
 	assertMonitors(t, gc, "pods", "deployments")
 
 	// Simulate the discovery client returning a resource the restmapper can resolve, but will not sync caches
 	fakeDiscoveryClient.setPreferredResources(unsyncableServerResources, nil)
+
+	// Wait until sync discovers the change
 	time.Sleep(1 * time.Second)
 	assertMonitors(t, gc, "pods", "secrets")
 
 	// Put the resources back to normal and ensure garbage collector sync recovers
 	fakeDiscoveryClient.setPreferredResources(serverResources, nil)
-	time.Sleep(1 * time.Second)
+
+	err = expectSyncNotBlocked(fakeDiscoveryClient)
+	if err != nil {
+		t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
+	}
 	assertMonitors(t, gc, "pods", "deployments")
 
 	// Partial discovery failure
 	fakeDiscoveryClient.setPreferredResources(unsyncableServerResources, appsV1Error)
+	// Wait until sync discovers the change
 	time.Sleep(1 * time.Second)
 	// Deployments monitor kept
 	assertMonitors(t, gc, "pods", "deployments", "secrets")
@@ -945,35 +984,33 @@ func TestGarbageCollectorSync(t *testing.T) {
 	fakeDiscoveryClient.setPreferredResources(serverResources, nil)
 	// Wait until sync discovers the change
 	time.Sleep(1 * time.Second)
+	err = expectSyncNotBlocked(fakeDiscoveryClient)
+	if err != nil {
+		t.Fatalf("Expected garbagecollector.Sync to still be running but it is blocked: %v", err)
+	}
 	// Unsyncable monitor removed
 	assertMonitors(t, gc, "pods", "deployments")
 
-	// Add fake controller simulate the initial not-synced informer which will be synced at the end.
-	fc := fakeController{}
-	gc.dependencyGraphBuilder.monitors[schema.GroupVersionResource{
-		Version:  "v1",
-		Resource: "secrets",
-	}] = &monitor{controller: &fc}
-	if gc.IsSynced(logger) {
-		t.Fatal("cache from garbage collector should not be synced")
-	}
-
+	// Simulate initial not-synced informer which will be synced at the end.
+	metrics.GarbageCollectorResourcesSyncError.Reset()
 	fakeDiscoveryClient.setPreferredResources(unsyncableServerResources, nil)
 	time.Sleep(1 * time.Second)
 	assertMonitors(t, gc, "pods", "secrets")
+	if gc.IsSynced(logger) {
+		t.Fatal("cache from garbage collector should not be synced")
+	}
+	val, _ := metricsutil.GetCounterMetricValue(metrics.GarbageCollectorResourcesSyncError)
+	if val < 1 {
+		t.Fatalf("expect sync error metric > 0")
+	}
 
 	// The informer is synced now.
-	fc.SetSynced(true)
-	time.Sleep(1 * time.Second)
-	assertMonitors(t, gc, "pods", "secrets")
-
-	if !gc.IsSynced(logger) {
-		t.Fatal("cache from garbage collector should be synced")
+	secretSyncOK.Store(true)
+	if err := wait.PollUntilContextTimeout(tCtx, time.Second, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
+		return gc.IsSynced(logger), nil
+	}); err != nil {
+		t.Fatal(err)
 	}
-
-	fakeDiscoveryClient.setPreferredResources(serverResources, nil)
-	time.Sleep(1 * time.Second)
-	assertMonitors(t, gc, "pods", "deployments")
 }
 
 func assertMonitors(t *testing.T, gc *GarbageCollector, resources ...string) {
@@ -988,6 +1025,17 @@ func assertMonitors(t *testing.T, gc *GarbageCollector, resources ...string) {
 	}
 }
 
+func expectSyncNotBlocked(fakeDiscoveryClient *fakeServerResources) error {
+	before := fakeDiscoveryClient.getInterfaceUsedCount()
+	t := 1 * time.Second
+	time.Sleep(t)
+	after := fakeDiscoveryClient.getInterfaceUsedCount()
+	if before == after {
+		return fmt.Errorf("discoveryClient.ServerPreferredResources() not called over %v", t)
+	}
+	return nil
+}
+
 type fakeServerResources struct {
 	PreferredResources []*metav1.APIResourceList
 	Error              error
@@ -1017,6 +1065,12 @@ func (f *fakeServerResources) setPreferredResources(resources []*metav1.APIResou
 	f.Error = err
 }
 
+func (f *fakeServerResources) getInterfaceUsedCount() int {
+	f.Lock.Lock()
+	defer f.Lock.Unlock()
+	return f.InterfaceUsedCount
+}
+
 func (*fakeServerResources) ServerPreferredNamespacedResources() ([]*metav1.APIResourceList, error) {
 	return nil, nil
 }
@@ -2754,28 +2808,6 @@ func assertState(s state) step {
 
 }
 
-type fakeController struct {
-	synced bool
-	lock   sync.Mutex
-}
-
-func (f *fakeController) Run(stopCh <-chan struct{}) {
-}
-
-func (f *fakeController) HasSynced() bool {
-	return f.synced
-}
-
-func (f *fakeController) SetSynced(synced bool) {
-	f.lock.Lock()
-	defer f.lock.Unlock()
-	f.synced = synced
-}
-
-func (f *fakeController) LastSyncResourceVersion() string {
-	return ""
-}
-
 // trackingWorkqueue implements RateLimitingInterface,
 // allows introspection of the items in the queue,
 // and treats AddAfter and AddRateLimited the same as Add

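The reworked unit test gets its signal from two mechanisms: the atomic.Bool secretSyncOK flips the fake API server from failing to serving /api/v1/secrets, and the new expectSyncNotBlocked helper checks that the Sync loop is still alive by sampling a discovery-call counter before and after a one-second sleep. Below is a minimal, self-contained sketch of that counter-sampling liveness pattern; the worker goroutine and callCount are illustrative stand-ins (not code from this commit) for the GC Sync loop and fakeServerResources.InterfaceUsedCount.

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// expectNotBlocked treats the background loop as healthy as long as it
// increments the shared counter at least once between two samples.
func expectNotBlocked(callCount *atomic.Int64, window time.Duration) error {
	before := callCount.Load()
	time.Sleep(window)
	if callCount.Load() == before {
		return fmt.Errorf("no progress observed over %v", window)
	}
	return nil
}

func main() {
	var callCount atomic.Int64

	// This worker stands in for the GC Sync loop, which bumps
	// InterfaceUsedCount each time it calls ServerPreferredResources().
	go func() {
		for {
			callCount.Add(1)
			time.Sleep(100 * time.Millisecond)
		}
	}()

	if err := expectNotBlocked(&callCount, time.Second); err != nil {
		fmt.Println("blocked:", err)
		return
	}
	fmt.Println("sync loop is making progress")
}

The design choice here is to assert liveness rather than outcome: a blocked Sync goroutine produces identical before/after samples, regardless of which resource caused the blockage.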
test/integration/garbagecollector/garbage_collector_test.go

Lines changed: 140 additions & 0 deletions
@@ -81,6 +81,29 @@ const oneValidOwnerPodName = "test.pod.3"
 const toBeDeletedRCName = "test.rc.1"
 const remainingRCName = "test.rc.2"
 
+// testCert was generated from crypto/tls/generate_cert.go with the following command:
+//
+//	go run generate_cert.go --rsa-bits 2048 --host 127.0.0.1,::1,example.com --ca --start-date "Jan 1 00:00:00 1970" --duration=1000000h
+var testCert = []byte(`-----BEGIN CERTIFICATE-----
+MIIDGDCCAgCgAwIBAgIQTKCKn99d5HhQVCLln2Q+eTANBgkqhkiG9w0BAQsFADAS
+MRAwDgYDVQQKEwdBY21lIENvMCAXDTcwMDEwMTAwMDAwMFoYDzIwODQwMTI5MTYw
+MDAwWjASMRAwDgYDVQQKEwdBY21lIENvMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8A
+MIIBCgKCAQEA1Z5/aTwqY706M34tn60l8ZHkanWDl8mM1pYf4Q7qg3zA9XqWLX6S
+4rTYDYCb4stEasC72lQnbEWHbthiQE76zubP8WOFHdvGR3mjAvHWz4FxvLOTheZ+
+3iDUrl6Aj9UIsYqzmpBJAoY4+vGGf+xHvuukHrVcFqR9ZuBdZuJ/HbbjUyuNr3X9
+erNIr5Ha17gVzf17SNbYgNrX9gbCeEB8Z9Ox7dVuJhLDkpF0T/B5Zld3BjyUVY/T
+cukU4dTVp6isbWPvCMRCZCCOpb+qIhxEjJ0n6tnPt8nf9lvDl4SWMl6X1bH+2EFa
+a8R06G0QI+XhwPyjXUyCR8QEOZPCR5wyqQIDAQABo2gwZjAOBgNVHQ8BAf8EBAMC
+AqQwEwYDVR0lBAwwCgYIKwYBBQUHAwEwDwYDVR0TAQH/BAUwAwEB/zAuBgNVHREE
+JzAlggtleGFtcGxlLmNvbYcEfwAAAYcQAAAAAAAAAAAAAAAAAAAAATANBgkqhkiG
+9w0BAQsFAAOCAQEAThqgJ/AFqaANsOp48lojDZfZBFxJQ3A4zfR/MgggUoQ9cP3V
+rxuKAFWQjze1EZc7J9iO1WvH98lOGVNRY/t2VIrVoSsBiALP86Eew9WucP60tbv2
+8/zsBDSfEo9Wl+Q/gwdEh8dgciUKROvCm76EgAwPGicMAgRsxXgwXHhS5e8nnbIE
+Ewaqvb5dY++6kh0Oz+adtNT5OqOwXTIRI67WuEe6/B3Z4LNVPQDIj7ZUJGNw8e6L
+F4nkUthwlKx4yEJHZBRuFPnO7Z81jNKuwL276+mczRH7piI6z9uyMV/JbEsOIxyL
+W6CzB7pZ9Nj1YLpgzc1r6oONHLokMJJIz/IvkQ==
+-----END CERTIFICATE-----`)
+
 func newPod(podName, podNamespace string, ownerReferences []metav1.OwnerReference) *v1.Pod {
 	for i := 0; i < len(ownerReferences); i++ {
 		if len(ownerReferences[i].Kind) == 0 {
@@ -252,6 +275,7 @@ func setupWithServer(t *testing.T, result *kubeapiservertesting.TestServer, work
 	logger := tCtx.Logger()
 	alwaysStarted := make(chan struct{})
 	close(alwaysStarted)
+
 	gc, err := garbagecollector.NewGarbageCollector(
 		tCtx,
 		clientSet,
@@ -1285,3 +1309,119 @@ func testCRDDeletion(t *testing.T, ctx *testContext, ns *v1.Namespace, definitio
 		t.Fatalf("failed waiting for dependent %q (owned by %q) to be deleted", dependent.GetName(), owner.GetName())
 	}
 }
+
+// TestCascadingDeleteOnCRDConversionFailure tests that a bad conversion webhook cannot block the entire GC controller.
+// Historically, a cache sync failure from a single resource prevented the GC controller from running. This test creates
+// a CRD, updates the storage version with a bad conversion webhook, and then runs a simple cascading delete test.
+func TestCascadingDeleteOnCRDConversionFailure(t *testing.T) {
+	ctx := setup(t, 0)
+	defer ctx.tearDown()
+	gc, apiExtensionClient, dynamicClient, clientSet := ctx.gc, ctx.apiExtensionClient, ctx.dynamicClient, ctx.clientSet
+
+	ns := createNamespaceOrDie("gc-cache-sync-fail", clientSet, t)
+	defer deleteNamespaceOrDie(ns.Name, clientSet, t)
+
+	// Create a CRD with storage/serving version v1beta2. Then update the CRD with v1 as the storage version
+	// and an invalid conversion webhook. This should result in cache sync failures for the CRD from the GC controller.
+	def, dc := createRandomCustomResourceDefinition(t, apiExtensionClient, dynamicClient, ns.Name)
+	_, err := dc.Create(context.TODO(), newCRDInstance(def, ns.Name, names.SimpleNameGenerator.GenerateName("test")), metav1.CreateOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create custom resource: %v", err)
+	}
+
+	def, err = apiExtensionClient.ApiextensionsV1().CustomResourceDefinitions().Get(context.TODO(), def.Name, metav1.GetOptions{})
+	if err != nil {
+		t.Fatalf("Failed to get custom resource: %v", err)
+	}
+
+	newDefinition := def.DeepCopy()
+	newDefinition.Spec.Conversion = &apiextensionsv1.CustomResourceConversion{
+		Strategy: apiextensionsv1.WebhookConverter,
+		Webhook: &apiextensionsv1.WebhookConversion{
+			ClientConfig: &apiextensionsv1.WebhookClientConfig{
+				Service: &apiextensionsv1.ServiceReference{
+					Name:      "foobar",
+					Namespace: ns.Name,
+				},
+				CABundle: testCert,
+			},
+			ConversionReviewVersions: []string{
+				"v1", "v1beta1",
+			},
+		},
+	}
+	newDefinition.Spec.Versions = []apiextensionsv1.CustomResourceDefinitionVersion{
+		{
+			Name:    "v1",
+			Served:  true,
+			Storage: true,
+			Schema:  apiextensionstestserver.AllowAllSchema(),
+		},
+		{
+			Name:    "v1beta1",
+			Served:  true,
+			Storage: false,
+			Schema:  apiextensionstestserver.AllowAllSchema(),
+		},
+	}
+
+	_, err = apiExtensionClient.ApiextensionsV1().CustomResourceDefinitions().Update(context.TODO(), newDefinition, metav1.UpdateOptions{})
+	if err != nil {
+		t.Fatalf("Error updating CRD with conversion webhook: %v", err)
+	}
+
+	ctx.startGC(5)
+
+	rcClient := clientSet.CoreV1().ReplicationControllers(ns.Name)
+	podClient := clientSet.CoreV1().Pods(ns.Name)
+
+	toBeDeletedRC, err := rcClient.Create(context.TODO(), newOwnerRC(toBeDeletedRCName, ns.Name), metav1.CreateOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create replication controller: %v", err)
+	}
+
+	rcs, err := rcClient.List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("Failed to list replication controllers: %v", err)
+	}
+	if len(rcs.Items) != 1 {
+		t.Fatalf("Expect only 1 replication controller")
+	}
+
+	pod := newPod(garbageCollectedPodName, ns.Name, []metav1.OwnerReference{{UID: toBeDeletedRC.ObjectMeta.UID, Name: toBeDeletedRCName}})
+	_, err = podClient.Create(context.TODO(), pod, metav1.CreateOptions{})
+	if err != nil {
+		t.Fatalf("Failed to create Pod: %v", err)
+	}
+
+	pods, err := podClient.List(context.TODO(), metav1.ListOptions{})
+	if err != nil {
+		t.Fatalf("Failed to list pods: %v", err)
+	}
+	if len(pods.Items) != 1 {
+		t.Fatalf("Expect only 1 pod")
+	}
+
+	if err := rcClient.Delete(context.TODO(), toBeDeletedRCName, getNonOrphanOptions()); err != nil {
+		t.Fatalf("failed to delete replication controller: %v", err)
+	}
+
+	// Sometimes the deletion of the RC takes a long time to be observed by
+	// the GC, so wait for the garbage collector to observe the deletion of
+	// the toBeDeletedRC.
+	if err := wait.PollUntilContextTimeout(context.Background(), 1*time.Second, 60*time.Second, true, func(ctx context.Context) (bool, error) {
+		return !gc.GraphHasUID(toBeDeletedRC.ObjectMeta.UID), nil
+	}); err != nil {
+		t.Fatal(err)
+	}
+	if err := integration.WaitForPodToDisappear(podClient, garbageCollectedPodName, 1*time.Second, 30*time.Second); err != nil {
+		t.Fatalf("expect pod %s to be garbage collected, got err: %v", garbageCollectedPodName, err)
+	}
+
+	// Check that the cache is still not synced after the cascading delete succeeded.
+	// If this check fails, verify that the conversion webhook is correctly misconfigured
+	// to prevent the watch cache from listing the CRD.
+	if ctx.gc.IsSynced(ctx.logger) {
+		t.Fatal("cache is not expected to be synced due to bad conversion webhook")
+	}
+}

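Both files now wait for convergence with wait.PollUntilContextTimeout from k8s.io/apimachinery/pkg/util/wait instead of fixed sleeps, repeatedly evaluating a condition until it holds or the timeout expires. A self-contained usage sketch follows, assuming only that package; the ready flag is an illustrative stand-in for conditions like gc.IsSynced(logger) or !gc.GraphHasUID(uid) in the tests above.

package main

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"

	"k8s.io/apimachinery/pkg/util/wait"
)

func main() {
	var ready atomic.Bool

	// Something asynchronous flips the condition later, the way
	// secretSyncOK.Store(true) eventually lets gc.IsSynced return true.
	go func() {
		time.Sleep(2 * time.Second)
		ready.Store(true)
	}()

	// Poll every 500ms and give up after 10s; the `true` argument requests
	// an immediate first check instead of waiting one interval.
	err := wait.PollUntilContextTimeout(context.Background(), 500*time.Millisecond, 10*time.Second, true,
		func(ctx context.Context) (bool, error) {
			return ready.Load(), nil
		})
	if err != nil {
		fmt.Println("condition never became true:", err)
		return
	}
	fmt.Println("condition met")
}

Polling like this keeps the tests deterministic in intent (wait for the state, not for a guessed duration) while still bounding how long a broken condition can hang the suite.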