
Commit 11a7118

fix: remediator missing custom resource events
Prior to this change, the remediator watches for new custom resources were only started after the apply attempt had fully completed. This left a window after an object was applied during which the remediator could miss changes made by third parties. Normally that would be fine, because the remediator reverts any change once the watch is started. But if a DELETE event was missed, the object wouldn't be recreated until the next apply attempt.

This change adds a CRD Controller to the remediator that watches CRDs and executes any registered handlers when the CRD is established, unestablished, or deleted. The remediator now registers CRD handlers for each resource type it watches, starting watchers as soon as possible, without waiting for the next apply attempt.
1 parent 9e80e8d commit 11a7118

17 files changed: +426 -161 lines changed
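The diffs below wire a new CRDController into both the reconciler-manager and the reconciler, keyed by the custom resource's GroupKind, but the controller's own implementation is not part of this page. As a rough mental model only (not the actual Config Sync code), such a registry could look like the minimal sketch below: the SetReconciler name and handler signature mirror the calls visible in the diff, while the GroupKind-keyed map, the Reconcile dispatch method, and the nil-CRD-means-deleted convention are assumptions.

```go
// Hypothetical sketch of a CRD handler registry; not the real CRDController.
package crdcontroller

import (
	"context"
	"sync"

	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

// CRDReconcileFunc is called with the latest CRD state (assumed nil when deleted).
type CRDReconcileFunc func(ctx context.Context, crd *apiextensionsv1.CustomResourceDefinition) error

// CRDController fans CRD events out to per-GroupKind reconcilers.
type CRDController struct {
	mu          sync.Mutex
	reconcilers map[schema.GroupKind]CRDReconcileFunc
}

// SetReconciler registers (or replaces) the reconciler for one GroupKind.
func (c *CRDController) SetReconciler(gk schema.GroupKind, fn CRDReconcileFunc) {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.reconcilers == nil {
		c.reconcilers = make(map[schema.GroupKind]CRDReconcileFunc)
	}
	c.reconcilers[gk] = fn
}

// Reconcile dispatches a CRD change to the reconciler registered for its
// GroupKind, if any. A watch-driven meta controller (like the
// NewCRDMetaController registered with the manager below) would call this
// whenever a CRD is established, unestablished, or deleted.
func (c *CRDController) Reconcile(ctx context.Context, gk schema.GroupKind, crd *apiextensionsv1.CustomResourceDefinition) error {
	c.mu.Lock()
	fn := c.reconcilers[gk]
	c.mu.Unlock()
	if fn == nil {
		return nil
	}
	return fn(ctx, crd)
}
```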

cmd/reconciler-manager/main.go

Lines changed: 21 additions & 10 deletions
@@ -21,6 +21,7 @@ import (
 	"os"
 
 	"github.com/go-logr/logr"
+	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	"k8s.io/apimachinery/pkg/api/meta"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -33,6 +34,7 @@ import (
 	"kpt.dev/configsync/pkg/profiler"
 	"kpt.dev/configsync/pkg/reconcilermanager"
 	"kpt.dev/configsync/pkg/reconcilermanager/controllers"
+	"kpt.dev/configsync/pkg/util/customresource"
 	"kpt.dev/configsync/pkg/util/log"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -95,9 +97,10 @@ func main() {
 	}
 	watchFleetMembership := fleetMembershipCRDExists(dynamicClient, mgr.GetRESTMapper(), &setupLog)
 
-	crdController := controllers.NewCRDReconciler(
+	crdController := &controllers.CRDController{}
+	crdMetaController := controllers.NewCRDMetaController(crdController, mgr.GetCache(),
 		textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName("CRD"))
-	if err := crdController.Register(mgr); err != nil {
+	if err := crdMetaController.Register(mgr); err != nil {
 		setupLog.Error(err, "failed to register controller", "controller", "CRD")
 		os.Exit(1)
 	}
@@ -108,11 +111,15 @@ func main() {
 		mgr.GetClient(), watcher, dynamicClient,
 		textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName(configsync.RepoSyncKind),
 		mgr.GetScheme())
-	crdController.SetCRDHandler(configsync.RepoSyncCRDName, func() error {
-		if err := repoSyncController.Register(mgr, watchFleetMembership); err != nil {
-			return fmt.Errorf("registering %s controller: %w", configsync.RepoSyncKind, err)
+	crdController.SetReconciler(kinds.RepoSyncV1Beta1().GroupKind(), func(_ context.Context, crd *apiextensionsv1.CustomResourceDefinition) error {
+		if customresource.IsEstablished(crd) {
+			if err := repoSyncController.Register(mgr, watchFleetMembership); err != nil {
+				return fmt.Errorf("registering %s controller: %w", configsync.RepoSyncKind, err)
+			}
+			setupLog.Info("RepoSync controller registration successful")
 		}
-		setupLog.Info("RepoSync controller registration successful")
+		// Don't stop the RepoSync controller when its CRD is deleted,
+		// otherwise we may miss RepoSync object deletion events.
 		return nil
 	})
 	setupLog.Info("RepoSync controller registration scheduled")
@@ -122,11 +129,15 @@ func main() {
 		mgr.GetClient(), watcher, dynamicClient,
 		textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName(configsync.RootSyncKind),
 		mgr.GetScheme())
-	crdController.SetCRDHandler(configsync.RootSyncCRDName, func() error {
-		if err := rootSyncController.Register(mgr, watchFleetMembership); err != nil {
-			return fmt.Errorf("registering %s controller: %w", configsync.RootSyncKind, err)
+	crdController.SetReconciler(kinds.RootSyncV1Beta1().GroupKind(), func(_ context.Context, crd *apiextensionsv1.CustomResourceDefinition) error {
+		if customresource.IsEstablished(crd) {
+			if err := rootSyncController.Register(mgr, watchFleetMembership); err != nil {
+				return fmt.Errorf("registering %s controller: %w", configsync.RootSyncKind, err)
+			}
+			setupLog.Info("RootSync controller registration successful")
 		}
-		setupLog.Info("RootSync controller registration successful")
+		// Don't stop the RootSync controller when its CRD is deleted,
+		// otherwise we may miss RootSync object deletion events.
 		return nil
 	})
 	setupLog.Info("RootSync controller registration scheduled")

e2e/testcases/custom_resources_test.go

Lines changed: 10 additions & 7 deletions
@@ -98,13 +98,16 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1(t *testing.T) {
 		nt.T.Fatal(err)
 	}
 
-	// Resource Conflict errors from the remediator are not exposed as errors
-	// in the RootSync status. Instead, the error is recorded as a metric and
-	// logged as a warning. Then the object is refreshed from the server and
-	// re-enqueued for remediation.
+	// Resource conflict errors are recorded as status errors when the
+	// remediator watches are updated after an apply succeeds, but not when
+	// watches are updated before the apply attempt or from watch events handled
+	// by the remediator. So we don't expect to see a resource conflict error
+	// in the RootSync status until after the next apply attempt fails, which
+	// won't happen until the next automatic re-sync (1hr default).
 	//
-	// Validate that deleting the CRD of a managed CR causes at least of of the
-	// following errors:
+	// However, we do expect the remediator to get a deletion event for each
+	// Anvil object after the Anvil CRD is deleted. This can be surfaced as one
+	// of the following errors:
 	// - NoResourceMatchError
 	// - NoKindMatchError
 	// - ObjectNotFound
@@ -117,7 +120,7 @@ func TestCRDDeleteBeforeRemoveCustomResourceV1(t *testing.T) {
 	// TODO: distinguish between management conflict (unexpected manager annotation) and resource conflict (resource version change)
 	nt.Must(nomostest.ValidateMetrics(nt,
 		nomostest.ReconcilerErrorMetrics(nt, rootSyncLabels, firstCommitHash, metrics.ErrorSummary{
-			Conflicts: 1,
+			Conflicts: 1, // at least 1
 		})))
 
 	// Reset discovery client to invalidate the cached Anvil CRD

manifests/base/kustomization.yaml

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ resources:
 # Applying hierarchyconfig-crd.yaml allows client-side validation of the HierarchyConfig resources.
 - ../hierarchyconfig-crd.yaml
 - ../namespace-selector-crd.yaml
+- ../ns-reconciler-cluster-scope-cluster-role.yaml
 - ../ns-reconciler-base-cluster-role.yaml
 - ../root-reconciler-base-cluster-role.yaml
 - ../otel-agent-cm.yaml
manifests/ns-reconciler-cluster-scope-cluster-role.yaml

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This ClusterRole is used by both root-reconcilers and ns-reconcilers.
+# It includes read access for cluster-scope resources.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: configsync.gke.io:ns-reconciler:cluster-scope
+  labels:
+    configmanagement.gke.io/system: "true"
+    configmanagement.gke.io/arch: "csmr"
+rules:
+- apiGroups: ["apiextensions.k8s.io"]
+  resources: ["customresourcedefinitions"]
+  verbs: ["get","list","watch"]

manifests/root-reconciler-base-cluster-role.yaml

Lines changed: 3 additions & 0 deletions
@@ -32,3 +32,6 @@ rules:
 - apiGroups: ["kpt.dev"]
   resources: ["resourcegroups/status"]
   verbs: ["*"]
+- apiGroups: ["apiextensions.k8s.io"]
+  resources: ["customresourcedefinitions"]
+  verbs: ["get","list","watch"]

pkg/reconciler/reconciler.go

Lines changed: 9 additions & 1 deletion
@@ -37,6 +37,7 @@ import (
 	"kpt.dev/configsync/pkg/parse/events"
 	"kpt.dev/configsync/pkg/reconciler/finalizer"
 	"kpt.dev/configsync/pkg/reconciler/namespacecontroller"
+	"kpt.dev/configsync/pkg/reconcilermanager/controllers"
 	"kpt.dev/configsync/pkg/remediator"
 	"kpt.dev/configsync/pkg/remediator/conflict"
 	"kpt.dev/configsync/pkg/remediator/watch"
@@ -219,10 +220,11 @@ func Run(opts Options) {
 		klog.Fatalf("Error creating rest config for the remediator: %v", err)
 	}
 
+	crdController := &controllers.CRDController{}
 	conflictHandler := conflict.NewHandler()
 	fightHandler := fight.NewHandler()
 
-	rem, err := remediator.New(opts.ReconcilerScope, opts.SyncName, cfgForWatch, baseApplier, conflictHandler, fightHandler, decls, opts.NumWorkers)
+	rem, err := remediator.New(opts.ReconcilerScope, opts.SyncName, cfgForWatch, baseApplier, conflictHandler, fightHandler, crdController, decls, opts.NumWorkers)
 	if err != nil {
 		klog.Fatalf("Instantiating Remediator: %v", err)
 	}
@@ -331,6 +333,12 @@ func Run(opts Options) {
 		klog.Fatalf("Instantiating Controller Manager: %v", err)
 	}
 
+	crdMetaController := controllers.NewCRDMetaController(crdController, mgr.GetCache(),
+		textlogger.NewLogger(textlogger.NewConfig()).WithName("controllers").WithName("CRD"))
+	if err := crdMetaController.Register(mgr); err != nil {
+		klog.Fatalf("Instantiating CRD Controller: %v", err)
+	}
+
 	// This cancelFunc will be used by the Finalizer to stop all the other
 	// controllers (Parser & Remediator).
 	ctx, stopControllers := context.WithCancel(signalCtx)
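Per the commit message, the remediator now registers a CRD handler for each resource type it watches, so watches start as soon as the CRD is established instead of waiting for the next apply attempt. That registration is not visible in this diff; the following is a hypothetical sketch of the shape it could take, reusing the handler signature from the sketch above. The crdRegistry interface, isEstablished helper, and startWatch/stopWatch callbacks are placeholders, not the real remediator API.

```go
// Hypothetical sketch; the real remediator wiring is not shown on this page.
package remediatorsketch

import (
	"context"
	"log"

	apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

// crdRegistry is the minimal surface assumed from the CRD controller.
type crdRegistry interface {
	SetReconciler(gk schema.GroupKind, fn func(ctx context.Context, crd *apiextensionsv1.CustomResourceDefinition) error)
}

// isEstablished reports whether the CRD exists and has the Established condition.
func isEstablished(crd *apiextensionsv1.CustomResourceDefinition) bool {
	if crd == nil {
		return false
	}
	for _, cond := range crd.Status.Conditions {
		if cond.Type == apiextensionsv1.Established && cond.Status == apiextensionsv1.ConditionTrue {
			return true
		}
	}
	return false
}

// registerCRDHandlers wires one handler per declared GroupKind. startWatch and
// stopWatch are hypothetical callbacks supplied by the remediator.
func registerCRDHandlers(reg crdRegistry, declared []schema.GroupKind,
	startWatch, stopWatch func(gk schema.GroupKind) error) {
	for _, gk := range declared {
		gk := gk // capture loop variable
		reg.SetReconciler(gk, func(_ context.Context, crd *apiextensionsv1.CustomResourceDefinition) error {
			if isEstablished(crd) {
				log.Printf("CRD for %s established; starting watch", gk)
				return startWatch(gk)
			}
			// Unestablished or deleted: stop the watch (assumed behavior).
			log.Printf("CRD for %s not established; stopping watch", gk)
			return stopWatch(gk)
		})
	}
}
```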

pkg/reconcilermanager/controllers/build_names.go

Lines changed: 12 additions & 2 deletions
@@ -22,15 +22,25 @@ import (
 )
 
 const (
+	// RepoSyncClusterScopeClusterRoleName is the name of the ClusterRole with
+	// cluster-scoped read permissions for the namespace reconciler.
+	// e.g. configsync.gke.io:ns-reconciler:cluster-scope
+	RepoSyncClusterScopeClusterRoleName = configsync.GroupName + ":" + core.NsReconcilerPrefix + ":cluster-scope"
 	// RepoSyncBaseClusterRoleName is the namespace reconciler permissions name.
 	// e.g. configsync.gke.io:ns-reconciler
 	RepoSyncBaseClusterRoleName = configsync.GroupName + ":" + core.NsReconcilerPrefix
 	// RootSyncBaseClusterRoleName is the root reconciler base ClusterRole name.
 	// e.g. configsync.gke.io:root-reconciler
 	RootSyncBaseClusterRoleName = configsync.GroupName + ":" + core.RootReconcilerPrefix
+	// RepoSyncClusterScopeClusterRoleBindingName is the name of the default
+	// ClusterRoleBinding created for RepoSync objects. This contains basic
+	// cluster-scoped permissions for RepoSync reconcilers
+	// (e.g. CustomResourceDefinition watch).
+	RepoSyncClusterScopeClusterRoleBindingName = RepoSyncClusterScopeClusterRoleName
 	// RepoSyncBaseRoleBindingName is the name of the default RoleBinding created
-	// for RepoSync objects. This contains basic permissions for RepoSync reconcilers
-	//(e.g. RepoSync status update).
+	// for RepoSync objects. This contains basic namespace-scoped permissions
+	// for RepoSync reconcilers
+	// (e.g. RepoSync status update).
 	RepoSyncBaseRoleBindingName = RepoSyncBaseClusterRoleName
 	// RootSyncLegacyClusterRoleBindingName is the name of the legacy ClusterRoleBinding created
 	// for RootSync objects. It is always bound to cluster-admin.
