Skip to content

Commit 2148a59

Browse files
sdowellgoogle-oss-prow[bot]
authored andcommitted
fix: force live get of ignore-mutation objects when apply fails (#1784)
The current design for ignore-mutation objects passes in the latest object from the cluster into the applier. A caching mechanism is used to minimize live Get requests to the cluster, and the cached object is updated by the remediator when it receives a watch event. There is currently no fallback reconciliation mechanism if the object is stale and the remediator failed to update the object cache (for example, if the remediator missed the watch event). This change adds a fallback mechanism by clearing the ignore-mutation object from the cache whenever the apply fails for such an object. Note that it is still possible for the object to become stale between the time that the object is fetched and the applier tries to apply the object, but this fallback mechanism should help guarantee eventual consistency when the apply fails due to a stale cache.
1 parent cac150f commit 2148a59

File tree

2 files changed

+105
-6
lines changed

2 files changed

+105
-6
lines changed

pkg/applier/applier.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ func (s *supervisor) UpdateStatusMode(ctx context.Context) error {
207207
})
208208
}
209209

210-
func (s *supervisor) processApplyEvent(ctx context.Context, e event.ApplyEvent, syncStats *stats.ApplyEventStats, objectStatusMap ObjectStatusMap, unknownTypeResources map[core.ID]struct{}, resourceMap map[core.ID]client.Object) status.Error {
210+
func (s *supervisor) processApplyEvent(ctx context.Context, e event.ApplyEvent, syncStats *stats.ApplyEventStats, objectStatusMap ObjectStatusMap, unknownTypeResources map[core.ID]struct{}, resourceMap map[core.ID]client.Object, declaredResources *declared.Resources) status.Error {
211211
id := idFrom(e.Identifier)
212212
syncStats.Add(e.Status)
213213

@@ -231,6 +231,15 @@ func (s *supervisor) processApplyEvent(ctx context.Context, e event.ApplyEvent,
231231
case event.ApplyFailed:
232232
objectStatus.Actuation = actuation.ActuationFailed
233233
handleMetrics(ctx, "update", e.Error)
234+
// If apply failed for an ignore-mutation object, delete it from the ignore cache.
235+
// Normally the cached object should be updated by the remediator when it
236+
// receives a watch event - This is a fallback to force a live lookup the
237+
// next time the applier runs.
238+
iObj, found := declaredResources.GetIgnored(id)
239+
if found {
240+
klog.Infof("Deleting object '%v' from the ignore cache (apply failed)", core.GKNN(iObj))
241+
declaredResources.DeleteIgnored(id)
242+
}
234243
switch e.Error.(type) {
235244
case *applyerror.UnknownTypeError:
236245
unknownTypeResources[id] = struct{}{}
@@ -604,7 +613,7 @@ func (s *supervisor) applyInner(ctx context.Context, eventHandler func(Event), d
604613
} else {
605614
klog.V(1).Info(e.ApplyEvent)
606615
}
607-
if err := s.processApplyEvent(ctx, e.ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap); err != nil {
616+
if err := s.processApplyEvent(ctx, e.ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap, declaredResources); err != nil {
608617
sendErrorEvent(err, eventHandler)
609618
}
610619
case event.PruneType:

pkg/applier/applier_test.go

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,7 @@ func TestApplyMutationIgnoredObjects(t *testing.T) {
420420
serverObjs []client.Object
421421
declaredObjs []client.Object
422422
cachedIgnoredObjs []client.Object
423+
applyEvents []event.Event
423424
expectedObjsToApply object.UnstructuredSet
424425
expectedItemsInIgnoreCache []client.Object
425426
}{
@@ -595,6 +596,54 @@ func TestApplyMutationIgnoredObjects(t *testing.T) {
595596
syncertest.IgnoreMutationAnnotation)),
596597
},
597598
},
599+
{
600+
name: "mutation-ignored object should be evicted from cache when apply fails",
601+
serverObjs: []client.Object{
602+
k8sobjects.NamespaceObject("test-ns",
603+
syncertest.ManagementEnabled,
604+
core.Label("foo", "baz")),
605+
},
606+
declaredObjs: []client.Object{
607+
k8sobjects.UnstructuredObject(kinds.Namespace(), core.Name("test-ns"),
608+
core.Label(metadata.ManagedByKey, metadata.ManagedByValue),
609+
core.Label(metadata.ApplySetPartOfLabel, applySetID),
610+
core.Label(metadata.DeclaredVersionLabel, "v1"),
611+
metadata.WithManagementMode(metadata.ManagementEnabled),
612+
core.Annotation(metadata.GitContextKey, gitContextOutput),
613+
core.Annotation(metadata.SyncTokenAnnotationKey, testGitCommit),
614+
core.Annotation(metadata.OwningInventoryKey, InventoryID(rootSyncName, configmanagement.ControllerNamespace)),
615+
difftest.ManagedBy(declared.RootScope, rootSyncName),
616+
core.Label("foo", "bar"),
617+
syncertest.ManagementEnabled,
618+
syncertest.IgnoreMutationAnnotation)},
619+
cachedIgnoredObjs: []client.Object{
620+
&queue.Deleted{
621+
Object: k8sobjects.UnstructuredObject(kinds.Namespace(),
622+
core.Name("test-ns"),
623+
syncertest.ManagementEnabled,
624+
syncertest.IgnoreMutationAnnotation)}},
625+
applyEvents: []event.Event{
626+
formApplyEvent(event.ApplyFailed,
627+
k8sobjects.UnstructuredObject(kinds.Namespace(), core.Name("test-ns")),
628+
fmt.Errorf("test error")),
629+
},
630+
// The object should be purged from the ignore cache due to ApplyFailed
631+
expectedItemsInIgnoreCache: nil,
632+
expectedObjsToApply: object.UnstructuredSet{
633+
asUnstructuredSanitizedObj(k8sobjects.UnstructuredObject(kinds.Namespace(), core.Name("test-ns"),
634+
core.Label(metadata.ManagedByKey, metadata.ManagedByValue),
635+
core.Label(metadata.ApplySetPartOfLabel, applySetID),
636+
core.Label(metadata.DeclaredVersionLabel, "v1"),
637+
metadata.WithManagementMode(metadata.ManagementEnabled),
638+
core.Annotation(metadata.GitContextKey, gitContextOutput),
639+
core.Annotation(metadata.SyncTokenAnnotationKey, testGitCommit),
640+
core.Annotation(metadata.OwningInventoryKey, InventoryID(rootSyncName, configmanagement.ControllerNamespace)),
641+
difftest.ManagedBy(declared.RootScope, rootSyncName),
642+
syncertest.ManagementEnabled,
643+
syncertest.IgnoreMutationAnnotation,
644+
core.Label("foo", "bar"))),
645+
},
646+
},
598647
}
599648

600649
for _, tc := range testcases {
@@ -608,7 +657,7 @@ func TestApplyMutationIgnoredObjects(t *testing.T) {
608657
syncScope := declared.Scope("test-namespace")
609658
syncName := "rs"
610659
fakeClient := testingfake.NewClient(t, core.Scheme, tc.serverObjs...)
611-
fakeKptApplier := newFakeKptApplier([]event.Event{})
660+
fakeKptApplier := newFakeKptApplier(tc.applyEvents)
612661
cs := &ClientSet{
613662
KptApplier: fakeKptApplier,
614663
Client: fakeClient,
@@ -915,6 +964,7 @@ func TestProcessApplyEvent(t *testing.T) {
915964
deploymentObj := newDeploymentObj()
916965
deploymentObjID := core.IDOf(deploymentObj)
917966
testObj1 := newTestObj("test-1")
967+
core.SetAnnotation(testObj1, metadata.LifecycleMutationAnnotation, metadata.IgnoreMutation)
918968
testObj1ID := core.IDOf(testObj1)
919969

920970
ctx := context.Background()
@@ -925,10 +975,15 @@ func TestProcessApplyEvent(t *testing.T) {
925975

926976
resourceMap := make(map[core.ID]client.Object)
927977
resourceMap[deploymentObjID] = deploymentObj
978+
resourceMap[testObj1ID] = testObj1
979+
980+
resources := &declared.Resources{}
981+
resources.UpdateIgnored(testObj1)
928982

929-
err := s.processApplyEvent(ctx, formApplyEvent(event.ApplyFailed, deploymentObj, fmt.Errorf("test error")).ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap)
983+
// process failed apply of deploymentObj
984+
err := s.processApplyEvent(ctx, formApplyEvent(event.ApplyFailed, deploymentObj, fmt.Errorf("test error")).ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap, resources)
930985
expectedError := ErrorForResourceWithResource(fmt.Errorf("test error"), deploymentObjID, deploymentObj)
931-
testutil.AssertEqual(t, expectedError, err, "expected processPruneEvent to error on apply %s", event.ApplyFailed)
986+
testutil.AssertEqual(t, expectedError, err, "expected processApplyEvent to error on apply %s", event.ApplyFailed)
932987

933988
expectedCSE := v1beta1.ConfigSyncError{
934989
Code: "2009",
@@ -945,8 +1000,10 @@ func TestProcessApplyEvent(t *testing.T) {
9451000
}},
9461001
}
9471002
testutil.AssertEqual(t, expectedCSE, err.ToCSE(), "expected CSEs to match")
1003+
testutil.AssertEqual(t, 1, len(resources.IgnoredObjects()))
9481004

949-
err = s.processApplyEvent(ctx, formApplyEvent(event.ApplySuccessful, testObj1, nil).ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap)
1005+
// process successful apply of testObj1
1006+
err = s.processApplyEvent(ctx, formApplyEvent(event.ApplySuccessful, testObj1, nil).ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap, resources)
9501007
assert.Nil(t, err, "expected processApplyEvent NOT to error on apply %s", event.ApplySuccessful)
9511008

9521009
expectedApplyStatus := stats.NewSyncStats()
@@ -965,6 +1022,39 @@ func TestProcessApplyEvent(t *testing.T) {
9651022
},
9661023
}
9671024
testutil.AssertEqual(t, expectedObjStatusMap, objStatusMap, "expected object status to match")
1025+
testutil.AssertEqual(t, 1, len(resources.IgnoredObjects()))
1026+
1027+
// process failed apply of testObj1 (ignore-mutation object)
1028+
err = s.processApplyEvent(ctx, formApplyEvent(event.ApplyFailed, testObj1, fmt.Errorf("test error")).ApplyEvent, syncStats.ApplyEvent, objStatusMap, unknownTypeResources, resourceMap, resources)
1029+
expectedError = ErrorForResourceWithResource(fmt.Errorf("test error"), testObj1ID, testObj1)
1030+
testutil.AssertEqual(t, expectedError, err, "expected processApplyEvent to error on apply %s", event.ApplyFailed)
1031+
1032+
expectedCSE = v1beta1.ConfigSyncError{
1033+
Code: "2009",
1034+
ErrorMessage: `KNV2009: failed to apply Test.configsync.test, test-namespace/test-1: test error
1035+
1036+
source: foo/test.yaml
1037+
namespace: test-namespace
1038+
metadata.name: test-1
1039+
group: configsync.test
1040+
version: v1
1041+
kind: Test
1042+
1043+
For more information, see https://g.co/cloud/acm-errors#knv2009`,
1044+
Resources: []v1beta1.ResourceRef{{
1045+
SourcePath: "foo/test.yaml",
1046+
Name: "test-1",
1047+
Namespace: "test-namespace",
1048+
GVK: metav1.GroupVersionKind{
1049+
Group: "configsync.test",
1050+
Version: "v1",
1051+
Kind: "Test",
1052+
},
1053+
}},
1054+
}
1055+
testutil.AssertEqual(t, expectedCSE, err.ToCSE(), "expected CSEs to match")
1056+
// The object should be removed from the IgnoredObjects cache when ApplyFailed
1057+
testutil.AssertEqual(t, 0, len(resources.IgnoredObjects()))
9681058

9691059
// TODO: test handleMetrics on success
9701060
// TODO: test unknownTypeResources on UnknownTypeError

0 commit comments

Comments
 (0)