Skip to content

Commit b390ed7

Browse files
Locally store and check resourceVersions
The initial `Get` in a Reconcile, for the object to be reconciled, reads from the controller-runtime cache. If a resource is updated and then immediately re-queued, the object retrieved from the cache may not have that update. Now, when updating a ConfigurationPolicy or OperatorPolicy, the controller locally saves the resulting resourceVersion, and then checks against that value when beginning a reconcile. Previously, ConfigurationPolicy had something similar, but it was using the `status.lastEvaluated` timestamp. That only has seconds precision, and so it was not catching every situation where the cache might be stale - in particular, this caused a status history test to be flakey. Refs: - https://issues.redhat.com/browse/ACM-20803 Signed-off-by: Justin Kulikauskas <[email protected]>
1 parent d2a3dae commit b390ed7

File tree

4 files changed

+74
-13
lines changed

4 files changed

+74
-13
lines changed

Makefile

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,6 @@ LOCAL_BIN ?= $(PWD)/bin
1818
export PATH := $(LOCAL_BIN):$(PATH)
1919
GOARCH = $(shell go env GOARCH)
2020
GOOS = $(shell go env GOOS)
21-
TESTARGS_DEFAULT := -v
22-
TESTARGS ?= $(TESTARGS_DEFAULT)
2321
CONTROLLER_NAME = $(shell cat COMPONENT_NAME 2> /dev/null)
2422
# Handle KinD configuration
2523
MANAGED_CLUSTER_SUFFIX ?=

controllers/configurationpolicy_controller.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,7 @@ type ConfigurationPolicyReconciler struct {
193193
// The number of seconds before a policy is eligible for reevaluation in watch mode (throttles frequently evaluated
194194
// policies)
195195
EvalBackoffSeconds uint32
196-
// lastEvaluatedCache contains the value of ConfigurationPolicyStatus.LastEvaluated per ConfigurationPolicy UID.
196+
// lastEvaluatedCache contains the value of the last known ConfigurationPolicy resourceVersion per UID.
197197
// This is a workaround to account for race conditions where the status is updated but the controller-runtime cache
198198
// has not updated yet.
199199
lastEvaluatedCache sync.Map
@@ -404,12 +404,22 @@ func (r *ConfigurationPolicyReconciler) shouldEvaluatePolicy(
404404
}
405405

406406
if cachedLastEval, ok := r.lastEvaluatedCache.Load(policy.UID); ok {
407-
if cachedLastEval.(string) != policy.Status.LastEvaluated {
408-
log.V(1).Info(
409-
"The policy's status.lastEvaluated field is not synced in the controller-runtime cache. Will requeue.",
410-
)
407+
last, cachedConversionErr := strconv.Atoi(cachedLastEval.(string))
408+
current, realConversionErr := strconv.Atoi(policy.GetResourceVersion())
409+
410+
// Note: resourceVersion should be strictly monotonic *for a specific resource instance*,
411+
// but is not necessarily monotonic across different kinds and namespaces.
412+
// The comparison here is for a specific resource, and so it should be a valid check.
413+
if cachedConversionErr == nil && realConversionErr == nil {
414+
if last > current {
415+
log.V(1).Info("The policy from the controller-runtime cache is stale. Will requeue.")
411416

412-
return false, time.Second
417+
return false, time.Second
418+
}
419+
} else {
420+
log.Error(nil, "A resourceVersion could not be converted to an integer. Possibly evaluating regardless.",
421+
"cache", cachedLastEval, "cachedConversionErr", cachedConversionErr,
422+
"real", policy.GetResourceVersion(), "realConversionErr", realConversionErr)
413423
}
414424
}
415425

@@ -3696,7 +3706,7 @@ func (r *ConfigurationPolicyReconciler) addForUpdate(policy *policyv1.Configurat
36963706
policySystemErrorsCounter.WithLabelValues(parent, policy.GetName(), "status-update-failed").Add(1)
36973707

36983708
default:
3699-
r.lastEvaluatedCache.Store(policy.UID, policy.Status.LastEvaluated)
3709+
r.lastEvaluatedCache.Store(policy.UID, policy.GetResourceVersion())
37003710
}
37013711
}
37023712

controllers/configurationpolicy_controller_test.go

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1118,22 +1118,26 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11181118

11191119
// Add a 60 second buffer to avoid race conditions
11201120
inFuture := time.Now().UTC().Add(60 * time.Second).Format(time.RFC3339)
1121+
futureResourceVersion := "900000" // 900_000
11211122
lastEvaluatedInFuture := &sync.Map{}
1122-
lastEvaluatedInFuture.Store(policy.UID, inFuture)
1123+
lastEvaluatedInFuture.Store(policy.UID, futureResourceVersion)
11231124

11241125
lastEvaluatedInvalid := &sync.Map{}
11251126
lastEvaluatedInvalid.Store(policy.UID, "Do or do not. There is no try.")
11261127

11271128
twelveSecsAgo := time.Now().UTC().Add(-12 * time.Second).Format(time.RFC3339)
1129+
recentResourceVersion := "700000" // 700_000
11281130
lastEvaluatedTwelveSecsAgo := &sync.Map{}
1129-
lastEvaluatedTwelveSecsAgo.Store(policy.UID, twelveSecsAgo)
1131+
lastEvaluatedTwelveSecsAgo.Store(policy.UID, recentResourceVersion)
11301132

11311133
twelveHoursAgo := time.Now().UTC().Add(-12 * time.Hour).Format(time.RFC3339)
1134+
oldResourceVersion := "20000" // 20_000
11321135
lastEvaluatedTwelveHoursAgo := &sync.Map{}
1133-
lastEvaluatedTwelveHoursAgo.Store(policy.UID, twelveHoursAgo)
1136+
lastEvaluatedTwelveHoursAgo.Store(policy.UID, oldResourceVersion)
11341137

11351138
tests := []struct {
11361139
testDescription string
1140+
resourceVersion string
11371141
lastEvaluated string
11381142
lastEvaluatedGeneration int64
11391143
evaluationInterval policyv1.EvaluationInterval
@@ -1147,6 +1151,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11471151
}{
11481152
{
11491153
"Just evaluated and the generation is unchanged",
1154+
futureResourceVersion,
11501155
inFuture,
11511156
2,
11521157
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1160,6 +1165,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11601165
},
11611166
{
11621167
"The generation has changed",
1168+
futureResourceVersion,
11631169
inFuture,
11641170
1,
11651171
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1173,6 +1179,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11731179
},
11741180
{
11751181
"lastEvaluated not set",
1182+
futureResourceVersion,
11761183
"",
11771184
2,
11781185
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1186,6 +1193,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11861193
},
11871194
{
11881195
"Invalid lastEvaluated",
1196+
futureResourceVersion,
11891197
"Do or do not. There is no try.",
11901198
2,
11911199
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1199,6 +1207,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
11991207
},
12001208
{
12011209
"Unknown compliance state",
1210+
futureResourceVersion,
12021211
inFuture,
12031212
2,
12041213
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1212,6 +1221,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12121221
},
12131222
{
12141223
"Default evaluation interval with a past lastEvaluated when compliant",
1224+
futureResourceVersion,
12151225
twelveSecsAgo,
12161226
2,
12171227
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1225,6 +1235,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12251235
},
12261236
{
12271237
"Default evaluation interval with a past lastEvaluated when noncompliant",
1238+
futureResourceVersion,
12281239
twelveSecsAgo,
12291240
2,
12301241
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "10s"},
@@ -1238,6 +1249,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12381249
},
12391250
{
12401251
"Never evaluation interval with past lastEvaluated when compliant",
1252+
futureResourceVersion,
12411253
twelveHoursAgo,
12421254
2,
12431255
policyv1.EvaluationInterval{Compliant: "never"},
@@ -1251,6 +1263,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12511263
},
12521264
{
12531265
"Never evaluation interval with past lastEvaluated when noncompliant",
1266+
futureResourceVersion,
12541267
twelveHoursAgo,
12551268
2,
12561269
policyv1.EvaluationInterval{Compliant: "10s", NonCompliant: "never"},
@@ -1264,6 +1277,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12641277
},
12651278
{
12661279
"Unset evaluation interval with past lastEvaluated when compliant",
1280+
futureResourceVersion,
12671281
twelveHoursAgo,
12681282
2,
12691283
policyv1.EvaluationInterval{},
@@ -1277,6 +1291,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12771291
},
12781292
{
12791293
"Watch evaluation interval with past lastEvaluated when compliant",
1294+
futureResourceVersion,
12801295
twelveHoursAgo,
12811296
2,
12821297
policyv1.EvaluationInterval{Compliant: "watch", NonCompliant: "watch"},
@@ -1290,6 +1305,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
12901305
},
12911306
{
12921307
"Unset evaluation interval with past lastEvaluated when noncompliant",
1308+
futureResourceVersion,
12931309
twelveHoursAgo,
12941310
2,
12951311
policyv1.EvaluationInterval{},
@@ -1303,6 +1319,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13031319
},
13041320
{
13051321
"Watch evaluation interval with past lastEvaluated when noncompliant",
1322+
futureResourceVersion,
13061323
twelveHoursAgo,
13071324
2,
13081325
policyv1.EvaluationInterval{NonCompliant: "watch"},
@@ -1316,6 +1333,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13161333
},
13171334
{
13181335
"Invalid evaluation interval when compliant",
1336+
futureResourceVersion,
13191337
inFuture,
13201338
2,
13211339
policyv1.EvaluationInterval{Compliant: "Do or do not. There is no try."},
@@ -1329,6 +1347,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13291347
},
13301348
{
13311349
"Invalid evaluation interval when noncompliant",
1350+
futureResourceVersion,
13321351
inFuture,
13331352
2,
13341353
policyv1.EvaluationInterval{NonCompliant: "Do or do not. There is no try."},
@@ -1342,6 +1361,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13421361
},
13431362
{
13441363
"Custom evaluation interval that hasn't past yet when compliant",
1364+
futureResourceVersion,
13451365
twelveSecsAgo,
13461366
2,
13471367
policyv1.EvaluationInterval{Compliant: "12h"},
@@ -1355,6 +1375,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13551375
},
13561376
{
13571377
"Custom evaluation interval that hasn't past yet when noncompliant",
1378+
futureResourceVersion,
13581379
twelveSecsAgo,
13591380
2,
13601381
policyv1.EvaluationInterval{NonCompliant: "12h"},
@@ -1368,6 +1389,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13681389
},
13691390
{
13701391
"Deletion timestamp is non nil",
1392+
futureResourceVersion,
13711393
inFuture,
13721394
2,
13731395
policyv1.EvaluationInterval{NonCompliant: "12h"},
@@ -1381,6 +1403,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13811403
},
13821404
{
13831405
"Finalizer and the controller is being deleted",
1406+
futureResourceVersion,
13841407
inFuture,
13851408
2,
13861409
policyv1.EvaluationInterval{NonCompliant: "12h"},
@@ -1394,6 +1417,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
13941417
},
13951418
{
13961419
"No finalizer and the controller is being deleted",
1420+
futureResourceVersion,
13971421
inFuture,
13981422
2,
13991423
policyv1.EvaluationInterval{NonCompliant: "12h"},
@@ -1407,6 +1431,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
14071431
},
14081432
{
14091433
"controller-runtime cache not yet synced",
1434+
recentResourceVersion,
14101435
twelveHoursAgo,
14111436
2,
14121437
policyv1.EvaluationInterval{NonCompliant: "12h"},
@@ -1416,7 +1441,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
14161441
nil,
14171442
false,
14181443
[]string{},
1419-
lastEvaluatedTwelveSecsAgo,
1444+
lastEvaluatedInFuture,
14201445
},
14211446
}
14221447

@@ -1429,6 +1454,7 @@ func TestShouldEvaluatePolicy(t *testing.T) {
14291454
policyCopy := policy.DeepCopy()
14301455

14311456
policyCopy.Status.LastEvaluated = test.lastEvaluated
1457+
policyCopy.SetResourceVersion(test.resourceVersion)
14321458
policyCopy.Status.LastEvaluatedGeneration = test.lastEvaluatedGeneration
14331459
policyCopy.Spec.EvaluationInterval = test.evaluationInterval
14341460
policyCopy.Status.ComplianceState = test.complianceState

controllers/operatorpolicy_controller.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"slices"
1515
"strconv"
1616
"strings"
17+
"sync"
1718
"time"
1819

1920
operatorv1 "github.com/operator-framework/api/pkg/operators/v1"
@@ -119,6 +120,10 @@ type OperatorPolicyReconciler struct {
119120
HubDynamicWatcher depclient.DynamicWatcher
120121
HubClient *kubernetes.Clientset
121122
ClusterName string
123+
// lastEvaluatedCache contains the value of the last known OperatorPolicy resourceVersion per UID.
124+
// This is a workaround to account for race conditions where the status is updated but the controller-runtime cache
125+
// has not updated yet.
126+
lastEvaluatedCache sync.Map
122127
}
123128

124129
// SetupWithManager sets up the controller with the Manager and will reconcile when the dynamic watcher
@@ -248,6 +253,26 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque
248253
return reconcile.Result{}, err
249254
}
250255

256+
if cachedLastEval, ok := r.lastEvaluatedCache.Load(policy.UID); ok {
257+
last, cachedConversionErr := strconv.Atoi(cachedLastEval.(string))
258+
current, realConversionErr := strconv.Atoi(policy.GetResourceVersion())
259+
260+
// Note: resourceVersion should be strictly monotonic *for a specific resource instance*,
261+
// but is not necessarily monotonic across different kinds and namespaces.
262+
// The comparison here is for a specific resource, and so it should be a valid check.
263+
if cachedConversionErr == nil && realConversionErr == nil {
264+
if last > current {
265+
opLog.V(1).Info("The policy from the controller-runtime cache is stale. Will requeue.")
266+
267+
return reconcile.Result{RequeueAfter: time.Second}, nil
268+
}
269+
} else {
270+
opLog.Error(nil, "A resourceVersion could not be converted to an integer. Evaluating anyway.",
271+
"cache", cachedLastEval, "cachedConversionErr", cachedConversionErr,
272+
"real", policy.GetResourceVersion(), "realConversionErr", realConversionErr)
273+
}
274+
}
275+
251276
originalStatus := *policy.Status.DeepCopy()
252277

253278
// Start query batch for caching and watching related objects
@@ -308,6 +333,8 @@ func (r *OperatorPolicyReconciler) Reconcile(ctx context.Context, req ctrl.Reque
308333
if statusChanged || !reflect.DeepEqual(policy.Status, originalStatus) {
309334
if err := r.Status().Update(ctx, policy); err != nil {
310335
errs = append(errs, err)
336+
} else {
337+
r.lastEvaluatedCache.Store(policy.UID, policy.GetResourceVersion())
311338
}
312339
}
313340

0 commit comments

Comments
 (0)