Skip to content

Commit cb9a7a0

Browse files
ryanzhang-ossRyan Zhang
andauthored
fix: handle resource snapshot missing but work already synced and add cro/ro annotation to all the works (#936)
Co-authored-by: Ryan Zhang <[email protected]>
1 parent 6b81bdb commit cb9a7a0

File tree

11 files changed

+398
-43
lines changed

11 files changed

+398
-43
lines changed

apis/cluster/v1beta1/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/placement/v1alpha1/zz_generated.deepcopy.go

Lines changed: 3 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/placement/v1beta1/commons.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,15 @@ const (
5050
// The format is {workPrefix}-configMap-uuid
5151
WorkNameWithConfigEnvelopeFmt = "%s-configmap-%s"
5252

53+
// ParentClusterResourceOverrideSnapshotHashAnnotation is the annotation to work that contains the hash of the parent cluster resource override snapshot list.
54+
ParentClusterResourceOverrideSnapshotHashAnnotation = fleetPrefix + "parent-cluster-resource-override-snapshot-hash"
55+
56+
// ParentResourceOverrideSnapshotHashAnnotation is the annotation to work that contains the hash of the parent resource override snapshot list.
57+
ParentResourceOverrideSnapshotHashAnnotation = fleetPrefix + "parent-resource-override-snapshot-hash"
58+
59+
// ParentResourceSnapshotNameAnnotation is the annotation applied to work that contains the name of the master resource snapshot that generates the work.
60+
ParentResourceSnapshotNameAnnotation = fleetPrefix + "parent-resource-snapshot-name"
61+
5362
// ParentResourceSnapshotIndexLabel is the label applied to work that contains the index of the resource snapshot that generates the work.
5463
ParentResourceSnapshotIndexLabel = fleetPrefix + "parent-resource-snapshot-index"
5564

apis/placement/v1beta1/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apis/v1alpha1/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/controllers/rollout/controller.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ func createUpdateInfo(binding *fleetv1beta1.ClusterResourceBinding, crp *fleetv1
281281
desiredBinding.Spec.ResourceSnapshotName = latestResourceSnapshot.Name
282282
// update the resource apply strategy when controller rolls out the new changes
283283
desiredBinding.Spec.ApplyStrategy = crp.Spec.Strategy.ApplyStrategy
284+
// TODO: check the size of the cro and ro to not exceed the limit
284285
desiredBinding.Spec.ClusterResourceOverrideSnapshots = cro
285286
desiredBinding.Spec.ResourceOverrideSnapshots = ro
286287
return toBeUpdatedBinding{
@@ -520,6 +521,7 @@ func calculateMaxToAdd(crp *fleetv1beta1.ClusterResourcePlacement, targetNumber
520521
upperBoundReadyNumber, "maxNumberOfBindingsToAdd", maxNumberToAdd)
521522
return maxNumberToAdd
522523
}
524+
523525
func (r *Reconciler) calculateRealTarget(crp *fleetv1beta1.ClusterResourcePlacement, schedulerTargetedBinds []*fleetv1beta1.ClusterResourceBinding) int {
524526
crpKObj := klog.KObj(crp)
525527
// calculate the target number of bindings

pkg/controllers/workgenerator/controller.go

Lines changed: 84 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -49,13 +49,14 @@ import (
4949
"go.goms.io/fleet/pkg/utils/controller"
5050
"go.goms.io/fleet/pkg/utils/informer"
5151
"go.goms.io/fleet/pkg/utils/labels"
52+
"go.goms.io/fleet/pkg/utils/resource"
5253
)
5354

5455
var (
5556
// maxFailedResourcePlacementLimit indicates the max number of failed resource placements to include in the status.
5657
maxFailedResourcePlacementLimit = 100
5758

58-
errResourceSnapshotNotFound = errors.New("the master resource snapshot is not found")
59+
errResourceSnapshotNotFound = fmt.Errorf("the master resource snapshot is not found")
5960
)
6061

6162
// Reconciler watches binding objects and generate work objects in the designated cluster namespace
@@ -135,18 +136,17 @@ func (r *Reconciler) Reconcile(ctx context.Context, req controllerruntime.Reques
135136

136137
workUpdated := false
137138
overrideSucceeded := false
138-
// Reset the conditions and failed placements.
139-
for i := condition.OverriddenCondition; i < condition.TotalCondition; i++ {
140-
resourceBinding.RemoveCondition(string(i.ResourceBindingConditionType()))
141-
}
142-
resourceBinding.Status.FailedPlacements = nil
143139
// list all the corresponding works
144140
works, syncErr := r.listAllWorksAssociated(ctx, &resourceBinding)
145141
if syncErr == nil {
146142
// generate and apply the workUpdated works if we have all the works
147143
overrideSucceeded, workUpdated, syncErr = r.syncAllWork(ctx, &resourceBinding, works, cluster)
148144
}
149-
145+
// Reset the conditions and failed placements.
146+
for i := condition.OverriddenCondition; i < condition.TotalCondition; i++ {
147+
resourceBinding.RemoveCondition(string(i.ResourceBindingConditionType()))
148+
}
149+
resourceBinding.Status.FailedPlacements = nil
150150
if overrideSucceeded {
151151
overrideReason := condition.OverriddenSucceededReason
152152
overrideMessage := "Successfully applied the override rules on the resources"
@@ -375,10 +375,28 @@ func (r *Reconciler) listAllWorksAssociated(ctx context.Context, resourceBinding
375375
func (r *Reconciler) syncAllWork(ctx context.Context, resourceBinding *fleetv1beta1.ClusterResourceBinding, existingWorks map[string]*fleetv1beta1.Work, cluster clusterv1beta1.MemberCluster) (bool, bool, error) {
376376
updateAny := atomic.NewBool(false)
377377
resourceBindingRef := klog.KObj(resourceBinding)
378+
// the hash256 function can can handle empty list https://go.dev/play/p/_4HW17fooXM
379+
resourceOverrideSnapshotHash, err := resource.HashOf(resourceBinding.Spec.ResourceOverrideSnapshots)
380+
if err != nil {
381+
return false, false, controller.NewUnexpectedBehaviorError(err)
382+
}
383+
clusterResourceOverrideSnapshotHash, err := resource.HashOf(resourceBinding.Spec.ClusterResourceOverrideSnapshots)
384+
if err != nil {
385+
return false, false, controller.NewUnexpectedBehaviorError(err)
386+
}
387+
// TODO: check all work synced first before fetching the snapshots after we put ParentResourceOverrideSnapshotHashAnnotation and ParentClusterResourceOverrideSnapshotHashAnnotation in all the work objects
378388

379389
// Gather all the resource resourceSnapshots
380390
resourceSnapshots, err := r.fetchAllResourceSnapshots(ctx, resourceBinding)
381391
if err != nil {
392+
if errors.Is(err, errResourceSnapshotNotFound) {
393+
// the resourceIndex is deleted but the works might still be up to date with the binding.
394+
if areAllWorkSynced(existingWorks, resourceBinding, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash) {
395+
klog.V(2).InfoS("All the works are synced with the resourceBinding even if the resource snapshot index is removed", "resourceBinding", resourceBindingRef)
396+
return true, false, nil
397+
}
398+
return false, false, controller.NewUserError(err)
399+
}
382400
// TODO(RZ): handle errResourceNotFullyCreated error so we don't need to wait for all the snapshots to be created
383401
return false, false, err
384402
}
@@ -422,7 +440,7 @@ func (r *Reconciler) syncAllWork(ctx context.Context, resourceBinding *fleetv1be
422440
if uResource.GetObjectKind().GroupVersionKind() == utils.ConfigMapGVK &&
423441
len(uResource.GetAnnotations()[fleetv1beta1.EnvelopeConfigMapAnnotation]) != 0 {
424442
// get a work object for the enveloped configMap
425-
work, err := r.getConfigMapEnvelopWorkObj(ctx, workNamePrefix, resourceBinding, snapshot, &uResource)
443+
work, err := r.getConfigMapEnvelopWorkObj(ctx, workNamePrefix, resourceBinding, snapshot, &uResource, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash)
426444
if err != nil {
427445
return true, false, err
428446
}
@@ -438,7 +456,7 @@ func (r *Reconciler) syncAllWork(ctx context.Context, resourceBinding *fleetv1be
438456
// generate a work object for the manifests even if there is nothing to place
439457
// to allow CRP to collect the status of the placement
440458
// TODO (RZ): revisit to see if we need this hack
441-
work := generateSnapshotWorkObj(workNamePrefix, resourceBinding, snapshot, simpleManifests)
459+
work := generateSnapshotWorkObj(workNamePrefix, resourceBinding, snapshot, simpleManifests, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash)
442460
activeWork[work.Name] = work
443461
newWork = append(newWork, work)
444462

@@ -485,6 +503,32 @@ func (r *Reconciler) syncAllWork(ctx context.Context, resourceBinding *fleetv1be
485503
return true, updateAny.Load(), nil
486504
}
487505

506+
// areAllWorkSynced checks if all the works are synced with the resource binding.
507+
func areAllWorkSynced(existingWorks map[string]*fleetv1beta1.Work, resourceBinding *fleetv1beta1.ClusterResourceBinding, _, _ string) bool {
508+
syncedCondition := resourceBinding.GetCondition(string(fleetv1beta1.ResourceBindingWorkSynchronized))
509+
if !condition.IsConditionStatusTrue(syncedCondition, resourceBinding.Generation) {
510+
// The binding has to be synced first before we can check the works
511+
return false
512+
}
513+
// TODO: check resourceOverrideSnapshotHash and clusterResourceOverrideSnapshotHash after all the work has the ParentResourceOverrideSnapshotHashAnnotation and ParentClusterResourceOverrideSnapshotHashAnnotation
514+
resourceSnapshotName := resourceBinding.Spec.ResourceSnapshotName
515+
for _, work := range existingWorks {
516+
recordedName, exist := work.Annotations[fleetv1beta1.ParentResourceSnapshotNameAnnotation]
517+
if !exist {
518+
// TODO: remove this block after all the work has the ParentResourceSnapshotNameAnnotation
519+
// the parent resource snapshot name is not recorded in the work, we need to construct it from the labels
520+
crpName := resourceBinding.Labels[fleetv1beta1.CRPTrackingLabel]
521+
index, _ := labels.ExtractResourceSnapshotIndexFromWork(work)
522+
recordedName = fmt.Sprintf(fleetv1beta1.ResourceSnapshotNameFmt, crpName, index)
523+
}
524+
if recordedName != resourceSnapshotName {
525+
klog.V(2).InfoS("The work is not synced with the resourceBinding", "work", klog.KObj(work), "resourceBinding", klog.KObj(resourceBinding), "annotationExist", exist, "recordedName", recordedName, "resourceSnapshotName", resourceSnapshotName)
526+
return false
527+
}
528+
}
529+
return true
530+
}
531+
488532
// fetchAllResourceSnapshots gathers all the resource snapshots for the resource binding.
489533
func (r *Reconciler) fetchAllResourceSnapshots(ctx context.Context, resourceBinding *fleetv1beta1.ClusterResourceBinding) (map[string]*fleetv1beta1.ClusterResourceSnapshot, error) {
490534
// fetch the master snapshot first
@@ -504,7 +548,7 @@ func (r *Reconciler) fetchAllResourceSnapshots(ctx context.Context, resourceBind
504548
// getConfigMapEnvelopWorkObj first try to locate a work object for the corresponding envelopObj of type configMap.
505549
// we create a new one if the work object doesn't exist. We do this to avoid repeatedly delete and create the same work object.
506550
func (r *Reconciler) getConfigMapEnvelopWorkObj(ctx context.Context, workNamePrefix string, resourceBinding *fleetv1beta1.ClusterResourceBinding,
507-
resourceSnapshot *fleetv1beta1.ClusterResourceSnapshot, envelopeObj *unstructured.Unstructured) (*fleetv1beta1.Work, error) {
551+
resourceSnapshot *fleetv1beta1.ClusterResourceSnapshot, envelopeObj *unstructured.Unstructured, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string) (*fleetv1beta1.Work, error) {
508552
// we group all the resources in one configMap to one work
509553
manifest, err := extractResFromConfigMap(envelopeObj)
510554
if err != nil {
@@ -514,6 +558,7 @@ func (r *Reconciler) getConfigMapEnvelopWorkObj(ctx context.Context, workNamePre
514558
}
515559
klog.V(2).InfoS("Successfully extract the enveloped resources from the configMap", "numOfResources", len(manifest),
516560
"snapshot", klog.KObj(resourceSnapshot), "resourceBinding", klog.KObj(resourceBinding), "configMapWrapper", klog.KObj(envelopeObj))
561+
517562
// Try to see if we already have a work represent the same enveloped object for this CRP in the same cluster
518563
// The ParentResourceSnapshotIndexLabel can change between snapshots so we have to exclude that label in the match
519564
envelopWorkLabelMatcher := client.MatchingLabels{
@@ -544,6 +589,11 @@ func (r *Reconciler) getConfigMapEnvelopWorkObj(ctx context.Context, workNamePre
544589
fleetv1beta1.EnvelopeNameLabel: envelopeObj.GetName(),
545590
fleetv1beta1.EnvelopeNamespaceLabel: envelopeObj.GetNamespace(),
546591
},
592+
Annotations: map[string]string{
593+
fleetv1beta1.ParentResourceSnapshotNameAnnotation: resourceBinding.Spec.ResourceSnapshotName,
594+
fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation: resourceOverrideSnapshotHash,
595+
fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation: clusterResourceOverrideSnapshotHash,
596+
},
547597
OwnerReferences: []metav1.OwnerReference{
548598
{
549599
APIVersion: fleetv1beta1.GroupVersion.String(),
@@ -567,16 +617,19 @@ func (r *Reconciler) getConfigMapEnvelopWorkObj(ctx context.Context, workNamePre
567617
klog.ErrorS(controller.NewUnexpectedBehaviorError(fmt.Errorf("find %d work representing configMap", len(workList.Items))),
568618
"snapshot", klog.KObj(resourceSnapshot), "resourceBinding", klog.KObj(resourceBinding), "configMapWrapper", klog.KObj(envelopeObj))
569619
}
570-
// we just pick the first one if there are more than one.
571620
work := workList.Items[0]
572621
work.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel] = resourceSnapshot.Labels[fleetv1beta1.ResourceIndexLabel]
622+
work.Annotations[fleetv1beta1.ParentResourceSnapshotNameAnnotation] = resourceBinding.Spec.ResourceSnapshotName
623+
work.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation] = resourceOverrideSnapshotHash
624+
work.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation] = clusterResourceOverrideSnapshotHash
573625
work.Spec.Workload.Manifests = manifest
574626
work.Spec.ApplyStrategy = resourceBinding.Spec.ApplyStrategy
575627
return &work, nil
576628
}
577629

578630
// generateSnapshotWorkObj generates the work object for the corresponding snapshot
579-
func generateSnapshotWorkObj(workName string, resourceBinding *fleetv1beta1.ClusterResourceBinding, resourceSnapshot *fleetv1beta1.ClusterResourceSnapshot, manifest []fleetv1beta1.Manifest) *fleetv1beta1.Work {
631+
func generateSnapshotWorkObj(workName string, resourceBinding *fleetv1beta1.ClusterResourceBinding, resourceSnapshot *fleetv1beta1.ClusterResourceSnapshot,
632+
manifest []fleetv1beta1.Manifest, resourceOverrideSnapshotHash, clusterResourceOverrideSnapshotHash string) *fleetv1beta1.Work {
580633
return &fleetv1beta1.Work{
581634
ObjectMeta: metav1.ObjectMeta{
582635
Name: workName,
@@ -586,6 +639,11 @@ func generateSnapshotWorkObj(workName string, resourceBinding *fleetv1beta1.Clus
586639
fleetv1beta1.CRPTrackingLabel: resourceBinding.Labels[fleetv1beta1.CRPTrackingLabel],
587640
fleetv1beta1.ParentResourceSnapshotIndexLabel: resourceSnapshot.Labels[fleetv1beta1.ResourceIndexLabel],
588641
},
642+
Annotations: map[string]string{
643+
fleetv1beta1.ParentResourceSnapshotNameAnnotation: resourceBinding.Spec.ResourceSnapshotName,
644+
fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation: resourceOverrideSnapshotHash,
645+
fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation: clusterResourceOverrideSnapshotHash,
646+
},
589647
OwnerReferences: []metav1.OwnerReference{
590648
{
591649
APIVersion: fleetv1beta1.GroupVersion.String(),
@@ -619,6 +677,7 @@ func (r *Reconciler) upsertWork(ctx context.Context, newWork, existingWork *flee
619677
"resourceSnapshot", resourceSnapshotObj, "work", workObj)
620678
return true, nil
621679
}
680+
// TODO: remove the compare after we did the check on all work in the sync all
622681
// check if we need to update the existing work object
623682
workResourceIndex, err := labels.ExtractResourceSnapshotIndexFromWork(existingWork)
624683
if err != nil {
@@ -628,12 +687,20 @@ func (r *Reconciler) upsertWork(ctx context.Context, newWork, existingWork *flee
628687
// we already checked the label in fetchAllResourceSnapShots function so no need to check again
629688
resourceIndex, _ := labels.ExtractResourceIndexFromClusterResourceSnapshot(resourceSnapshot)
630689
if workResourceIndex == resourceIndex {
631-
// no need to do anything if the work is generated from the same resource snapshot group since the resource snapshot is immutable.
632-
klog.V(2).InfoS("Work is already associated with the desired resourceSnapshot", "resourceIndex", resourceIndex, "work", workObj, "resourceSnapshot", resourceSnapshotObj)
633-
return false, nil
690+
// no need to do anything if the work is generated from the same snapshots
691+
if existingWork.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation] == newWork.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation] &&
692+
existingWork.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation] == newWork.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation] {
693+
klog.V(2).InfoS("Work is not associated with the desired override snapshots", "existingROHash", existingWork.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation],
694+
"existingCROHash", existingWork.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation], "work", workObj)
695+
return false, nil
696+
}
697+
klog.V(2).InfoS("Work is already associated with the desired resourceSnapshot but still not having the right override snapshots", "resourceIndex", resourceIndex, "work", workObj, "resourceSnapshot", resourceSnapshotObj)
634698
}
635-
// need to update the existing work, only two possible changes:
636-
existingWork.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel] = resourceSnapshot.Labels[fleetv1beta1.ResourceIndexLabel]
699+
// need to copy the new work to the existing work, only 5 possible changes:
700+
existingWork.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel] = newWork.Labels[fleetv1beta1.ParentResourceSnapshotIndexLabel]
701+
existingWork.Annotations[fleetv1beta1.ParentResourceSnapshotNameAnnotation] = newWork.Annotations[fleetv1beta1.ParentResourceSnapshotNameAnnotation]
702+
existingWork.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation] = newWork.Annotations[fleetv1beta1.ParentResourceOverrideSnapshotHashAnnotation]
703+
existingWork.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation] = newWork.Annotations[fleetv1beta1.ParentClusterResourceOverrideSnapshotHashAnnotation]
637704
existingWork.Spec.Workload.Manifests = newWork.Spec.Workload.Manifests
638705
if err := r.Client.Update(ctx, existingWork); err != nil {
639706
klog.ErrorS(err, "Failed to update the work associated with the resourceSnapshot", "resourceSnapshot", resourceSnapshotObj, "work", workObj)

0 commit comments

Comments
 (0)