Commit 6335950

cluster/topology: use cached MD list in get current state
1 parent 4920e6e

3 files changed (+117 −8 lines)

internal/controllers/topology/cluster/current_state.go

Lines changed: 4 additions & 1 deletion
@@ -161,8 +161,11 @@ func (r *Reconciler) getCurrentMachineDeploymentState(ctx context.Context, bluep
 	state := make(scope.MachineDeploymentsStateMap)
 
 	// List all the machine deployments in the current cluster and in a managed topology.
+	// Note: This is a cached list call. We ensure in reconcile_state that the cache is up-to-date
+	// after we create/update a MachineDeployment and we double-check if an MD already exists before
+	// we create it.
 	md := &clusterv1.MachineDeploymentList{}
-	err := r.APIReader.List(ctx, md,
+	err := r.Client.List(ctx, md,
 		client.MatchingLabels{
 			clusterv1.ClusterNameLabel:          cluster.Name,
 			clusterv1.ClusterTopologyOwnedLabel: "",
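
For context on the reader swap above: controller-runtime gives a reconciler two ways to read objects, a cached client backed by the manager's informer cache (fast, but possibly stale) and an uncached reader that goes straight to the API server (always fresh, but more load on the apiserver). The sketch below is illustrative only and not part of this commit; the Reconciler field layout and the newReconciler helper are assumptions, while GetClient and GetAPIReader are the standard manager accessors.

// Illustrative sketch: how a reconciler typically obtains a cached reader and a live reader.
package example

import (
	"context"

	appsv1 "k8s.io/api/apps/v1"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// Reconciler mirrors the two-reader layout used above (field names assumed for illustration).
type Reconciler struct {
	Client    client.Client // reads served from the informer cache; cheap, but possibly stale
	APIReader client.Reader // reads served directly by the API server; always fresh, more expensive
}

// newReconciler wires both readers from a controller-runtime manager.
func newReconciler() (*Reconciler, error) {
	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
	if err != nil {
		return nil, err
	}
	return &Reconciler{
		Client:    mgr.GetClient(),    // cached, like r.Client.List in the hunk above
		APIReader: mgr.GetAPIReader(), // uncached, like the previous r.APIReader.List
	}, nil
}

// listCached shows a cached list call; swapping Client for APIReader turns it into a live call.
func (r *Reconciler) listCached(ctx context.Context, namespace string) (*appsv1.DeploymentList, error) {
	list := &appsv1.DeploymentList{}
	if err := r.Client.List(ctx, list, client.InNamespace(namespace)); err != nil {
		return nil, err
	}
	return list, nil
}
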

internal/controllers/topology/cluster/reconcile_state.go

Lines changed: 85 additions & 3 deletions
@@ -20,13 +20,16 @@ import (
 	"context"
 	"fmt"
 	"strings"
+	"time"
 
 	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
+	"k8s.io/apimachinery/pkg/util/sets"
 	"k8s.io/apimachinery/pkg/util/validation/field"
+	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/apiserver/pkg/storage/names"
 	"k8s.io/klog/v2"
 	ctrl "sigs.k8s.io/controller-runtime"
@@ -446,11 +449,29 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S
 	diff := calculateMachineDeploymentDiff(s.Current.MachineDeployments, s.Desired.MachineDeployments)
 
 	// Create MachineDeployments.
-	for _, mdTopologyName := range diff.toCreate {
-		md := s.Desired.MachineDeployments[mdTopologyName]
-		if err := r.createMachineDeployment(ctx, s, md); err != nil {
+	if len(diff.toCreate) > 0 {
+		// In current state we only got the MD list via a cached call.
+		// As a consequence, in order to prevent the creation of duplicate MD due to stale reads,
+		// we are now using a live client to double-check here that the MachineDeployment
+		// to be created doesn't exist yet.
+		currentMDTopologyNames, err := r.getCurrentMachineDeployments(ctx, s)
+		if err != nil {
 			return err
 		}
+		for _, mdTopologyName := range diff.toCreate {
+			md := s.Desired.MachineDeployments[mdTopologyName]
+
+			// Skip the MD creation if the MD already exists.
+			if currentMDTopologyNames.Has(mdTopologyName) {
+				log := tlog.LoggerFrom(ctx).WithMachineDeployment(md.Object)
+				log.V(3).Infof(fmt.Sprintf("Skipping creation of MachineDeployment %s because MachineDeployment for topology %s already exists (only considered creation because of stale cache)", tlog.KObj{Obj: md.Object}, mdTopologyName))
+				continue
+			}
+
+			if err := r.createMachineDeployment(ctx, s, md); err != nil {
+				return err
+			}
+		}
 	}
 
 	// Update MachineDeployments.
@@ -472,6 +493,32 @@ func (r *Reconciler) reconcileMachineDeployments(ctx context.Context, s *scope.S
 	return nil
 }
 
+// getCurrentMachineDeployments gets the current list of MachineDeployments via the APIReader.
+func (r *Reconciler) getCurrentMachineDeployments(ctx context.Context, s *scope.Scope) (sets.Set[string], error) {
+	// TODO: We should consider using PartialObjectMetadataList here. Currently this doesn't work as our
+	// implementation for topology dryrun doesn't support PartialObjectMetadataList.
+	mdList := &clusterv1.MachineDeploymentList{}
+	err := r.APIReader.List(ctx, mdList,
+		client.MatchingLabels{
+			clusterv1.ClusterNameLabel:          s.Current.Cluster.Name,
+			clusterv1.ClusterTopologyOwnedLabel: "",
+		},
+		client.InNamespace(s.Current.Cluster.Namespace),
+	)
+	if err != nil {
+		return nil, errors.Wrap(err, "failed to read MachineDeployments for managed topology")
+	}
+
+	currentMDs := sets.Set[string]{}
+	for _, md := range mdList.Items {
+		mdTopologyName, ok := md.ObjectMeta.Labels[clusterv1.ClusterTopologyMachineDeploymentNameLabel]
+		if ok || mdTopologyName != "" {
+			currentMDs.Insert(mdTopologyName)
+		}
+	}
+	return currentMDs, nil
+}
+
 // createMachineDeployment creates a MachineDeployment and the corresponding Templates.
 func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope, md *scope.MachineDeploymentState) error {
 	// Do not create the MachineDeployment if it is marked as pending create.
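
getCurrentMachineDeployments above collects the topology names into a sets.Set[string] from k8s.io/apimachinery, and the create loop checks membership with Has. A minimal sketch of just those set operations, with made-up values:

// Minimal sketch of the sets.Set[string] operations used above (values are made up).
package example

import (
	"fmt"

	"k8s.io/apimachinery/pkg/util/sets"
)

func setExample() {
	currentMDs := sets.Set[string]{}   // empty set, same construction as in getCurrentMachineDeployments
	currentMDs.Insert("md-topology-a") // record a topology name
	currentMDs.Insert("md-topology-b")

	// Has is the membership test used in the create loop to skip existing MachineDeployments.
	if currentMDs.Has("md-topology-a") {
		fmt.Println("md-topology-a already exists, skip creation")
	}
}
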
@@ -517,6 +564,23 @@ func (r *Reconciler) createMachineDeployment(ctx context.Context, s *scope.Scope
 	}
 	r.recorder.Eventf(cluster, corev1.EventTypeNormal, createEventReason, "Created %q", tlog.KObj{Obj: md.Object})
 
+	// Wait until MachineDeployment is visible in the cache.
+	// Note: We have to do this because otherwise using a cached client in current state could
+	// miss a newly created MachineDeployment (because the cache might be stale).
+	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
+		key := client.ObjectKey{Namespace: md.Object.Namespace, Name: md.Object.Name}
+		if err := r.Client.Get(ctx, key, &clusterv1.MachineDeployment{}); err != nil {
+			if apierrors.IsNotFound(err) {
+				return false, nil
+			}
+			return false, err
+		}
+		return true, nil
+	})
+	if err != nil {
+		return errors.Wrapf(err, "failed to create %s: failed waiting for MachineDeployment to be visible in cache", md.Object.Kind)
+	}
+
 	// If the MachineDeployment has defined a MachineHealthCheck reconcile it.
 	if md.MachineHealthCheck != nil {
 		if err := r.reconcileMachineHealthCheck(ctx, nil, md.MachineHealthCheck); err != nil {
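
The wait added above polls the cached client until the newly created MachineDeployment becomes visible, so the next reconcile's cached list cannot miss it. Below is a standalone sketch of that polling pattern under the same parameters (5 ms interval, 5 s timeout, immediate first check); the helper name and the use of a plain Deployment instead of the cluster-api types are assumptions for illustration.

// Standalone sketch of the cache-wait pattern: poll every 5ms, time out after 5s,
// check immediately, and treat NotFound as "keep waiting" rather than a terminal error.
package example

import (
	"context"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/wait"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// waitForObjectInCache blocks until key is readable through the cached client or the timeout expires.
func waitForObjectInCache(ctx context.Context, cached client.Client, key client.ObjectKey) error {
	return wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
		if err := cached.Get(ctx, key, &appsv1.Deployment{}); err != nil {
			if apierrors.IsNotFound(err) {
				return false, nil // not in the cache yet, poll again
			}
			return false, err // any other error aborts the wait
		}
		return true, nil // object is visible in the cache
	})
}
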
@@ -588,6 +652,24 @@ func (r *Reconciler) updateMachineDeployment(ctx context.Context, s *scope.Scope
 	}
 	r.recorder.Eventf(cluster, corev1.EventTypeNormal, updateEventReason, "Updated %q%s", tlog.KObj{Obj: currentMD.Object}, logMachineDeploymentVersionChange(currentMD.Object, desiredMD.Object))
 
+	// Wait until MachineDeployment is updated in the cache.
+	// Note: We have to do this because otherwise using a cached client in current state could
+	// return a stale state of a MachineDeployment we just patched (because the cache might be stale).
+	// Note: It is good enough to check that the resource version changed. Other controllers might have updated the
+	// MachineDeployment as well, but the combination of the patch call above without a conflict and a changed resource
+	// version here guarantees that we see the changes of our own update.
+	err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, 5*time.Second, true, func(ctx context.Context) (bool, error) {
+		key := client.ObjectKey{Namespace: currentMD.Object.GetNamespace(), Name: currentMD.Object.GetName()}
+		cachedMD := &clusterv1.MachineDeployment{}
+		if err := r.Client.Get(ctx, key, cachedMD); err != nil {
+			return false, err
+		}
+		return currentMD.Object.GetResourceVersion() != cachedMD.GetResourceVersion(), nil
+	})
+	if err != nil {
+		return errors.Wrapf(err, "failed to patch %s: failed waiting for MachineDeployment to be updated in cache", tlog.KObj{Obj: currentMD.Object})
+	}
+
 	// We want to call both cleanup functions even if one of them fails to clean up as much as possible.
 	return nil
 }

internal/controllers/topology/cluster/reconcile_state_test.go

Lines changed: 28 additions & 4 deletions
@@ -1778,6 +1778,7 @@ func TestReconcileMachineDeployments(t *testing.T) {
 	tests := []struct {
 		name                 string
 		current              []*scope.MachineDeploymentState
+		currentOnlyAPIServer []*scope.MachineDeploymentState
 		desired              []*scope.MachineDeploymentState
 		upgradeTracker       *scope.UpgradeTracker
 		want                 []*scope.MachineDeploymentState
@@ -1792,6 +1793,14 @@ func TestReconcileMachineDeployments(t *testing.T) {
 			want:    []*scope.MachineDeploymentState{md1},
 			wantErr: false,
 		},
+		{
+			name:                 "Should skip creating desired MachineDeployment if it already exists in the apiserver (even if it is not in current state)",
+			current:              nil,
+			currentOnlyAPIServer: []*scope.MachineDeploymentState{md1},
+			desired:              []*scope.MachineDeploymentState{md1},
+			want:                 []*scope.MachineDeploymentState{md1},
+			wantErr:              false,
+		},
 		{
 			name:    "Should not create desired MachineDeployment if the current does not exists yet and it marked as pending create",
 			current: nil,
@@ -1916,17 +1925,28 @@ func TestReconcileMachineDeployments(t *testing.T) {
 			}
 
 			currentMachineDeploymentStates := toMachineDeploymentTopologyStateMap(tt.current)
-			s := scope.New(builder.Cluster(metav1.NamespaceDefault, "cluster-1").Build())
+			s := scope.New(builder.Cluster(namespace.GetName(), "cluster-1").Build())
 			s.Current.MachineDeployments = currentMachineDeploymentStates
 
+			// currentOnlyAPIServer MDs only exist in the APIserver but are not part of s.Current.
+			// This simulates that getCurrentMachineDeploymentState in current_state.go read a stale MD list.
+			for _, s := range tt.currentOnlyAPIServer {
+				mdState := prepareMachineDeploymentState(s, namespace.GetName())
+
+				g.Expect(env.PatchAndWait(ctx, mdState.InfrastructureMachineTemplate, client.ForceOwnership, client.FieldOwner(structuredmerge.TopologyManagerName))).To(Succeed())
+				g.Expect(env.PatchAndWait(ctx, mdState.BootstrapTemplate, client.ForceOwnership, client.FieldOwner(structuredmerge.TopologyManagerName))).To(Succeed())
+				g.Expect(env.PatchAndWait(ctx, mdState.Object, client.ForceOwnership, client.FieldOwner(structuredmerge.TopologyManagerName))).To(Succeed())
+			}
+
 			s.Desired = &scope.ClusterState{MachineDeployments: toMachineDeploymentTopologyStateMap(tt.desired)}
 
 			if tt.upgradeTracker != nil {
 				s.UpgradeTracker = tt.upgradeTracker
 			}
 
 			r := Reconciler{
-				Client:             env,
+				Client:             env.GetClient(),
+				APIReader:          env.GetAPIReader(),
 				patchHelperFactory: serverSideApplyPatchHelperFactory(env, ssa.NewCache()),
 				recorder:           env.GetEventRecorderFor("test"),
 			}
@@ -2676,7 +2696,8 @@ func TestReconcileMachineDeploymentMachineHealthCheck(t *testing.T) {
 			s.Desired = &scope.ClusterState{MachineDeployments: toMachineDeploymentTopologyStateMap(tt.desired)}
 
 			r := Reconciler{
-				Client:             env,
+				Client:             env.GetClient(),
+				APIReader:          env.GetAPIReader(),
 				patchHelperFactory: serverSideApplyPatchHelperFactory(env, ssa.NewCache()),
 				recorder:           env.GetEventRecorderFor("test"),
 			}
@@ -2712,7 +2733,10 @@ func newFakeMachineDeploymentTopologyState(name string, infrastructureMachineTem
 		Object: builder.MachineDeployment(metav1.NamespaceDefault, name).
 			WithInfrastructureTemplate(infrastructureMachineTemplate).
 			WithBootstrapTemplate(bootstrapTemplate).
-			WithLabels(map[string]string{clusterv1.ClusterTopologyMachineDeploymentNameLabel: name + "-topology"}).
+			WithLabels(map[string]string{
+				clusterv1.ClusterTopologyMachineDeploymentNameLabel: name + "-topology",
+				clusterv1.ClusterTopologyOwnedLabel:                 "",
+			}).
 			WithClusterName("cluster-1").
 			WithReplicas(1).
 			WithDefaulter(true).
