Commit 5e59e96

fix: rollout and wait for Cilium DaemonSet
1 parent 78cd1e7 commit 5e59e96

2 files changed: 188 additions, 35 deletions

pkg/handlers/generic/lifecycle/cni/cilium/handler.go

Lines changed: 121 additions & 33 deletions
@@ -12,6 +12,7 @@ import (
     "github.com/spf13/pflag"
     appsv1 "k8s.io/api/apps/v1"
     corev1 "k8s.io/api/core/v1"
+    apierrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
     "sigs.k8s.io/cluster-api/controllers/remote"
@@ -228,6 +229,7 @@ func (c *CiliumCNI) apply(
             c.client,
             helmChart,
         ).
+            WithValueTemplater(templateValues).
             WithDefaultWaiter()
     case "":
         resp.SetStatus(runtimehooksv1.ResponseStatusFailure)
@@ -259,65 +261,144 @@ func runApply(
         return err
     }
 
+    // It is possible to disable kube-proxy and migrate to Cilium's kube-proxy replacement feature in a running cluster.
+    // In this case, we need to wait for Cilium to be restated with new configuration and then cleanup kube-proxy.
     // If skip kube-proxy is not set, return early.
-    // Otherwise, wait for Cilium to be rolled out and then cleanup kube-proxy if installed.
     if !capiutils.ShouldSkipKubeProxy(cluster) {
         return nil
     }
 
+    remoteClient, err := remote.NewClusterClient(
+        ctx,
+        "",
+        client,
+        ctrlclient.ObjectKeyFromObject(cluster),
+    )
+    if err != nil {
+        return fmt.Errorf("error creating remote cluster client: %w", err)
+    }
+
+    // If kube-proxy is not installed, assume that the one-time migration of kube-proxy to Cilium is complete and return early.
+    isKubeProxyInstalled, err := isKubeProxyInstalled(ctx, remoteClient)
+    if err != nil {
+        return fmt.Errorf("failed to check if kube-proxy is installed: %w", err)
+    }
+    if !isKubeProxyInstalled {
+        return nil
+    }
+
+    log.Info(
+        fmt.Sprintf(
+            "Waiting for Cilium ConfigMap to be updated with new configuration for cluster %s",
+            ctrlclient.ObjectKeyFromObject(cluster),
+        ),
+    )
+    if err := waitForCiliumConfigMapToBeUpdatedWithKubeProxyReplacement(ctx, remoteClient); err != nil {
+        return fmt.Errorf("failed to wait for Cilium ConfigMap to be updated: %w", err)
+    }
+
     log.Info(
-        fmt.Sprintf("Waiting for Cilium to be ready for cluster %s", ctrlclient.ObjectKeyFromObject(cluster)),
+        fmt.Sprintf(
+            "Trigger a rollout of Cilium DaemonSet Pods for cluster %s",
+            ctrlclient.ObjectKeyFromObject(cluster),
+        ),
     )
-    if err := waitForCiliumToBeReady(ctx, client, cluster); err != nil {
-        return fmt.Errorf("failed to wait for Cilium to be ready: %w", err)
+    if err := forceCiliumRollout(ctx, remoteClient); err != nil {
+        return fmt.Errorf("failed to force trigger a rollout of Cilium DaemonSet Pods: %w", err)
     }
 
     log.Info(
        fmt.Sprintf("Cleaning up kube-proxy for cluster %s", ctrlclient.ObjectKeyFromObject(cluster)),
     )
-    if err := cleanupKubeProxy(ctx, client, cluster); err != nil {
+    if err := cleanupKubeProxy(ctx, remoteClient); err != nil {
         return fmt.Errorf("failed to cleanup kube-proxy: %w", err)
     }
 
     return nil
 }
 
 const (
+    kubeProxyReplacementConfigKey = "kube-proxy-replacement"
+    ciliumConfigMapName           = "cilium-config"
+
+    restartedAtAnnotation = "caren.nutanix.com/restartedAt"
+
     kubeProxyName      = "kube-proxy"
     kubeProxyNamespace = "kube-system"
 )
 
-func waitForCiliumToBeReady(
-    ctx context.Context,
-    c ctrlclient.Client,
-    cluster *clusterv1.Cluster,
-) error {
-    remoteClient, err := remote.NewClusterClient(
+// Use vars to override in integration tests.
+var (
+    waitInterval = 1 * time.Second
+    waitTimeout  = 30 * time.Second
+)
+
+func waitForCiliumConfigMapToBeUpdatedWithKubeProxyReplacement(ctx context.Context, c ctrlclient.Client) error {
+    cm := &corev1.ConfigMap{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      ciliumConfigMapName,
+            Namespace: defaultCiliumNamespace,
+        },
+    }
+    if err := wait.ForObject(
         ctx,
-        "",
-        c,
-        ctrlclient.ObjectKeyFromObject(cluster),
-    )
-    if err != nil {
-        return fmt.Errorf("error creating remote cluster client: %w", err)
+        wait.ForObjectInput[*corev1.ConfigMap]{
+            Reader: c,
+            Target: cm.DeepCopy(),
+            Check: func(_ context.Context, obj *corev1.ConfigMap) (bool, error) {
+                return obj.Data[kubeProxyReplacementConfigKey] == "true", nil
+            },
+            Interval: waitInterval,
+            Timeout:  waitTimeout,
+        },
+    ); err != nil {
+        return fmt.Errorf("failed to wait for ConfigMap %s to be updated: %w", ctrlclient.ObjectKeyFromObject(cm), err)
     }
+    return nil
+}
 
+func forceCiliumRollout(ctx context.Context, c ctrlclient.Client) error {
     ds := &appsv1.DaemonSet{
         ObjectMeta: metav1.ObjectMeta{
             Name:      defaultCiliumReleaseName,
             Namespace: defaultCiliumNamespace,
         },
     }
+    if err := c.Get(ctx, ctrlclient.ObjectKeyFromObject(ds), ds); err != nil {
+        return fmt.Errorf("failed to get cilium daemon set: %w", err)
+    }
+
+    // Update the DaemonSet to force a rollout.
+    annotations := ds.Spec.Template.Annotations
+    if annotations == nil {
+        annotations = make(map[string]string)
+    }
+    if _, ok := annotations[restartedAtAnnotation]; !ok {
+        // Only set the annotation once to avoid a race conditition where rollouts are triggered repeatedly.
+        annotations[restartedAtAnnotation] = time.Now().UTC().Format(time.RFC3339Nano)
+    }
+    ds.Spec.Template.Annotations = annotations
+    if err := c.Update(ctx, ds); err != nil {
+        return fmt.Errorf("failed to update cilium daemon set: %w", err)
+    }
+
     if err := wait.ForObject(
         ctx,
         wait.ForObjectInput[*appsv1.DaemonSet]{
-            Reader: remoteClient,
+            Reader: c,
             Target: ds.DeepCopy(),
             Check: func(_ context.Context, obj *appsv1.DaemonSet) (bool, error) {
-                return obj.Status.NumberAvailable == obj.Status.DesiredNumberScheduled && obj.Status.NumberUnavailable == 0, nil
+                if obj.Generation != obj.Status.ObservedGeneration {
+                    return false, nil
+                }
+                isUpdated := obj.Status.NumberAvailable == obj.Status.DesiredNumberScheduled &&
+                    // We're forcing a rollout so we expect the UpdatedNumberScheduled to be always set.
+                    obj.Status.UpdatedNumberScheduled == obj.Status.DesiredNumberScheduled
+                return isUpdated, nil
             },
-            Interval: 1 * time.Second,
-            Timeout:  30 * time.Second,
+            Interval: waitInterval,
+            Timeout:  waitTimeout,
         },
     ); err != nil {
         return fmt.Errorf(
@@ -331,17 +412,7 @@ func waitForCiliumToBeReady(
 }
 
 // cleanupKubeProxy cleans up kube-proxy DaemonSet and ConfigMap on the remote cluster when kube-proxy is disabled.
-func cleanupKubeProxy(ctx context.Context, c ctrlclient.Client, cluster *clusterv1.Cluster) error {
-    remoteClient, err := remote.NewClusterClient(
-        ctx,
-        "",
-        c,
-        ctrlclient.ObjectKeyFromObject(cluster),
-    )
-    if err != nil {
-        return fmt.Errorf("error creating remote cluster client: %w", err)
-    }
-
+func cleanupKubeProxy(ctx context.Context, c ctrlclient.Client) error {
     objs := []ctrlclient.Object{
         &appsv1.DaemonSet{
             ObjectMeta: metav1.ObjectMeta{
@@ -357,10 +428,27 @@ func cleanupKubeProxy(ctx context.Context, c ctrlclient.Client, cluster *cluster
         },
     }
     for _, obj := range objs {
-        if err := ctrlclient.IgnoreNotFound(remoteClient.Delete(ctx, obj)); err != nil {
+        if err := ctrlclient.IgnoreNotFound(c.Delete(ctx, obj)); err != nil {
             return fmt.Errorf("failed to delete %s/%s: %w", obj.GetNamespace(), obj.GetName(), err)
         }
     }
 
     return nil
 }
+
+func isKubeProxyInstalled(ctx context.Context, c ctrlclient.Client) (bool, error) {
+    ds := &appsv1.DaemonSet{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      kubeProxyName,
+            Namespace: kubeProxyNamespace,
+        },
+    }
+    err := c.Get(ctx, ctrlclient.ObjectKeyFromObject(ds), ds)
+    if err != nil {
+        if apierrors.IsNotFound(err) {
+            return false, nil
+        }
+        return false, err
+    }
+    return true, nil
+}
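The new flow in runApply is: return early unless the cluster opts out of kube-proxy, return early if kube-proxy is already gone, wait for the cilium-config ConfigMap to report kube-proxy replacement, force a rollout of the Cilium DaemonSet by stamping its pod template, and only then delete kube-proxy. The handler does this with its internal wait.ForObject helper; the sketch below shows the same sequence using only controller-runtime and k8s.io/apimachinery/pkg/util/wait. It is illustrative only: the package name, the example.com/restartedAt annotation key, and the hard-coded cilium/kube-system names are placeholders, not the handler's constants.

// Hypothetical standalone sketch of the migration flow, not the handler's code.
package ciliummigration

import (
    "context"
    "fmt"
    "time"

    appsv1 "k8s.io/api/apps/v1"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/util/wait"
    ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
)

// migrateKubeProxyToCilium assumes Cilium runs as the "cilium" DaemonSet in
// "kube-system"; deleting kube-proxy happens elsewhere once it returns nil.
func migrateKubeProxyToCilium(ctx context.Context, c ctrlclient.Client) error {
    ciliumKey := ctrlclient.ObjectKey{Name: "cilium", Namespace: "kube-system"}

    // 1. Wait until cilium-config reports that kube-proxy replacement is enabled.
    err := wait.PollUntilContextTimeout(ctx, time.Second, 30*time.Second, true,
        func(ctx context.Context) (bool, error) {
            cm := &corev1.ConfigMap{}
            key := ctrlclient.ObjectKey{Name: "cilium-config", Namespace: "kube-system"}
            if err := c.Get(ctx, key, cm); err != nil {
                return false, nil // treat errors as "not yet" and keep polling
            }
            return cm.Data["kube-proxy-replacement"] == "true", nil
        })
    if err != nil {
        return fmt.Errorf("cilium-config never enabled kube-proxy replacement: %w", err)
    }

    // 2. Stamp the pod template so the DaemonSet controller replaces every Pod;
    //    this is the same trick `kubectl rollout restart` uses.
    ds := &appsv1.DaemonSet{}
    if err := c.Get(ctx, ciliumKey, ds); err != nil {
        return fmt.Errorf("failed to get cilium DaemonSet: %w", err)
    }
    if ds.Spec.Template.Annotations == nil {
        ds.Spec.Template.Annotations = map[string]string{}
    }
    // "example.com/restartedAt" is a placeholder annotation key.
    ds.Spec.Template.Annotations["example.com/restartedAt"] = time.Now().UTC().Format(time.RFC3339Nano)
    if err := c.Update(ctx, ds); err != nil {
        return fmt.Errorf("failed to trigger cilium rollout: %w", err)
    }

    // 3. Wait for the rollout to finish before kube-proxy is removed.
    return wait.PollUntilContextTimeout(ctx, time.Second, 30*time.Second, true,
        func(ctx context.Context) (bool, error) {
            if err := c.Get(ctx, ciliumKey, ds); err != nil {
                return false, nil
            }
            done := ds.Generation == ds.Status.ObservedGeneration &&
                ds.Status.UpdatedNumberScheduled == ds.Status.DesiredNumberScheduled &&
                ds.Status.NumberAvailable == ds.Status.DesiredNumberScheduled
            return done, nil
        })
}

Note one deliberate difference: the committed handler sets its restartedAt annotation only once, so repeated reconciles do not keep triggering new rollouts, whereas this sketch stamps a fresh timestamp on every call.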

pkg/handlers/generic/lifecycle/cni/cilium/handler_integration_test.go

Lines changed: 67 additions & 2 deletions
@@ -5,6 +5,7 @@ package cilium
 
 import (
     "fmt"
+    "time"
 
     "github.com/go-logr/logr"
     . "github.com/onsi/ginkgo/v2"
@@ -51,7 +52,7 @@ var _ = Describe("Test runApply", func() {
         Expect(err).To(BeNil())
         Expect(configMap).ToNot(BeNil())
 
-        By("Should not delete when the addon is not applied")
+        By("Should not delete kube-proxy when the addon is not applied")
         err = runApply(
             ctx,
             c,
@@ -70,14 +71,61 @@ var _ = Describe("Test runApply", func() {
         Expect(err).To(BeNil())
         Expect(configMap).ToNot(BeNil())
 
-        By("Should delete kube-proxy when skip kube-proxy is set")
+        By("Should not delete kube-proxy when Cilium DaemonSet is not updated")
         cluster.Spec.Topology.ControlPlane.Metadata.Annotations = map[string]string{
             controlplanev1.SkipKubeProxyAnnotation: "",
         }
+        // Speed up the test.
+        waitTimeout = 1 * time.Second
+        err = runApply(ctx, c, cluster, strategy, cluster.Namespace, logr.Discard())
+        Expect(err).ToNot(BeNil())
+
+        // Verify that the kube-proxy DaemonSet and ConfigMap are not deleted when Cilium DaemonSet is not updated
+        err = remoteClient.Get(ctx, ctrlclient.ObjectKey{Name: kubeProxyName, Namespace: kubeProxyNamespace}, daemonSet)
+        Expect(err).To(BeNil())
+        Expect(daemonSet).ToNot(BeNil())
+        err = remoteClient.Get(ctx, ctrlclient.ObjectKey{Name: kubeProxyName, Namespace: kubeProxyNamespace}, configMap)
+        Expect(err).To(BeNil())
+        Expect(configMap).ToNot(BeNil())
+
+        By("Should delete kube-proxy when skip kube-proxy is set")
+        // Update the status of the Cilium DaemonSet to simulate a roll out.
+        ciliumDaemonSet := &appsv1.DaemonSet{
+            ObjectMeta: metav1.ObjectMeta{
+                Name:      defaultCiliumReleaseName,
+                Namespace: defaultCiliumNamespace,
+            },
+        }
+        err = remoteClient.Get(ctx, ctrlclient.ObjectKey{Name: defaultCiliumReleaseName, Namespace: defaultCiliumNamespace}, ciliumDaemonSet)
+        Expect(err).To(BeNil())
+        ciliumDaemonSet.Status = appsv1.DaemonSetStatus{
+            ObservedGeneration:     2,
+            NumberAvailable:        2,
+            DesiredNumberScheduled: 2,
+            UpdatedNumberScheduled: 2,
+            NumberUnavailable:      0,
+        }
+        Expect(remoteClient.Status().Update(ctx, ciliumDaemonSet)).To(Succeed())
 
         err = runApply(ctx, c, cluster, strategy, cluster.Namespace, logr.Discard())
         Expect(err).To(BeNil())
 
+        // Verify that the Cilium DaemonSet was updated.
+        ciliumDaemonSet = &appsv1.DaemonSet{
+            ObjectMeta: metav1.ObjectMeta{
+                Name:      defaultCiliumReleaseName,
+                Namespace: defaultCiliumNamespace,
+            },
+        }
+        err = remoteClient.Get(
+            ctx,
+            ctrlclient.ObjectKeyFromObject(ciliumDaemonSet),
+            ciliumDaemonSet,
+        )
+        Expect(err).To(BeNil())
+        Expect(ciliumDaemonSet).ToNot(BeNil())
+        Expect(ciliumDaemonSet.Spec.Template.Annotations).To(HaveKey(restartedAtAnnotation))
+
         // Verify that the kube-proxy DaemonSet and ConfigMap are deleted.
         err = remoteClient.Get(ctx, ctrlclient.ObjectKey{Name: kubeProxyName, Namespace: kubeProxyNamespace}, daemonSet)
         Expect(err).ToNot(BeNil())
@@ -158,13 +206,15 @@ func setupTestCluster(
     }
     Expect(remoteClient.Create(ctx, configMap)).To(Succeed())
 
+    // Cilium DaemonSet, Pods and ConfigMap
     ciliumDaemonSet := &appsv1.DaemonSet{
         ObjectMeta: metav1.ObjectMeta{
             Name:      defaultCiliumReleaseName,
             Namespace: defaultCiliumNamespace,
             Labels: map[string]string{
                 "app": defaultCiliumReleaseName,
             },
+            Generation: 1,
         },
         Spec: appsv1.DaemonSetSpec{
             Selector: &metav1.LabelSelector{
@@ -192,6 +242,21 @@ func setupTestCluster(
         },
     }
     Expect(remoteClient.Create(ctx, ciliumDaemonSet)).To(Succeed())
+    ciliumDaemonSet.Status = appsv1.DaemonSetStatus{
+        ObservedGeneration: 1,
+    }
+    Expect(remoteClient.Status().Update(ctx, ciliumDaemonSet)).To(Succeed())
+
+    configMap = &corev1.ConfigMap{
+        ObjectMeta: metav1.ObjectMeta{
+            Name:      ciliumConfigMapName,
+            Namespace: defaultCiliumNamespace,
+        },
+        Data: map[string]string{
+            kubeProxyReplacementConfigKey: "true",
+        },
+    }
+    Expect(remoteClient.Create(ctx, configMap)).To(Succeed())
 
     return cluster, remoteClient
 }
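The test has to write the Cilium DaemonSet's status by hand because nothing in the test environment runs the DaemonSet controller, and .status is a subresource that only Status().Update() can set. A minimal sketch of the same mechanics using controller-runtime's fake client follows; the package, test name, and fixture values are placeholders rather than the fixtures used in this file, which runs against envtest.

// Hypothetical illustration of simulating a DaemonSet rollout in a test.
package ciliummigration_test

import (
    "context"
    "testing"

    appsv1 "k8s.io/api/apps/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    ctrlclient "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/client/fake"
)

func TestSimulateDaemonSetRollout(t *testing.T) {
    scheme := runtime.NewScheme()
    if err := clientgoscheme.AddToScheme(scheme); err != nil {
        t.Fatal(err)
    }

    ds := &appsv1.DaemonSet{
        ObjectMeta: metav1.ObjectMeta{Name: "cilium", Namespace: "kube-system", Generation: 1},
    }

    // WithStatusSubresource makes .status writable only via Status().Update(),
    // matching how a real API server treats the subresource.
    c := fake.NewClientBuilder().
        WithScheme(scheme).
        WithObjects(ds).
        WithStatusSubresource(&appsv1.DaemonSet{}).
        Build()

    ctx := context.Background()

    // Simulate the DaemonSet controller observing the spec and rolling all Pods.
    if err := c.Get(ctx, ctrlclient.ObjectKeyFromObject(ds), ds); err != nil {
        t.Fatal(err)
    }
    ds.Status = appsv1.DaemonSetStatus{
        ObservedGeneration:     ds.Generation,
        DesiredNumberScheduled: 2,
        UpdatedNumberScheduled: 2,
        NumberAvailable:        2,
    }
    if err := c.Status().Update(ctx, ds); err != nil {
        t.Fatal(err)
    }
}

The committed test also shortens the package-level waitTimeout variable so the negative case (Cilium DaemonSet never becomes ready) fails fast; the sketch above only demonstrates the status-subresource mechanics.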
