Skip to content

Commit d9c2bfe

Browse files
authored
Make wait for autopilot plan resilient to node restarts (#3068)
* Make Wait for autopilot plan resilient to node restarts * add tests
1 parent cdf0571 commit d9c2bfe

File tree

2 files changed

+189
-9
lines changed

2 files changed

+189
-9
lines changed

pkg-new/upgrade/upgrade.go

Lines changed: 39 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package upgrade
22

33
import (
44
"context"
5+
"errors"
56
"fmt"
67
"reflect"
78
"time"
@@ -23,7 +24,8 @@ import (
2324
"github.com/replicatedhq/embedded-cluster/pkg/runtimeconfig"
2425
"github.com/replicatedhq/embedded-cluster/pkg/support"
2526
"github.com/sirupsen/logrus"
26-
"k8s.io/apimachinery/pkg/api/errors"
27+
k8serrors "k8s.io/apimachinery/pkg/api/errors"
28+
"k8s.io/apimachinery/pkg/util/wait"
2729
"sigs.k8s.io/controller-runtime/pkg/client"
2830
)
2931

@@ -347,9 +349,9 @@ func upgradeExtensions(ctx context.Context, cli client.Client, hcli helm.Client,
347349
func createAutopilotPlan(ctx context.Context, cli client.Client, rc runtimeconfig.RuntimeConfig, desiredVersion string, in *ecv1beta1.Installation, meta *ectypes.ReleaseMetadata, logger logrus.FieldLogger) error {
348350
var plan apv1b2.Plan
349351
okey := client.ObjectKey{Name: "autopilot"}
350-
if err := cli.Get(ctx, okey, &plan); err != nil && !errors.IsNotFound(err) {
352+
if err := cli.Get(ctx, okey, &plan); err != nil && !k8serrors.IsNotFound(err) {
351353
return fmt.Errorf("get upgrade plan: %w", err)
352-
} else if errors.IsNotFound(err) {
354+
} else if k8serrors.IsNotFound(err) {
353355
// if the kubernetes version has changed we create an upgrade command
354356
logger.WithField("version", desiredVersion).Info("Starting k0s autopilot upgrade plan")
355357

@@ -364,15 +366,43 @@ func createAutopilotPlan(ctx context.Context, cli client.Client, rc runtimeconfi
364366
}
365367

366368
func waitForAutopilotPlan(ctx context.Context, cli client.Client, logger logrus.FieldLogger) (apv1b2.Plan, error) {
367-
for {
368-
var plan apv1b2.Plan
369-
if err := cli.Get(ctx, client.ObjectKey{Name: "autopilot"}, &plan); err != nil {
370-
return plan, fmt.Errorf("get upgrade plan: %w", err)
369+
backoff := wait.Backoff{
370+
Duration: time.Second,
371+
Factor: 2.0,
372+
Steps: 75, // ~25 minutes with exponential backoff (1s, 2s, 4s, 8s, 16s, then 20s capped)
373+
Cap: 20 * time.Second,
374+
}
375+
376+
var plan apv1b2.Plan
377+
var lastErr error
378+
379+
err := wait.ExponentialBackoffWithContext(ctx, backoff, func(ctx context.Context) (bool, error) {
380+
err := cli.Get(ctx, client.ObjectKey{Name: "autopilot"}, &plan)
381+
if err != nil {
382+
lastErr = fmt.Errorf("get autopilot plan: %w", err)
383+
return false, nil
371384
}
385+
372386
if autopilot.HasThePlanEnded(plan) {
373-
return plan, nil
387+
return true, nil
374388
}
389+
375390
logger.WithField("plan_id", plan.Spec.ID).Info("An autopilot upgrade is in progress")
376-
time.Sleep(5 * time.Second)
391+
return false, nil
392+
})
393+
394+
if err != nil {
395+
if errors.Is(err, context.Canceled) {
396+
if lastErr != nil {
397+
err = errors.Join(err, lastErr)
398+
}
399+
return apv1b2.Plan{}, err
400+
} else if lastErr != nil {
401+
return apv1b2.Plan{}, fmt.Errorf("timed out waiting for autopilot plan: %w", lastErr)
402+
} else {
403+
return apv1b2.Plan{}, fmt.Errorf("timed out waiting for autopilot plan")
404+
}
377405
}
406+
407+
return plan, nil
378408
}

pkg-new/upgrade/upgrade_test.go

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,13 @@ package upgrade
33
import (
44
"context"
55
"encoding/json"
6+
"fmt"
7+
"sync/atomic"
68
"testing"
79

10+
apv1b2 "github.com/k0sproject/k0s/pkg/apis/autopilot/v1beta2"
811
k0sv1beta1 "github.com/k0sproject/k0s/pkg/apis/k0s/v1beta1"
12+
"github.com/k0sproject/k0s/pkg/autopilot/controller/plans/core"
913
ecv1beta1 "github.com/replicatedhq/embedded-cluster/kinds/apis/v1beta1"
1014
"github.com/sirupsen/logrus"
1115
"github.com/stretchr/testify/assert"
@@ -369,3 +373,149 @@ config:
369373
})
370374
}
371375
}
376+
377+
func TestWaitForAutopilotPlan_Success(t *testing.T) {
378+
logger := logrus.New()
379+
logger.SetLevel(logrus.ErrorLevel)
380+
381+
scheme := runtime.NewScheme()
382+
require.NoError(t, apv1b2.Install(scheme))
383+
384+
plan := &apv1b2.Plan{
385+
ObjectMeta: metav1.ObjectMeta{
386+
Name: "autopilot",
387+
},
388+
Status: apv1b2.PlanStatus{
389+
State: core.PlanCompleted,
390+
},
391+
}
392+
393+
cli := fake.NewClientBuilder().
394+
WithScheme(scheme).
395+
WithObjects(plan).
396+
Build()
397+
398+
result, err := waitForAutopilotPlan(t.Context(), cli, logger)
399+
require.NoError(t, err)
400+
assert.Equal(t, "autopilot", result.Name)
401+
}
402+
403+
func TestWaitForAutopilotPlan_RetriesOnTransientErrors(t *testing.T) {
404+
logger := logrus.New()
405+
logger.SetLevel(logrus.ErrorLevel)
406+
407+
scheme := runtime.NewScheme()
408+
require.NoError(t, apv1b2.Install(scheme))
409+
410+
// Plan that starts completed
411+
plan := &apv1b2.Plan{
412+
ObjectMeta: metav1.ObjectMeta{
413+
Name: "autopilot",
414+
},
415+
Status: apv1b2.PlanStatus{
416+
State: core.PlanCompleted,
417+
},
418+
}
419+
420+
// Mock client that fails first 3 times, then succeeds
421+
var callCount atomic.Int32
422+
cli := &mockClientWithRetries{
423+
Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(plan).Build(),
424+
failCount: 3,
425+
currentCount: &callCount,
426+
}
427+
428+
result, err := waitForAutopilotPlan(t.Context(), cli, logger)
429+
require.NoError(t, err)
430+
assert.Equal(t, "autopilot", result.Name)
431+
assert.Equal(t, int32(4), callCount.Load(), "Should have retried 3 times before succeeding")
432+
}
433+
434+
func TestWaitForAutopilotPlan_ContextCanceled(t *testing.T) {
435+
logger := logrus.New()
436+
logger.SetLevel(logrus.ErrorLevel)
437+
438+
scheme := runtime.NewScheme()
439+
require.NoError(t, apv1b2.Install(scheme))
440+
441+
ctx, cancel := context.WithCancel(t.Context())
442+
cancel() // Cancel immediately
443+
444+
cli := fake.NewClientBuilder().WithScheme(scheme).Build()
445+
446+
_, err := waitForAutopilotPlan(ctx, cli, logger)
447+
require.Error(t, err)
448+
assert.Contains(t, err.Error(), "context canceled")
449+
}
450+
451+
func TestWaitForAutopilotPlan_WaitsForCompletion(t *testing.T) {
452+
logger := logrus.New()
453+
logger.SetLevel(logrus.ErrorLevel)
454+
455+
scheme := runtime.NewScheme()
456+
require.NoError(t, apv1b2.Install(scheme))
457+
458+
// Plan that starts in progress, then completes after some time
459+
plan := &apv1b2.Plan{
460+
ObjectMeta: metav1.ObjectMeta{
461+
Name: "autopilot",
462+
},
463+
Spec: apv1b2.PlanSpec{
464+
ID: "test-plan",
465+
},
466+
Status: apv1b2.PlanStatus{
467+
State: core.PlanSchedulable,
468+
},
469+
}
470+
471+
cli := &mockClientWithStateChange{
472+
Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(plan).Build(),
473+
plan: plan,
474+
callsUntil: 3, // Will complete after 3 calls
475+
}
476+
477+
result, err := waitForAutopilotPlan(t.Context(), cli, logger)
478+
require.NoError(t, err)
479+
assert.Equal(t, "autopilot", result.Name)
480+
assert.Equal(t, core.PlanCompleted, result.Status.State)
481+
}
482+
483+
// Mock client that fails N times before succeeding
484+
type mockClientWithRetries struct {
485+
client.Client
486+
failCount int
487+
currentCount *atomic.Int32
488+
}
489+
490+
func (m *mockClientWithRetries) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
491+
count := m.currentCount.Add(1)
492+
if count <= int32(m.failCount) {
493+
return fmt.Errorf("connection refused")
494+
}
495+
return m.Client.Get(ctx, key, obj, opts...)
496+
}
497+
498+
// Mock client that changes plan state after N calls
499+
type mockClientWithStateChange struct {
500+
client.Client
501+
plan *apv1b2.Plan
502+
callCount int
503+
callsUntil int
504+
}
505+
506+
func (m *mockClientWithStateChange) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
507+
m.callCount++
508+
err := m.Client.Get(ctx, key, obj, opts...)
509+
if err != nil {
510+
return err
511+
}
512+
513+
// After N calls, mark the plan as completed
514+
if m.callCount >= m.callsUntil {
515+
if plan, ok := obj.(*apv1b2.Plan); ok {
516+
plan.Status.State = core.PlanCompleted
517+
}
518+
}
519+
520+
return nil
521+
}

0 commit comments

Comments
 (0)