Skip to content

Commit d8dca83

Browse files
authored
fix downtime-causing bug during rolling deployments (#3817)
* fix bug causing downtime during rolling deployments
* fix bug in recoverable deploys
* fix bug causing test to fail
* refactor statuslogger to machine id mapper
* refactor pool sizing
* use errgroup instead of waitgroups and errchan
1 parent 0e5d2d1 commit d8dca83

File tree

2 files changed

+154
-53
lines changed

2 files changed

+154
-53
lines changed

internal/command/deploy/machines_deploymachinesapp.go

Lines changed: 51 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -758,39 +758,74 @@ func (md *machineDeployment) updateUsingRollingStrategy(parentCtx context.Contex
758758

759759
for group, entries := range entriesByGroup {
760760
entries := entries
761-
startIdx += len(entries)
761+
762+
warmMachines := lo.Filter(entries, func(e *machineUpdateEntry, i int) bool {
763+
return e.leasableMachine.Machine().State == "started"
764+
})
765+
coldMachines := lo.Filter(entries, func(e *machineUpdateEntry, i int) bool {
766+
return e.leasableMachine.Machine().State != "started"
767+
})
768+
762769
groupsPool.Go(func(ctx context.Context) error {
763-
return md.updateEntriesGroup(ctx, group, entries, sl, startIdx-len(entries))
770+
eg, ctx := errgroup.WithContext(ctx)
771+
772+
eg.Go(func() (err error) {
773+
poolSize := len(coldMachines)
774+
if poolSize >= STOPPED_MACHINES_POOL_SIZE {
775+
poolSize = STOPPED_MACHINES_POOL_SIZE
776+
}
777+
778+
if len(coldMachines) > 0 {
779+
// for cold machines, we can update all of them at once.
780+
// there's no need for protection against downtime since the machines are already stopped
781+
startIdx += len(coldMachines)
782+
return md.updateEntriesGroup(ctx, group, coldMachines, sl, startIdx-len(coldMachines), poolSize)
783+
}
784+
785+
return nil
786+
})
787+
788+
eg.Go(func() (err error) {
789+
// for warm machines, we update them in chunks of size, md.maxUnavailable.
790+
// this is to prevent downtime/low-latency during deployments
791+
startIdx += len(warmMachines)
792+
poolSize := md.getPoolSize(len(warmMachines))
793+
if len(warmMachines) > 0 {
794+
return md.updateEntriesGroup(ctx, group, warmMachines, sl, startIdx-len(warmMachines), poolSize)
795+
}
796+
return nil
797+
})
798+
799+
return eg.Wait()
764800
})
765801
}
766802

767803
err := groupsPool.Wait()
768804
if err != nil {
769805
span.RecordError(err)
770806
}
807+
771808
return err
772809
}
773810

774-
func (md *machineDeployment) updateEntriesGroup(parentCtx context.Context, group string, entries []*machineUpdateEntry, sl statuslogger.StatusLogger, startIdx int) error {
811+
func (md *machineDeployment) getPoolSize(totalMachines int) int {
812+
switch mu := md.maxUnavailable; {
813+
case mu >= 1:
814+
return int(mu)
815+
default:
816+
return int(math.Ceil(float64(totalMachines) * mu))
817+
}
818+
}
819+
820+
func (md *machineDeployment) updateEntriesGroup(parentCtx context.Context, group string, entries []*machineUpdateEntry, sl statuslogger.StatusLogger, startIdx int, poolSize int) error {
775821
parentCtx, span := tracing.GetTracer().Start(parentCtx, "update_entries_in_group", trace.WithAttributes(
776822
attribute.Int("start_id", startIdx),
777823
attribute.String("group", group),
778824
attribute.Int("max_unavailable", int(md.maxUnavailable)),
825+
attribute.Int("pool_size", poolSize),
779826
))
780827
defer span.End()
781828

782-
var poolSize int
783-
switch mu := md.maxUnavailable; {
784-
case mu >= 1:
785-
poolSize = int(mu)
786-
case mu > 0:
787-
poolSize = int(math.Ceil(float64(len(entries)) * mu))
788-
default:
789-
return fmt.Errorf("Invalid --max-unavailable value: %v", mu)
790-
}
791-
792-
span.SetAttributes(attribute.Int("pool_size", poolSize))
793-
794829
updatePool := pool.New().
795830
WithErrors().
796831
WithMaxGoroutines(poolSize).
@@ -801,6 +836,7 @@ func (md *machineDeployment) updateEntriesGroup(parentCtx context.Context, group
801836
e := e
802837
eCtx := statuslogger.NewContext(parentCtx, sl.Line(startIdx+idx))
803838
fmtID := e.leasableMachine.FormattedMachineId()
839+
span.SetAttributes(attribute.String("state", e.leasableMachine.Machine().State))
804840

805841
statusRunning := func() {
806842
statuslogger.LogfStatus(eCtx,

internal/command/deploy/plan.go

Lines changed: 103 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"errors"
66
"fmt"
7-
"math"
87
"strings"
98
"sync"
109
"time"
@@ -22,6 +21,36 @@ import (
2221
"golang.org/x/sync/errgroup"
2322
)
2423

24+
const (
	// STOPPED_MACHINES_POOL_SIZE caps how many stopped ("cold") machines are
	// updated concurrently. Cold machines serve no traffic, so they are
	// updated in large batches instead of respecting --max-unavailable.
	// NOTE(review): ALL_CAPS is unconventional for Go constants (MixedCaps
	// is preferred), but renaming would touch every call site.
	STOPPED_MACHINES_POOL_SIZE = 30
)
27+
28+
type MachineLogger struct {
29+
store map[string]statuslogger.StatusLine
30+
sl statuslogger.StatusLogger
31+
}
32+
33+
func NewMachineLogger(store map[string]statuslogger.StatusLine, sl statuslogger.StatusLogger) *MachineLogger {
34+
return &MachineLogger{
35+
store: store,
36+
sl: sl,
37+
}
38+
}
39+
40+
func (m *MachineLogger) initFromMachinePairs(mp []machinePairing) {
41+
for idx, machPair := range mp {
42+
if machPair.oldMachine != nil {
43+
m.store[machPair.oldMachine.ID] = m.sl.Line(idx)
44+
} else if machPair.newMachine != nil {
45+
m.store[machPair.newMachine.ID] = m.sl.Line(idx)
46+
}
47+
}
48+
}
49+
50+
func (m *MachineLogger) getLoggerFromID(id string) statuslogger.StatusLine {
51+
return m.store[id]
52+
}
53+
2554
type AppState struct {
2655
Machines []*fly.Machine
2756
}
@@ -130,6 +159,13 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
130159
defer sl.Destroy(false)
131160
}
132161

162+
machineLogger := NewMachineLogger(
163+
map[string]statuslogger.StatusLine{},
164+
sl,
165+
)
166+
167+
machineLogger.initFromMachinePairs(machineTuples)
168+
133169
machPairByProcessGroup := lo.GroupBy(machineTuples, func(machPair machinePairing) string {
134170
if machPair.oldMachine != nil {
135171
return machPair.oldMachine.ProcessGroup()
@@ -140,15 +176,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
140176
}
141177
})
142178

143-
var poolSize int
144-
switch mu := md.maxUnavailable; {
145-
case mu >= 1:
146-
poolSize = int(mu)
147-
case mu > 0:
148-
poolSize = int(math.Ceil(float64(len(machineTuples)) * mu))
149-
default:
150-
return fmt.Errorf("Invalid --max-unavailable value: %v", mu)
151-
}
179+
poolSize := md.getPoolSize(len(machineTuples))
152180

153181
if !settings.skipLeaseAcquisition {
154182
attempts := 0
@@ -158,7 +186,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
158186
}()
159187

160188
for {
161-
err := md.acquireLeases(ctx, machineTuples, poolSize, sl)
189+
err := md.acquireLeases(ctx, machineTuples, poolSize, machineLogger)
162190
if err == nil {
163191
break
164192
}
@@ -172,39 +200,78 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
172200
}
173201

174202
defer func() {
175-
err := md.releaseLeases(ctx, machineTuples, sl)
203+
err := md.releaseLeases(ctx, machineTuples, machineLogger)
176204
if err != nil {
177205
fmt.Fprintln(md.io.ErrOut, "Failed to release leases:", err)
178206
span.RecordError(err)
179207
}
180208
}()
181209
}
182210

183-
statusLines := map[string]statuslogger.StatusLine{}
184-
for idx, machPair := range machineTuples {
185-
if machPair.oldMachine != nil {
186-
statusLines[machPair.oldMachine.ID] = sl.Line(idx)
187-
} else if machPair.newMachine != nil {
188-
statusLines[machPair.newMachine.ID] = sl.Line(idx)
189-
}
190-
}
191-
192211
pgroup := errgroup.Group{}
193212
pgroup.SetLimit(rollingStrategyMaxConcurrentGroups)
194213

195214
// We want to update by process group
196215
for _, machineTuples := range machPairByProcessGroup {
197216
machineTuples := machineTuples
198217
pgroup.Go(func() error {
199-
err := md.updateProcessGroup(ctx, machineTuples, statusLines, poolSize)
200-
if err != nil && strings.Contains(err.Error(), "lease currently held by") {
201-
err := &unrecoverableError{err: err}
218+
eg, ctx := errgroup.WithContext(ctx)
219+
220+
warmMachines := lo.Filter(machineTuples, func(e machinePairing, i int) bool {
221+
if e.oldMachine != nil && e.oldMachine.State == "started" {
222+
return true
223+
}
224+
if e.newMachine != nil && e.newMachine.State == "started" {
225+
return true
226+
}
227+
return false
228+
})
229+
230+
coldMachines := lo.Filter(machineTuples, func(e machinePairing, i int) bool {
231+
if e.oldMachine != nil && e.oldMachine.State != "started" {
232+
return true
233+
}
234+
if e.newMachine != nil && e.newMachine.State != "started" {
235+
return true
236+
}
237+
return false
238+
})
239+
240+
eg.Go(func() (err error) {
241+
poolSize := len(coldMachines)
242+
if poolSize >= STOPPED_MACHINES_POOL_SIZE {
243+
poolSize = STOPPED_MACHINES_POOL_SIZE
244+
}
245+
246+
if len(coldMachines) > 0 {
247+
// for cold machines, we can update all of them at once.
248+
// there's no need for protection against downtime since the machines are already stopped
249+
return md.updateProcessGroup(ctx, coldMachines, machineLogger, poolSize)
250+
}
251+
252+
return nil
253+
})
254+
255+
eg.Go(func() (err error) {
256+
// for warm machines, we update them in chunks of size, md.maxUnavailable.
257+
// this is to prevent downtime/low-latency during deployments
258+
poolSize := md.getPoolSize(len(warmMachines))
259+
if len(warmMachines) > 0 {
260+
return md.updateProcessGroup(ctx, warmMachines, machineLogger, poolSize)
261+
}
262+
return nil
263+
})
264+
265+
err := eg.Wait()
266+
if err != nil {
202267
span.RecordError(err)
268+
if strings.Contains(err.Error(), "lease currently held by") {
269+
err = &unrecoverableError{err: err}
270+
}
203271
return err
204272
}
205273

206-
span.RecordError(err)
207-
return err
274+
return nil
208275
})
209276
}
210277

@@ -259,7 +326,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
259326
return nil
260327
}
261328

262-
func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTuples []machinePairing, statusLines map[string]statuslogger.StatusLine, poolSize int) error {
329+
func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTuples []machinePairing, machineLogger *MachineLogger, poolSize int) error {
263330
ctx, span := tracing.GetTracer().Start(ctx, "update_process_group")
264331
defer span.End()
265332

@@ -277,9 +344,9 @@ func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTupl
277344

278345
var sl statuslogger.StatusLine
279346
if oldMachine != nil {
280-
sl = statusLines[oldMachine.ID]
347+
sl = machineLogger.getLoggerFromID(oldMachine.ID)
281348
} else if newMachine != nil {
282-
sl = statusLines[newMachine.ID]
349+
sl = machineLogger.getLoggerFromID(newMachine.ID)
283350
}
284351

285352
err := md.updateMachineWChecks(ctx, oldMachine, newMachine, sl, md.io, machineCheckResult)
@@ -300,18 +367,15 @@ func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTupl
300367
return nil
301368
}
302369

303-
func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []machinePairing, poolSize int, statusLogger statuslogger.StatusLogger) error {
370+
func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []machinePairing, poolSize int, machToLogger *MachineLogger) error {
304371
ctx, span := tracing.GetTracer().Start(ctx, "acquire_leases")
305372

306373
leaseGroup := errgroup.Group{}
307374
leaseGroup.SetLimit(poolSize)
308375

309-
for idx, machineTuple := range machineTuples {
376+
for _, machineTuple := range machineTuples {
310377
machineTuple := machineTuple
311-
idx := idx
312-
313378
leaseGroup.Go(func() error {
314-
sl := statusLogger.Line(idx)
315379

316380
var machine *fly.Machine
317381
if machineTuple.oldMachine != nil {
@@ -321,6 +385,7 @@ func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []
321385
} else {
322386
return nil
323387
}
388+
sl := machToLogger.getLoggerFromID(machine.ID)
324389

325390
if machine.LeaseNonce != "" {
326391
sl.LogStatus(statuslogger.StatusRunning, fmt.Sprintf("Already have lease for %s", machine.ID))
@@ -351,20 +416,18 @@ func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []
351416
return nil
352417
}
353418

354-
func (md *machineDeployment) releaseLeases(ctx context.Context, machineTuples []machinePairing, statusLogger statuslogger.StatusLogger) error {
419+
func (md *machineDeployment) releaseLeases(ctx context.Context, machineTuples []machinePairing, machToLogger *MachineLogger) error {
355420
ctx = context.WithoutCancel(ctx)
356421
ctx, span := tracing.GetTracer().Start(ctx, "release_leases")
357422
defer span.End()
358423

359424
leaseGroup := errgroup.Group{}
360425
leaseGroup.SetLimit(len(machineTuples))
361426

362-
for idx, machineTuple := range machineTuples {
427+
for _, machineTuple := range machineTuples {
363428
machineTuple := machineTuple
364-
idx := idx
365429

366430
leaseGroup.Go(func() error {
367-
sl := statusLogger.Line(idx)
368431

369432
var machine *fly.Machine
370433
if machineTuple.oldMachine != nil {
@@ -375,6 +438,8 @@ func (md *machineDeployment) releaseLeases(ctx context.Context, machineTuples []
375438
return nil
376439
}
377440

441+
sl := machToLogger.getLoggerFromID(machine.ID)
442+
378443
sl.LogStatus(statuslogger.StatusRunning, fmt.Sprintf("Clearing lease for %s", machine.ID))
379444
if machine.LeaseNonce == "" {
380445
sl.LogStatus(statuslogger.StatusSuccess, fmt.Sprintf("Cleared lease for %s", machine.ID))

0 commit comments

Comments
 (0)