Skip to content

Commit 975af7b

Browse files
authored
Improve machine wait and update reliability (#4765)
* A stopped machine won't start after an update * fixxxxx * Cache preflight region auto-selection Resolve primary and secondary regions once per process instead of re-querying per test env. Keep fallback failures as warnings and move successful auto-selection to debug logging to reduce noise. * fix unit tests * go go go * poor man version of wait for settled state * split that test * Fail machine update when settled state is failed * Add machine wait command with resilient retries * Use machine wait in preflight deploy detach tests * Handle lease refresh safely after machine destruction * greater than 1
1 parent c12bb49 commit 975af7b

File tree

13 files changed

+322
-164
lines changed

13 files changed

+322
-164
lines changed

internal/command/deploy/machines_launchinput.go

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package deploy
22

33
import (
44
"fmt"
5+
"slices"
56
"strconv"
67
"strings"
78

@@ -297,18 +298,14 @@ func skipLaunch(origMachineRaw *fly.Machine, mConfig *fly.MachineConfig) bool {
297298
}
298299

299300
switch {
300-
case state == fly.MachineStateStarted:
301+
case slices.Contains([]string{fly.MachineStateStarted, "starting", "failed"}, state):
301302
return false
302303
case len(mConfig.Standbys) > 0:
303304
return true
304-
case state == fly.MachineStateStopped, state == fly.MachineStateSuspended:
305-
for _, s := range mConfig.Services {
306-
if (s.Autostop != nil && *s.Autostop != fly.MachineAutostopOff) || (s.Autostart != nil && *s.Autostart) {
307-
return true
308-
}
309-
}
305+
case origMachineRaw == nil:
306+
return false
310307
}
311-
return false
308+
return true
312309
}
313310

314311
// updateContainerImage sets container.Image = mConfig.Image in any container where image == "."

internal/command/deploy/machines_launchinput_test.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
)
1616

1717
func makeTerminalLoggerQuiet(tb testing.TB) {
18-
var originalLogger = terminal.DefaultLogger
18+
originalLogger := terminal.DefaultLogger
1919
terminal.DefaultLogger = logger.New(os.Stdout, logger.Error, true)
2020

2121
tb.Cleanup(func() {
@@ -85,6 +85,7 @@ func testLaunchInputForBasic(t *testing.T) {
8585
Region: li.Region,
8686
Config: helpers.Clone(li.Config),
8787
HostStatus: fly.HostStatusOk,
88+
State: fly.MachineStateStarted,
8889
}
8990
// also must preserve any user's added metadata except for known fly metadata keys
9091
origMachineRaw.Config.Metadata["user-added-me"] = "keep it"
@@ -104,6 +105,7 @@ func testLaunchInputForBasic(t *testing.T) {
104105
Region: li.Region,
105106
Config: helpers.Clone(li.Config),
106107
HostStatus: fly.HostStatusOk,
108+
State: fly.MachineStateStarted,
107109
}
108110
want.Config.Image = "super/globe"
109111
want.Config.Env["NOT_SET_ON_RESTART_ONLY"] = "true"
@@ -253,7 +255,6 @@ func testLaunchInputForOnMounts(t *testing.T) {
253255
assert.Equal(t, "ab1234567890", li.ID)
254256
assert.True(t, li.RequiresReplacement)
255257
assert.Empty(t, li.Config.Mounts)
256-
257258
}
258259

259260
// test mounts with auto volume resize

internal/command/deploy/machines_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ func Test_resolveUpdatedMachineConfig_Mounts(t *testing.T) {
279279
}, li)
280280

281281
origMachine := &fly.Machine{
282+
State: fly.MachineStateStarted,
282283
HostStatus: fly.HostStatusOk,
283284
Config: &fly.MachineConfig{
284285
Mounts: []fly.MachineMount{{
@@ -329,6 +330,7 @@ func Test_resolveUpdatedMachineConfig_restartOnly(t *testing.T) {
329330
md.img = "SHOULD-NOT-USE-THIS-TAG"
330331

331332
origMachine := &fly.Machine{
333+
State: fly.MachineStateStarted,
332334
HostStatus: fly.HostStatusOk,
333335
ID: "OrigID",
334336
Config: &fly.MachineConfig{
@@ -371,6 +373,7 @@ func Test_resolveUpdatedMachineConfig_restartOnlyProcessGroup(t *testing.T) {
371373
md.img = "SHOULD-NOT-USE-THIS-TAG"
372374

373375
origMachine := &fly.Machine{
376+
State: fly.MachineStateStarted,
374377
HostStatus: fly.HostStatusOk,
375378
ID: "OrigID",
376379
Config: &fly.MachineConfig{

internal/command/machine/machine.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Machines REST fly.`
2929
newStart(),
3030
newStop(),
3131
newStatus(),
32+
newWait(),
3233
newProxy(),
3334
newClone(),
3435
newUpdate(),

internal/command/machine/update.go

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import (
1616
"github.com/superfly/flyctl/internal/flag"
1717
"github.com/superfly/flyctl/internal/flyerr"
1818
mach "github.com/superfly/flyctl/internal/machine"
19-
"github.com/superfly/flyctl/internal/watch"
2019
)
2120

2221
func newUpdate() *cobra.Command {
@@ -79,9 +78,7 @@ func newUpdate() *cobra.Command {
7978

8079
func runUpdate(ctx context.Context) (err error) {
8180
var (
82-
io = iostreams.FromContext(ctx)
83-
colorize = io.ColorScheme()
84-
81+
io = iostreams.FromContext(ctx)
8582
autoConfirm = flag.GetBool(ctx, "yes")
8683
skipHealthChecks = flag.GetBool(ctx, "skip-health-checks")
8784
skipStart = flag.GetBool(ctx, "skip-start")
@@ -169,20 +166,10 @@ func runUpdate(ctx context.Context) (err error) {
169166
Descript: timeoutErr.Description(),
170167
Suggest: "Try increasing the --wait-timeout",
171168
}
172-
173169
}
174170
return err
175171
}
176172

177-
if !(input.SkipLaunch || flag.GetDetach(ctx)) {
178-
fmt.Fprintln(io.Out, colorize.Green("==> "+"Monitoring health checks"))
179-
180-
if err := watch.MachinesChecks(ctx, appName, []*fly.Machine{machine}); err != nil {
181-
return err
182-
}
183-
fmt.Fprintln(io.Out)
184-
}
185-
186173
fmt.Fprintf(io.Out, "\nMonitor machine status here:\nhttps://fly.io/apps/%s/machines/%s\n", appName, machine.ID)
187174

188175
return nil

internal/command/machine/wait.go

Lines changed: 185 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,185 @@
1+
package machine
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"net/http"
8+
"strings"
9+
"time"
10+
11+
"github.com/spf13/cobra"
12+
"github.com/superfly/fly-go/flaps"
13+
"github.com/superfly/flyctl/internal/appconfig"
14+
"github.com/superfly/flyctl/internal/command"
15+
"github.com/superfly/flyctl/internal/flag"
16+
"github.com/superfly/flyctl/internal/flapsutil"
17+
"github.com/superfly/flyctl/iostreams"
18+
)
19+
20+
func newWait() *cobra.Command {
21+
const (
22+
short = "Wait for a machine to reach a state"
23+
long = short + "\n"
24+
25+
usage = "wait [id]"
26+
)
27+
28+
cmd := command.New(usage, short, long, runMachineWait,
29+
command.RequireSession,
30+
command.LoadAppNameIfPresent,
31+
)
32+
33+
cmd.Args = cobra.RangeArgs(0, 1)
34+
35+
flag.Add(
36+
cmd,
37+
flag.App(),
38+
flag.AppConfig(),
39+
selectFlag,
40+
flag.String{
41+
Name: "state",
42+
Description: "Machine state to wait for",
43+
Default: "settled",
44+
},
45+
flag.Duration{
46+
Name: "wait-timeout",
47+
Shorthand: "w",
48+
Description: "Time duration to wait for the machine to reach the requested state.",
49+
Default: 5 * time.Minute,
50+
},
51+
)
52+
53+
return cmd
54+
}
55+
56+
func runMachineWait(ctx context.Context) error {
57+
var (
58+
io = iostreams.FromContext(ctx)
59+
desiredState = flag.GetString(ctx, "state")
60+
waitTimeout = flag.GetDuration(ctx, "wait-timeout")
61+
)
62+
63+
if desiredState == "" {
64+
return fmt.Errorf("state cannot be empty")
65+
}
66+
67+
machineID := flag.FirstArg(ctx)
68+
haveMachineID := len(flag.Args(ctx)) > 0
69+
machine, ctx, err := selectOneMachine(ctx, "", machineID, haveMachineID)
70+
if err != nil {
71+
return err
72+
}
73+
74+
appName := appconfig.NameFromContext(ctx)
75+
client := flapsutil.ClientFromContext(ctx)
76+
77+
fmt.Fprintf(io.Out, "Waiting up to %s for machine %s to reach %q...\n", waitTimeout, machine.ID, desiredState)
78+
79+
startedWaitAt := time.Now()
80+
const maxAttempts = 3
81+
82+
for attempt := 1; attempt <= maxAttempts; attempt++ {
83+
remainingTimeout := waitTimeout
84+
if waitTimeout > 0 {
85+
remainingTimeout = waitTimeout - time.Since(startedWaitAt)
86+
if remainingTimeout <= 0 {
87+
return fmt.Errorf("machine %s did not reach %q within %s", machine.ID, desiredState, waitTimeout)
88+
}
89+
}
90+
91+
err = client.Wait(ctx, appName, machine, desiredState, remainingTimeout)
92+
if err == nil {
93+
break
94+
}
95+
96+
if attempt == maxAttempts || !isRetryableWaitError(err) {
97+
return fmt.Errorf("machine %s did not reach %q within %s: %w", machine.ID, desiredState, waitTimeout, err)
98+
}
99+
100+
fmt.Fprintf(io.Out, "Retrying wait for machine %s due to transient error: %v\n", machine.ID, err)
101+
102+
machine, err = client.Get(ctx, appName, machine.ID)
103+
if err != nil {
104+
return fmt.Errorf("machine %s could not be refetched before retrying wait: %w", machine.ID, err)
105+
}
106+
107+
retryDelay := retryDelayForAttempt(attempt)
108+
if waitTimeout > 0 {
109+
remainingTimeout = waitTimeout - time.Since(startedWaitAt)
110+
if remainingTimeout <= 0 {
111+
return fmt.Errorf("machine %s did not reach %q within %s", machine.ID, desiredState, waitTimeout)
112+
}
113+
if retryDelay > remainingTimeout {
114+
retryDelay = remainingTimeout
115+
}
116+
}
117+
118+
select {
119+
case <-time.After(retryDelay):
120+
case <-ctx.Done():
121+
return ctx.Err()
122+
}
123+
}
124+
125+
if desiredState == "settled" {
126+
machine, err = client.Get(ctx, appName, machine.ID)
127+
if err != nil {
128+
return fmt.Errorf("machine %s reached settled state but could not fetch final state: %w", machine.ID, err)
129+
}
130+
desiredState = machine.State
131+
}
132+
133+
fmt.Fprintf(io.Out, "Machine %s reached state %q\n", machine.ID, desiredState)
134+
135+
return nil
136+
}
137+
138+
func isRetryableWaitError(err error) bool {
139+
if err == nil {
140+
return false
141+
}
142+
143+
message := strings.ToLower(err.Error())
144+
if strings.Contains(message, "currently replaced") {
145+
return true
146+
}
147+
148+
var flapsErr *flaps.FlapsError
149+
if errors.As(err, &flapsErr) {
150+
if flapsErr.ResponseStatusCode == http.StatusTooManyRequests || (flapsErr.ResponseStatusCode >= 500 && flapsErr.ResponseStatusCode < 600) {
151+
return true
152+
}
153+
}
154+
155+
transientSubstrings := []string{
156+
"connection reset by peer",
157+
"connection refused",
158+
"network is unreachable",
159+
"temporary failure in name resolution",
160+
"i/o timeout",
161+
"timeout",
162+
"eof",
163+
}
164+
165+
for _, s := range transientSubstrings {
166+
if strings.Contains(message, s) {
167+
return true
168+
}
169+
}
170+
171+
return false
172+
}
173+
174+
func retryDelayForAttempt(attempt int) time.Duration {
175+
if attempt <= 0 {
176+
return 500 * time.Millisecond
177+
}
178+
179+
delay := 500 * time.Millisecond
180+
for i := 1; i < attempt; i++ {
181+
delay *= 2
182+
}
183+
184+
return delay
185+
}

internal/machine/leasable_machine.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ func (lm *leasableMachine) Destroy(ctx context.Context, kill bool) error {
142142

143143
func (lm *leasableMachine) Cordon(ctx context.Context) error {
144144
if lm.IsDestroyed() {
145-
return fmt.Errorf("cannon cordon machine %s that was already destroyed", lm.machine.ID)
145+
return fmt.Errorf("cannot cordon machine %s that was already destroyed", lm.machine.ID)
146146
}
147147

148148
return lm.flapsClient.Cordon(ctx, lm.appName, lm.machine.ID, lm.leaseNonce)
@@ -535,6 +535,10 @@ func (lm *leasableMachine) refreshLeaseUntilCanceled(ctx context.Context, durati
535535

536536
for {
537537
time.Sleep(b.Duration())
538+
if lm.IsDestroyed() {
539+
return
540+
}
541+
538542
switch err := lm.RefreshLease(ctx, duration); {
539543
case err == nil:
540544
// good times
@@ -556,7 +560,7 @@ func (lm *leasableMachine) ReleaseLease(ctx context.Context) error {
556560

557561
nonce := lm.leaseNonce
558562
lm.resetLease()
559-
if nonce == "" {
563+
if nonce == "" || lm.IsDestroyed() {
560564
return nil
561565
}
562566

internal/machine/update.go

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,25 +85,23 @@ func Update(ctx context.Context, appName string, m *fly.Machine, input *fly.Laun
8585
return fmt.Errorf("could not update machine %s: %w", m.ID, err)
8686
}
8787

88-
waitForAction := "start"
89-
if input.SkipLaunch || m.Config.Schedule != "" || m.State != fly.MachineStateStarted {
90-
waitForAction = "stop"
91-
}
92-
9388
waitTimeout := time.Second * 300
9489
if input.Timeout != 0 {
9590
waitTimeout = time.Duration(input.Timeout) * time.Second
9691
}
9792

98-
if err := WaitForStartOrStop(ctx, appName, updatedMachine, waitForAction, waitTimeout); err != nil {
99-
return err
93+
state, err := WaitForState(ctx, appName, updatedMachine, "settled", waitTimeout)
94+
if err != nil {
95+
return fmt.Errorf("error while waiting for machine to update: %w", err)
96+
}
97+
98+
if state == "failed" {
99+
return fmt.Errorf("machine %s update failed: machine entered %q state", m.ID, state)
100100
}
101101

102-
if !input.SkipLaunch {
103-
if !input.SkipHealthChecks {
104-
if err := watch.MachinesChecks(ctx, appName, []*fly.Machine{updatedMachine}); err != nil {
105-
return fmt.Errorf("failed to wait for health checks to pass: %w", err)
106-
}
102+
if state == "started" && !input.SkipHealthChecks {
103+
if err := watch.MachinesChecks(ctx, appName, []*fly.Machine{updatedMachine}); err != nil {
104+
return fmt.Errorf("failed to wait for health checks to pass: %w", err)
107105
}
108106
}
109107

0 commit comments

Comments
 (0)