4
4
"context"
5
5
"errors"
6
6
"fmt"
7
- "math"
8
7
"strings"
9
8
"sync"
10
9
"time"
@@ -22,6 +21,36 @@ import (
22
21
"golang.org/x/sync/errgroup"
23
22
)
24
23
24
// STOPPED_MACHINES_POOL_SIZE is the upper bound on the pool size handed to
// updateProcessGroup for machines that are not in the "started" state.
const STOPPED_MACHINES_POOL_SIZE = 30
27
+
28
+ type MachineLogger struct {
29
+ store map [string ]statuslogger.StatusLine
30
+ sl statuslogger.StatusLogger
31
+ }
32
+
33
+ func NewMachineLogger (store map [string ]statuslogger.StatusLine , sl statuslogger.StatusLogger ) * MachineLogger {
34
+ return & MachineLogger {
35
+ store : store ,
36
+ sl : sl ,
37
+ }
38
+ }
39
+
40
+ func (m * MachineLogger ) initFromMachinePairs (mp []machinePairing ) {
41
+ for idx , machPair := range mp {
42
+ if machPair .oldMachine != nil {
43
+ m .store [machPair .oldMachine .ID ] = m .sl .Line (idx )
44
+ } else if machPair .newMachine != nil {
45
+ m .store [machPair .newMachine .ID ] = m .sl .Line (idx )
46
+ }
47
+ }
48
+ }
49
+
50
+ func (m * MachineLogger ) getLoggerFromID (id string ) statuslogger.StatusLine {
51
+ return m .store [id ]
52
+ }
53
+
25
54
type AppState struct {
26
55
Machines []* fly.Machine
27
56
}
@@ -130,6 +159,13 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
130
159
defer sl .Destroy (false )
131
160
}
132
161
162
+ machineLogger := NewMachineLogger (
163
+ map [string ]statuslogger.StatusLine {},
164
+ sl ,
165
+ )
166
+
167
+ machineLogger .initFromMachinePairs (machineTuples )
168
+
133
169
machPairByProcessGroup := lo .GroupBy (machineTuples , func (machPair machinePairing ) string {
134
170
if machPair .oldMachine != nil {
135
171
return machPair .oldMachine .ProcessGroup ()
@@ -140,15 +176,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
140
176
}
141
177
})
142
178
143
- var poolSize int
144
- switch mu := md .maxUnavailable ; {
145
- case mu >= 1 :
146
- poolSize = int (mu )
147
- case mu > 0 :
148
- poolSize = int (math .Ceil (float64 (len (machineTuples )) * mu ))
149
- default :
150
- return fmt .Errorf ("Invalid --max-unavailable value: %v" , mu )
151
- }
179
+ poolSize := md .getPoolSize (len (machineTuples ))
152
180
153
181
if ! settings .skipLeaseAcquisition {
154
182
attempts := 0
@@ -158,7 +186,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
158
186
}()
159
187
160
188
for {
161
- err := md .acquireLeases (ctx , machineTuples , poolSize , sl )
189
+ err := md .acquireLeases (ctx , machineTuples , poolSize , machineLogger )
162
190
if err == nil {
163
191
break
164
192
}
@@ -172,39 +200,78 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
172
200
}
173
201
174
202
defer func () {
175
- err := md .releaseLeases (ctx , machineTuples , sl )
203
+ err := md .releaseLeases (ctx , machineTuples , machineLogger )
176
204
if err != nil {
177
205
fmt .Fprintln (md .io .ErrOut , "Failed to release leases:" , err )
178
206
span .RecordError (err )
179
207
}
180
208
}()
181
209
}
182
210
183
- statusLines := map [string ]statuslogger.StatusLine {}
184
- for idx , machPair := range machineTuples {
185
- if machPair .oldMachine != nil {
186
- statusLines [machPair .oldMachine .ID ] = sl .Line (idx )
187
- } else if machPair .newMachine != nil {
188
- statusLines [machPair .newMachine .ID ] = sl .Line (idx )
189
- }
190
- }
191
-
192
211
pgroup := errgroup.Group {}
193
212
pgroup .SetLimit (rollingStrategyMaxConcurrentGroups )
194
213
195
214
// We want to update by process group
196
215
for _ , machineTuples := range machPairByProcessGroup {
197
216
machineTuples := machineTuples
198
217
pgroup .Go (func () error {
199
- err := md .updateProcessGroup (ctx , machineTuples , statusLines , poolSize )
200
- if err != nil && strings .Contains (err .Error (), "lease currently held by" ) {
201
- err := & unrecoverableError {err : err }
218
+ eg , ctx := errgroup .WithContext (ctx )
219
+
220
+ warmMachines := lo .Filter (machineTuples , func (e machinePairing , i int ) bool {
221
+ if e .oldMachine != nil && e .oldMachine .State == "started" {
222
+ return true
223
+ }
224
+ if e .newMachine != nil && e .newMachine .State == "started" {
225
+ return true
226
+ }
227
+ return false
228
+ })
229
+
230
+ coldMachines := lo .Filter (machineTuples , func (e machinePairing , i int ) bool {
231
+ if e .oldMachine != nil && e .oldMachine .State != "started" {
232
+ return true
233
+ }
234
+ if e .newMachine != nil && e .newMachine .State != "started" {
235
+ return true
236
+ }
237
+ return false
238
+ })
239
+
240
+ eg .Go (func () (err error ) {
241
+ poolSize := len (coldMachines )
242
+ if poolSize >= STOPPED_MACHINES_POOL_SIZE {
243
+ poolSize = STOPPED_MACHINES_POOL_SIZE
244
+ }
245
+
246
+ if len (coldMachines ) > 0 {
247
+ // for cold machines, we can update all of them at once.
248
+ // there's no need for protection against downtime since the machines are already stopped
249
+ return md .updateProcessGroup (ctx , coldMachines , machineLogger , poolSize )
250
+ }
251
+
252
+ return nil
253
+ })
254
+
255
+ eg .Go (func () (err error ) {
256
+ // for warm machines, we update them in chunks of size, md.maxUnavailable.
257
+ // this is to prevent downtime or increased latency during deployments
258
+ poolSize := md .getPoolSize (len (warmMachines ))
259
+ if len (warmMachines ) > 0 {
260
+ return md .updateProcessGroup (ctx , warmMachines , machineLogger , poolSize )
261
+ }
262
+ return nil
263
+ })
264
+
265
+ err := eg .Wait ()
266
+ if err != nil {
202
267
span .RecordError (err )
268
+ if strings .Contains (err .Error (), "lease currently held by" ) {
269
+ err = & unrecoverableError {err : err }
270
+ }
203
271
return err
204
272
}
205
273
206
- span .RecordError (err )
207
- return err
274
+ return nil
208
275
})
209
276
}
210
277
@@ -259,7 +326,7 @@ func (md *machineDeployment) updateMachinesWRecovery(ctx context.Context, oldApp
259
326
return nil
260
327
}
261
328
262
- func (md * machineDeployment ) updateProcessGroup (ctx context.Context , machineTuples []machinePairing , statusLines map [ string ]statuslogger. StatusLine , poolSize int ) error {
329
+ func (md * machineDeployment ) updateProcessGroup (ctx context.Context , machineTuples []machinePairing , machineLogger * MachineLogger , poolSize int ) error {
263
330
ctx , span := tracing .GetTracer ().Start (ctx , "update_process_group" )
264
331
defer span .End ()
265
332
@@ -277,9 +344,9 @@ func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTupl
277
344
278
345
var sl statuslogger.StatusLine
279
346
if oldMachine != nil {
280
- sl = statusLines [ oldMachine .ID ]
347
+ sl = machineLogger . getLoggerFromID ( oldMachine .ID )
281
348
} else if newMachine != nil {
282
- sl = statusLines [ newMachine .ID ]
349
+ sl = machineLogger . getLoggerFromID ( newMachine .ID )
283
350
}
284
351
285
352
err := md .updateMachineWChecks (ctx , oldMachine , newMachine , sl , md .io , machineCheckResult )
@@ -300,18 +367,15 @@ func (md *machineDeployment) updateProcessGroup(ctx context.Context, machineTupl
300
367
return nil
301
368
}
302
369
303
- func (md * machineDeployment ) acquireLeases (ctx context.Context , machineTuples []machinePairing , poolSize int , statusLogger statuslogger. StatusLogger ) error {
370
+ func (md * machineDeployment ) acquireLeases (ctx context.Context , machineTuples []machinePairing , poolSize int , machToLogger * MachineLogger ) error {
304
371
ctx , span := tracing .GetTracer ().Start (ctx , "acquire_leases" )
305
372
306
373
leaseGroup := errgroup.Group {}
307
374
leaseGroup .SetLimit (poolSize )
308
375
309
- for idx , machineTuple := range machineTuples {
376
+ for _ , machineTuple := range machineTuples {
310
377
machineTuple := machineTuple
311
- idx := idx
312
-
313
378
leaseGroup .Go (func () error {
314
- sl := statusLogger .Line (idx )
315
379
316
380
var machine * fly.Machine
317
381
if machineTuple .oldMachine != nil {
@@ -321,6 +385,7 @@ func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []
321
385
} else {
322
386
return nil
323
387
}
388
+ sl := machToLogger .getLoggerFromID (machine .ID )
324
389
325
390
if machine .LeaseNonce != "" {
326
391
sl .LogStatus (statuslogger .StatusRunning , fmt .Sprintf ("Already have lease for %s" , machine .ID ))
@@ -351,20 +416,18 @@ func (md *machineDeployment) acquireLeases(ctx context.Context, machineTuples []
351
416
return nil
352
417
}
353
418
354
- func (md * machineDeployment ) releaseLeases (ctx context.Context , machineTuples []machinePairing , statusLogger statuslogger. StatusLogger ) error {
419
+ func (md * machineDeployment ) releaseLeases (ctx context.Context , machineTuples []machinePairing , machToLogger * MachineLogger ) error {
355
420
ctx = context .WithoutCancel (ctx )
356
421
ctx , span := tracing .GetTracer ().Start (ctx , "release_leases" )
357
422
defer span .End ()
358
423
359
424
leaseGroup := errgroup.Group {}
360
425
leaseGroup .SetLimit (len (machineTuples ))
361
426
362
- for idx , machineTuple := range machineTuples {
427
+ for _ , machineTuple := range machineTuples {
363
428
machineTuple := machineTuple
364
- idx := idx
365
429
366
430
leaseGroup .Go (func () error {
367
- sl := statusLogger .Line (idx )
368
431
369
432
var machine * fly.Machine
370
433
if machineTuple .oldMachine != nil {
@@ -375,6 +438,8 @@ func (md *machineDeployment) releaseLeases(ctx context.Context, machineTuples []
375
438
return nil
376
439
}
377
440
441
+ sl := machToLogger .getLoggerFromID (machine .ID )
442
+
378
443
sl .LogStatus (statuslogger .StatusRunning , fmt .Sprintf ("Clearing lease for %s" , machine .ID ))
379
444
if machine .LeaseNonce == "" {
380
445
sl .LogStatus (statuslogger .StatusSuccess , fmt .Sprintf ("Cleared lease for %s" , machine .ID ))
0 commit comments