Skip to content

Commit 4a8a63c

Browse files
committed
fix: instability in instance updates
This adds a short initial in `WaitForService` and adds some additional checks to verify that the service has hit its desired state. See the inline comment for more details about this change. It also adds some tolerance for transient connection errors from the Patroni API. This lengthens the time to detect a failed instance in exchange for more durability when the Patroni reload operation causes a brief unavailibility. PLAT-98
1 parent 981d2c7 commit 4a8a63c

File tree

2 files changed

+37
-17
lines changed

2 files changed

+37
-17
lines changed

server/internal/database/utils.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,22 @@ func WaitForPatroniRunning(ctx context.Context, patroniClient *patroni.Client, t
1717
ticker := time.NewTicker(5 * time.Second)
1818
defer ticker.Stop()
1919

20+
// We want some tolerance to transient connection errors.
21+
const maxConnectionErrors = 3
22+
var errCount int
23+
2024
for {
2125
select {
2226
case <-ctx.Done():
2327
return ctx.Err()
2428
case <-ticker.C:
2529
status, err := patroniClient.GetInstanceStatus(ctx)
2630
if err != nil {
27-
return fmt.Errorf("failed to get cluster status: %w", err)
31+
errCount++
32+
if errCount >= maxConnectionErrors {
33+
return fmt.Errorf("failed to get cluster status: %w", err)
34+
}
35+
continue
2836
}
2937
if status.InRunningState() {
3038
return nil

server/internal/docker/docker.go

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -320,15 +320,35 @@ func (d *Docker) WaitForService(ctx context.Context, serviceID string, timeout t
320320
ctx, cancel := context.WithTimeout(ctx, timeout)
321321
defer cancel()
322322

323-
errChan := make(chan error)
324-
go func() {
325-
for {
323+
// The Docker Swarm API doesn't give us a reliable way to wait until the
324+
// service spec has been propagated, so calling the inspect endpoint
325+
// immediately after update can return a stale result. This ticker not only
326+
// gives us an elegant way to poll, it also gives us a brief delay before
327+
// the polling starts. That delay should be enough for the service spec to
328+
// propagate.
329+
//
330+
// For comparison, the Docker CLI validates that the service hits a steady
331+
// state for a specified time period, which defaults to 5 seconds:
332+
// https://github.com/docker/cli/blob/8ed44f916fa908a04f03f69369bc2a16e0db7cc9/cli/command/service/progress/progress.go#L143
333+
// We can use a simpler implementation because our service always has a
334+
// health check.
335+
ticker := time.NewTicker(5 * time.Second)
336+
defer ticker.Stop()
337+
338+
for {
339+
select {
340+
case <-ctx.Done():
341+
return ctx.Err()
342+
case <-ticker.C:
326343
service, _, err := d.client.ServiceInspectWithRaw(ctx, serviceID, types.ServiceInspectOptions{})
327344
if err != nil {
328-
errChan <- fmt.Errorf("failed to inspect service: %w", errTranslate(err))
345+
return fmt.Errorf("failed to inspect service: %w", errTranslate(err))
329346
}
330347
if service.Spec.Mode.Replicated == nil {
331-
errChan <- fmt.Errorf("WaitForServiceScale is only usable for replicated services: %w", err)
348+
return fmt.Errorf("WaitForService is only usable for replicated services: %w", err)
349+
}
350+
if service.UpdateStatus != nil && service.UpdateStatus.State != swarm.UpdateStateCompleted {
351+
continue
332352
}
333353
var desired uint64
334354
if r := service.Spec.Mode.Replicated.Replicas; r != nil {
@@ -341,26 +361,18 @@ func (d *Docker) WaitForService(ctx context.Context, serviceID string, timeout t
341361
}),
342362
})
343363
if err != nil {
344-
errChan <- fmt.Errorf("failed to list tasks for service: %w", errTranslate(err))
364+
return fmt.Errorf("failed to list tasks for service: %w", errTranslate(err))
345365
}
346366
var running uint64
347367
for _, t := range tasks {
348-
if t.Status.State == swarm.TaskStateRunning {
368+
if t.DesiredState == swarm.TaskStateRunning && t.Status.State == swarm.TaskStateRunning {
349369
running++
350370
}
351371
}
352372
if running == desired {
353-
errChan <- nil
373+
return nil
354374
}
355-
time.Sleep(time.Second)
356375
}
357-
}()
358-
359-
select {
360-
case err := <-errChan:
361-
return err
362-
case <-ctx.Done():
363-
return fmt.Errorf("timed out waiting for service %q to scale", serviceID)
364376
}
365377
}
366378

0 commit comments

Comments
 (0)