Commit 2618d0c
Retry service-busy errors after a delay (#1174)
Builds on #1167, but adds a delay before retrying service-busy errors. For now, since our server-side RPS quotas are calculated per second, this delays at least 1 second per service-busy error. This is in contrast to the previous behavior, which would have retried up to about a dozen times in the same period; that pattern is what causes service-busy-based retry storms, which in turn generate many more service-busy errors.

---

This also gives us an easy way to make use of "retry after" information in errors we return to the caller, though currently our errors do not contain that. Eventually this should probably come from the server, which has a global view of how many requests this service has sent and can provide a more precise delay to individual callers. E.g. our server-side ratelimiter currently works in 1-second slices, but that isn't guaranteed to stay true. The server could also detect truly large floods of requests and return jittered values larger than 1 second to more powerfully stop the storm, or allow prioritizing some requests (like activity responses) over others simply by returning a lower delay.
1 parent 2bccc5c commit 2618d0c
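As a rough illustration of the mechanism (a standalone sketch, not code from this commit; serviceBusyError and minimumRetryDelay are stand-ins for the generated shared.ServiceBusyError type and the new backoff.ErrRetryableAfter helper shown in the diff below):

package main

import (
    "errors"
    "fmt"
    "time"
)

// serviceBusyError stands in for the generated shared.ServiceBusyError type.
type serviceBusyError struct{}

func (*serviceBusyError) Error() string { return "service busy" }

// minimumRetryDelay mirrors the idea behind ErrRetryableAfter: most errors add
// no extra delay, but service-busy errors demand at least one second.
func minimumRetryDelay(err error) time.Duration {
    var busy *serviceBusyError
    if errors.As(err, &busy) {
        return time.Second
    }
    return 0
}

func main() {
    next := 10 * time.Millisecond // whatever the exponential policy computed
    err := error(&serviceBusyError{})

    // the minimum is *added* to the backoff interval rather than used as a clamp,
    // so repeated busy errors keep spacing requests further apart.
    next += minimumRetryDelay(err)
    fmt.Println(next) // 1.01s
}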

7 files changed: +140 -52 lines

internal/common/backoff/retry.go

Lines changed: 48 additions & 3 deletions
@@ -22,8 +22,11 @@ package backoff

 import (
     "context"
+    "errors"
     "sync"
     "time"
+
+    s "go.uber.org/cadence/.gen/go/shared"
 )

 type (
@@ -87,7 +90,7 @@ func NewConcurrentRetrier(retryPolicy RetryPolicy) *ConcurrentRetrier {
 }

 // Retry function can be used to wrap any call with retry logic using the passed in policy
-func Retry(ctx context.Context, operation Operation, policy RetryPolicy, isRetryable IsRetryable) error {
+func Retry(ctx context.Context, operation Operation, policy RetryPolicy, isRetriable IsRetryable) error {
     var err error
     var next time.Duration

@@ -103,16 +106,40 @@ Retry_Loop:
             return err
         }

-        // Check if the error is retryable
-        if isRetryable != nil && !isRetryable(err) {
+        if !isRetriable(err) {
             return err
         }

+        retryAfter := ErrRetryableAfter(err)
+        // update the time to wait until the next attempt.
+        // as this is a *minimum*, just add it to the current delay time.
+        //
+        // this could be changed to clamp to retryAfter as a minimum.
+        // this is intentionally *not* done here, so repeated service-busy errors are guaranteed
+        // to generate *increasing* amount of time between requests, and not just send N in a row
+        // with 1 second of delay. duplicates imply "still overloaded", so this will hopefully
+        // help reduce the odds of snowballing.
+        // this is a pretty minor thing though, and it should not cause problems if we change it
+        // to make behavior more predictable.
+        next += retryAfter
+
         // check if ctx is done
+        if ctx.Err() != nil {
+            return err
+        }
+
+        // wait for the next retry period (or context timeout)
         if ctxDone := ctx.Done(); ctxDone != nil {
+            // we could check if this is longer than context deadline and immediately fail...
+            // ...but wasting time prevents higher-level retries from trying too early.
+            // this is particularly useful for service-busy, but seems valid for essentially all retried errors.
+            //
+            // this could probably be changed if we get requests for it, but for now it better-protects
+            // the server by preventing "external" retry storms.
             timer := time.NewTimer(next)
             select {
             case <-ctxDone:
+                timer.Stop()
                 return err
             case <-timer.C:
                 continue Retry_Loop
@@ -123,3 +150,21 @@ Retry_Loop:
         time.Sleep(next)
     }
 }
+
+// ErrRetryableAfter returns a minimum delay until the next attempt.
+//
+// for most errors this will be 0, and normal backoff logic will determine
+// the full retry period, but e.g. service busy errors (or any case where the
+// server knows a "time until it is not useful to retry") are safe to assume
+// that a literally immediate retry is *not* going to be useful.
+//
+// note that this is only a minimum, however. longer delays are assumed to
+// be equally valid.
+func ErrRetryableAfter(err error) (retryAfter time.Duration) {
+    if target := (*s.ServiceBusyError)(nil); errors.As(err, &target) {
+        // eventually: return a time-until-retry from the server.
+        // for now though, just ensure at least one second before the next attempt.
+        return time.Second
+    }
+    return 0
+}
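To make the loop's waiting behavior concrete, here is a small self-contained sketch of the same timer-plus-select pattern (waitOrDone is a hypothetical helper, not part of this package): when the minimum delay for a busy error exceeds the caller's context deadline, the context wins and Retry returns after a single attempt, which is what the "timed out due to long minimum delay" test in retry_test.go below exercises.

package main

import (
    "context"
    "fmt"
    "time"
)

// waitOrDone sketches the wait inside Retry's loop: sleep for the computed
// delay, but give up as soon as the context finishes.
func waitOrDone(ctx context.Context, d time.Duration) error {
    t := time.NewTimer(d)
    select {
    case <-ctx.Done():
        t.Stop()
        return ctx.Err()
    case <-t.C:
        return nil
    }
}

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 10*time.Millisecond)
    defer cancel()

    // a 1-second service-busy minimum is far longer than the 10ms deadline,
    // so the caller sees the context error instead of sleeping the full second.
    fmt.Println(waitOrDone(ctx, time.Second)) // context deadline exceeded
}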

internal/common/backoff/retry_test.go

Lines changed: 49 additions & 41 deletions
@@ -27,28 +27,51 @@ import (
     "time"

     "github.com/stretchr/testify/assert"
+    "go.uber.org/cadence/.gen/go/shared"
+)
+
+type errCategory int
+
+const (
+    noErr errCategory = iota
+    anyErr
+    serviceBusyErr
 )

 func TestRetry(t *testing.T) {
     t.Parallel()

+    always := func(err error) bool {
+        return true
+    }
+    never := func(err error) bool {
+        return false
+    }
+
     succeedOnAttemptNum := 5
     tests := []struct {
         name        string
         maxAttempts int
+        maxTime     time.Duration // context timeout
         isRetryable func(error) bool

-        shouldError   bool
+        err           errCategory
         expectedCalls int
     }{
-        {"success", 2 * succeedOnAttemptNum, nil, false, succeedOnAttemptNum},
-        {"too many tries", 3, nil, true, 4}, // max 3 retries == 4 calls. must be < succeedOnAttemptNum to work.
-        {"success with always custom retry", 2 * succeedOnAttemptNum, func(err error) bool {
-            return true // retry on all errors, same as no custom retry
-        }, false, succeedOnAttemptNum},
-        {"success with never custom retry", 2 * succeedOnAttemptNum, func(err error) bool {
-            return false // never retry
-        }, true, 1},
+        {"success", 2 * succeedOnAttemptNum, time.Second, always, noErr, succeedOnAttemptNum},
+        {"too many tries", 3, time.Second, always, anyErr, 4}, // max 3 retries == 4 calls. must be < succeedOnAttemptNum to work.
+        {"success with always custom retry", 2 * succeedOnAttemptNum, time.Second, always, noErr, succeedOnAttemptNum},
+        {"success with never custom retry", 2 * succeedOnAttemptNum, time.Second, never, anyErr, 1},
+
+        // elapsed-time-sensitive tests below.
+        // consider raising time granularity if flaky, or we could set up a more complete mock
+        // to resolve flakiness for real, but that's a fair bit more complex.
+
+        // try -> sleep(10ms) -> try -> sleep(20ms) -> try -> sleep(40ms) -> timeout == 3 calls.
+        {"timed out eventually", 5, 50 * time.Millisecond, always, anyErr, 3},
+
+        // try -> sleep(longer than context timeout due to busy err) -> timeout == 1 call.
+        {"timed out due to long minimum delay", 5, 10 * time.Millisecond, always, serviceBusyErr, 1},
     }

     for _, test := range tests {
@@ -63,49 +86,34 @@ func TestRetry(t *testing.T) {
                     return nil
                 }

-                return &someError{}
+                switch test.err {
+                case noErr:
+                    return &someError{} // non-erroring tests should not reach this branch
+                case anyErr:
+                    return &someError{}
+                case serviceBusyErr:
+                    return &shared.ServiceBusyError{}
+                }
+                panic("unreachable")
             }

-            policy := NewExponentialRetryPolicy(1 * time.Millisecond)
-            policy.SetMaximumInterval(5 * time.Millisecond)
+            policy := NewExponentialRetryPolicy(10 * time.Millisecond)
+            policy.SetMaximumInterval(50 * time.Millisecond)
             policy.SetMaximumAttempts(test.maxAttempts)

-            err := Retry(context.Background(), op, policy, test.isRetryable)
-            if test.shouldError {
-                assert.Error(t, err)
-            } else {
+            ctx, cancel := context.WithTimeout(context.Background(), test.maxTime)
+            defer cancel()
+            err := Retry(ctx, op, policy, test.isRetryable)
+            if test.err == noErr {
                 assert.NoError(t, err, "Retry count: %v", i)
+            } else {
+                assert.Error(t, err)
             }
             assert.Equal(t, test.expectedCalls, i, "wrong number of calls")
         })
     }
 }

-func TestNoRetryAfterContextDone(t *testing.T) {
-    t.Parallel()
-    retryCounter := 0
-    op := func() error {
-        retryCounter++
-
-        if retryCounter == 5 {
-            return nil
-        }
-
-        return &someError{}
-    }
-
-    policy := NewExponentialRetryPolicy(10 * time.Millisecond)
-    policy.SetMaximumInterval(50 * time.Millisecond)
-    policy.SetMaximumAttempts(10)
-
-    ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
-    defer cancel()
-
-    err := Retry(ctx, op, policy, nil)
-    assert.Error(t, err)
-    assert.True(t, retryCounter >= 2, "retryCounter should be at least 2 but was %d", retryCounter) // verify that we did retry
-}
-
 func TestConcurrentRetrier(t *testing.T) {
     t.Parallel()
     a := assert.New(t)

internal/internal_retry.go

Lines changed: 4 additions & 0 deletions
@@ -113,6 +113,10 @@ func isServiceTransientError(err error) bool {
         return false
     }

+    if target := (*s.ServiceBusyError)(nil); errors.As(err, &target) {
+        return true
+    }
+
     // s.InternalServiceError
     // s.ServiceBusyError (must retry after a delay, but it is transient)
     // server-side-only error types (as they should not reach clients)
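The errors.As check matters because a busy error may arrive wrapped by intermediate layers; a self-contained sketch of that behavior (busyError is a stand-in for the generated *shared.ServiceBusyError):

package main

import (
    "errors"
    "fmt"
)

// busyError stands in for the generated *shared.ServiceBusyError.
type busyError struct{ Message string }

func (e *busyError) Error() string { return e.Message }

func main() {
    // errors.As walks the wrap chain, so a busy error wrapped by another layer
    // is still detected and treated as transient.
    err := fmt.Errorf("poll failed: %w", &busyError{Message: "too many requests"})

    var target *busyError
    fmt.Println(errors.As(err, &target)) // true
}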

internal/internal_retry_test.go

Lines changed: 4 additions & 2 deletions
@@ -47,7 +47,8 @@ func TestErrRetries(t *testing.T) {
             &s.RemoteSyncMatchedError{},
             &s.InternalDataInconsistencyError{},
         } {
-            assert.True(t, isServiceTransientError(err), "%T should be transient", err)
+            retryable := isServiceTransientError(err)
+            assert.True(t, retryable, "%T should be transient", err)
         }
     })
     t.Run("terminal", func(t *testing.T) {
@@ -67,7 +68,8 @@ func TestErrRetries(t *testing.T) {

             errShutdown, // shutdowns can't be stopped
         } {
-            assert.False(t, isServiceTransientError(err), "%T should be fatal", err)
+            retryable := isServiceTransientError(err)
+            assert.False(t, retryable, "%T should be fatal", err)
         }
     })
 }

internal/internal_task_pollers.go

Lines changed: 32 additions & 2 deletions
@@ -765,12 +765,27 @@ func (wtp *workflowTaskPoller) poll(ctx context.Context) (interface{}, error) {

     response, err := wtp.service.PollForDecisionTask(ctx, request, getYarpcCallOptions(wtp.featureFlags)...)
     if err != nil {
-        if isServiceTransientError(err) {
+        retryable := isServiceTransientError(err)
+        if retryable {
             wtp.metricsScope.Counter(metrics.DecisionPollTransientFailedCounter).Inc(1)
         } else {
             wtp.metricsScope.Counter(metrics.DecisionPollFailedCounter).Inc(1)
         }
         wtp.updateBacklog(request.TaskList.GetKind(), 0)
+
+        // pause for the retry delay if present.
+        // failures also have an exponential backoff, implemented at a higher level,
+        // but this ensures a minimum is respected.
+        retryAfter := backoff.ErrRetryableAfter(err)
+        if retryAfter > 0 {
+            t := time.NewTimer(retryAfter)
+            select {
+            case <-ctx.Done():
+                t.Stop()
+            case <-t.C:
+            }
+        }
+
         return nil, err
     }

@@ -990,11 +1005,26 @@ func (atp *activityTaskPoller) poll(ctx context.Context) (*s.PollForActivityTask
     response, err := atp.service.PollForActivityTask(ctx, request, getYarpcCallOptions(atp.featureFlags)...)

     if err != nil {
-        if isServiceTransientError(err) {
+        retryable := isServiceTransientError(err)
+        if retryable {
             atp.metricsScope.Counter(metrics.ActivityPollTransientFailedCounter).Inc(1)
         } else {
             atp.metricsScope.Counter(metrics.ActivityPollFailedCounter).Inc(1)
         }
+
+        // pause for the retry delay if present.
+        // failures also have an exponential backoff, implemented at a higher level,
+        // but this ensures a minimum is respected.
+        retryAfter := backoff.ErrRetryableAfter(err)
+        if retryAfter > 0 {
+            t := time.NewTimer(retryAfter)
+            select {
+            case <-ctx.Done():
+                t.Stop()
+            case <-t.C:
+            }
+        }
+
         return nil, startTime, err
     }
     if response == nil || len(response.TaskToken) == 0 {
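Note the design choice the new comments describe: the poll-level pause and the higher-level exponential backoff happen one after the other, so after a busy error the next poll is issued no sooner than their sum. A hedged sketch of that arithmetic, with hypothetical numbers:

package main

import (
    "fmt"
    "time"
)

func main() {
    // hypothetical values: the poller's minimum pause after a busy error, plus
    // whatever delay the higher-level retrier adds before the next poll attempt.
    busyMinimum := time.Second // what ErrRetryableAfter returns for service-busy
    higherLevelBackoff := 200 * time.Millisecond

    // the delays are sequential, so they add up.
    fmt.Println(busyMinimum + higherLevelBackoff) // 1.2s
}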

internal/internal_worker.go

Lines changed: 1 addition & 1 deletion
@@ -161,7 +161,7 @@ func ensureRequiredParams(params *workerExecutionParameters) {
         config := zap.NewProductionConfig()
         // set default time formatter to "2006-01-02T15:04:05.000Z0700"
         config.EncoderConfig.EncodeTime = zapcore.ISO8601TimeEncoder
-        //config.Level.SetLevel(zapcore.DebugLevel)
+        // config.Level.SetLevel(zapcore.DebugLevel)
         logger, _ := config.Build()
         params.Logger = logger
         params.Logger.Info("No logger configured for cadence worker. Created default one.")

internal/workflow_replayer.go

Lines changed: 2 additions & 3 deletions
@@ -26,9 +26,6 @@ import (
     "encoding/json"
     "errors"
     "fmt"
-    "io/ioutil"
-    "math"
-
     "github.com/golang/mock/gomock"
     "github.com/opentracing/opentracing-go"
     "github.com/pborman/uuid"
@@ -40,6 +37,8 @@ import (
     "go.uber.org/cadence/internal/common/backoff"
     "go.uber.org/cadence/internal/common/serializer"
     "go.uber.org/zap"
+    "io/ioutil"
+    "math"
 )

 const (
