Skip to content

Commit 1005ea5

Browse files
authored
Moving retryable-err checks to errors.As, moving some to not-retryable (#1167)
Part 1 of 2 for solving retry storms, particularly around incorrectly-categorized errors (e.g. limit exceeded) and service-busy. This PR moves us to `errors.As` to support wrapped errors in the future, and re-categorizes some incorrectly-retried errors. This is both useful on its own, and helps make #1174 a smaller and clearer change. Service-busy behavior is actually changed in #1174, this commit intentionally maintains its current (flawed) behavior.
1 parent d64c4ef commit 1005ea5

File tree

3 files changed

+102
-15
lines changed

3 files changed

+102
-15
lines changed

internal/internal_retry.go

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ package internal
2424

2525
import (
2626
"context"
27+
"errors"
2728
"time"
2829

2930
s "go.uber.org/cadence/.gen/go/shared"
@@ -62,26 +63,59 @@ func createDynamicServiceRetryPolicy(ctx context.Context) backoff.RetryPolicy {
6263
}
6364

6465
func isServiceTransientError(err error) bool {
65-
// Retrying by default so it covers all transport errors.
66-
switch err.(type) {
67-
case *s.BadRequestError,
68-
*s.EntityNotExistsError,
69-
*s.WorkflowExecutionAlreadyStartedError,
70-
*s.WorkflowExecutionAlreadyCompletedError,
71-
*s.DomainAlreadyExistsError,
72-
*s.QueryFailedError,
73-
*s.DomainNotActiveError,
74-
*s.CancellationAlreadyRequestedError,
75-
*s.ClientVersionNotSupportedError,
76-
*s.LimitExceededError:
66+
// check intentionally-not-retried error types via errors.As.
67+
//
68+
// sadly we cannot build this into a list of error values / types to range over, as that
69+
// would turn the variable passed as the &target into e.g. an interface{} or an error,
70+
// which is compatible with ALL errors, so all are .As(err, &target).
71+
//
72+
// so we're left with this mess. it's not even generics-friendly.
73+
// at least this pattern lets it be done inline without exposing the variable.
74+
if target := (*s.AccessDeniedError)(nil); errors.As(err, &target) {
75+
return false
76+
}
77+
if target := (*s.BadRequestError)(nil); errors.As(err, &target) {
78+
return false
79+
}
80+
if target := (*s.CancellationAlreadyRequestedError)(nil); errors.As(err, &target) {
81+
return false
82+
}
83+
if target := (*s.ClientVersionNotSupportedError)(nil); errors.As(err, &target) {
84+
return false
85+
}
86+
if target := (*s.DomainAlreadyExistsError)(nil); errors.As(err, &target) {
87+
return false
88+
}
89+
if target := (*s.DomainNotActiveError)(nil); errors.As(err, &target) {
90+
return false
91+
}
92+
if target := (*s.EntityNotExistsError)(nil); errors.As(err, &target) {
93+
return false
94+
}
95+
if target := (*s.FeatureNotEnabledError)(nil); errors.As(err, &target) {
96+
return false
97+
}
98+
if target := (*s.LimitExceededError)(nil); errors.As(err, &target) {
99+
return false
100+
}
101+
if target := (*s.QueryFailedError)(nil); errors.As(err, &target) {
102+
return false
103+
}
104+
if target := (*s.WorkflowExecutionAlreadyCompletedError)(nil); errors.As(err, &target) {
105+
return false
106+
}
107+
if target := (*s.WorkflowExecutionAlreadyStartedError)(nil); errors.As(err, &target) {
77108
return false
78109
}
79110

80-
if err == errShutdown {
111+
// shutdowns are not retryable, of course
112+
if errors.Is(err, errShutdown) {
81113
return false
82114
}
83115

84116
// s.InternalServiceError
85-
// s.ServiceBusyError
117+
// s.ServiceBusyError (must retry after a delay, but it is transient)
118+
// server-side-only error types (as they should not reach clients)
119+
// and all other `error` types
86120
return true
87121
}

internal/internal_retry_test.go

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package internal
2+
3+
import (
4+
"errors"
5+
"testing"
6+
7+
"github.com/stretchr/testify/assert"
8+
s "go.uber.org/cadence/.gen/go/shared"
9+
)
10+
11+
func TestErrRetries(t *testing.T) {
12+
t.Run("retryable", func(t *testing.T) {
13+
for _, err := range []error{
14+
// service-busy means "retry later", which is still transient/retryable.
15+
// callers with retries MUST detect this separately and delay before retrying,
16+
// and ideally we'll return a minimum time-to-wait in errors in the future.
17+
&s.ServiceBusyError{},
18+
19+
&s.InternalServiceError{}, // fallback error type from server, "unknown" = "retry might work"
20+
errors.New("unrecognized"), // fallback error type elsewhere, "unknown" = "retry might work"
21+
22+
// below are all server-side internal errors and should never be exposed, they're just included
23+
// to be explicit about current behavior (without intentionally deciding on any of them).
24+
// retry by default, like any other unknown error. it shouldn't lead to incorrect behavior.
25+
&s.CurrentBranchChangedError{},
26+
&s.RetryTaskV2Error{},
27+
&s.RemoteSyncMatchedError{},
28+
&s.InternalDataInconsistencyError{},
29+
} {
30+
assert.True(t, isServiceTransientError(err), "%T should be transient", err)
31+
}
32+
})
33+
t.Run("terminal", func(t *testing.T) {
34+
for _, err := range []error{
35+
&s.AccessDeniedError{}, // access won't be granted immediately
36+
&s.BadRequestError{}, // bad requests don't become good
37+
&s.CancellationAlreadyRequestedError{}, // can only cancel once
38+
&s.ClientVersionNotSupportedError{}, // clients don't magically upgrade
39+
&s.DomainAlreadyExistsError{}, // re-creating again won't work
40+
&s.DomainNotActiveError{}, // may actually succeed, but very unlikely to work immediately
41+
&s.EntityNotExistsError{}, // may actually succeed, but very unlikely to work immediately
42+
&s.FeatureNotEnabledError{}, // features won't magically enable
43+
&s.LimitExceededError{}, // adding +1 to a value does not move it below its limit
44+
&s.QueryFailedError{}, // arguable, this could be considered "unknown" and retryable
45+
&s.WorkflowExecutionAlreadyCompletedError{}, // completed workflows won't uncomplete
46+
&s.WorkflowExecutionAlreadyStartedError{}, // started workflows could complete quickly, but re-starting may not be desirable
47+
48+
errShutdown, // shutdowns can't be stopped
49+
} {
50+
assert.False(t, isServiceTransientError(err), "%T should be fatal", err)
51+
}
52+
})
53+
}

internal/workflow_shadower_worker_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ func (s *shadowWorkerSuite) TestStartShadowWorker_Failed_StartWorkflowError() {
149149
Name: common.StringPtr(testDomain),
150150
}, callOptions()...).Return(&shared.DescribeDomainResponse{}, nil).Times(1)
151151
// first return a retryable error to check if retry policy is configured
152-
s.mockService.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any(), callOptions()...).Return(nil, &shared.ServiceBusyError{}).Times(1)
152+
s.mockService.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any(), callOptions()...).Return(nil, &shared.InternalServiceError{}).Times(1)
153153
// then return a non-retryable error
154154
s.mockService.EXPECT().StartWorkflowExecution(gomock.Any(), gomock.Any(), callOptions()...).Return(nil, &shared.BadRequestError{}).Times(1)
155155

0 commit comments

Comments
 (0)