diff --git a/CHANGELOG.md b/CHANGELOG.md index a18dca3fb..9d9bab3b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added worker.NewV2 with validation on decision poller count (#1370) - Upgraded internal tooling to support Go 1.24, should be no user-noticeable changes (#1421) +### Fixed +- **Significant** test-suite-only change: `RetryPolicy` handling in tests has been off by 1 for many years. + This is now fixed, and your tests will likely need to be updated. (#1414) + ## [v1.2.10] - 2024-07-10 ### Added - Revert "Handle panics while polling for tasks (#1352)" (#1357) diff --git a/internal/client.go b/internal/client.go index c39382365..9918a50c1 100644 --- a/internal/client.go +++ b/internal/client.go @@ -493,13 +493,35 @@ type ( // This value is the cap of the interval. Default is 100x of initial interval. MaximumInterval time.Duration - // Maximum time to retry. Either ExpirationInterval or MaximumAttempts is required. + // Maximum time to attempt to retry. Either ExpirationInterval or MaximumAttempts is required. + // // When exceeded the retries stop even if maximum retries is not reached yet. + // + // Note that this applies to the retry policy itself, not the thing being retried - an + // ExpirationInterval of 1s will not cancel an activity that runs longer than one second. + // + // E.g. a 1-hour-long StartToClose activity with a 5 minute RetryPolicy.ExpirationInterval + // will lead to both this kind of behavior: + // - be called once + // - fail within 30 seconds + // - be retried (30 < ExpirationInterval) + // - fail again after 5 minutes + // - not be retried (5m30s > ExpirationInterval) + // and this: + // - be called once + // - fail after 10m + // - not be retried (10m > ExpirationInterval) ExpirationInterval time.Duration - // Maximum number of attempts. When exceeded the retries stop even if not expired yet. + // Maximum number of attempts to perform, including the initial call. + // When exceeded, no further retries will occur even if the expiration interval has not passed. + // // If not set or set to 0, it means unlimited, and rely on ExpirationInterval to stop. // Either MaximumAttempts or ExpirationInterval is required. + // + // CAUTION: MaximumAttempts of 1 does not allow any "retries", as the first attempt will occur + // even without a retry policy. E.g. if you want an activity to be able to fail and be retried once, + // set MaximumAttempts to 2. MaximumAttempts int32 // Non-Retriable errors. This is optional. Cadence server will stop retry if error reason matches this list. diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index a1b206f26..2dc02d3bc 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -1057,7 +1057,18 @@ func getRetryBackoffWithNowTime(p *RetryPolicy, attempt int32, errReason string, return noRetryBackoff } - if p.MaximumAttempts > 0 && attempt > p.MaximumAttempts-1 { + if p.MaximumAttempts > 0 && attempt >= p.MaximumAttempts-1 { + // >=max-1 matches server behavior, which treats all this somewhat oddly, but it has been consistent for a long time. + // basically: + // - attempts means *retry attempts*, as it's only relevant for retries + // - max attempts means *total executions*, counting the first and all retries + // so e.g. max=3 means 3 calls, with attempt counts 0, 1, 2. + // + // first==0 makes the backoff interval below convenient (no coefficient), + // otherwise this feels confusing and it contributes to RetryPolicy's ambiguities, + // as some things apply to the *policy* (expiration, attempts displayed) and some to all executions (max attempts). + // + // we may be able to change this with a completely new API, but for now it must not change for backwards compat. return noRetryBackoff // max attempt reached }