From 3e7178821249bbcd51e726a375b216d401639892 Mon Sep 17 00:00:00 2001 From: Steven L Date: Thu, 26 Dec 2024 17:59:26 -0600 Subject: [PATCH 1/3] [wip] Fix retry policy attempts to match server behavior --- internal/internal_task_handlers.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/internal/internal_task_handlers.go b/internal/internal_task_handlers.go index a1b206f26..2dc02d3bc 100644 --- a/internal/internal_task_handlers.go +++ b/internal/internal_task_handlers.go @@ -1057,7 +1057,18 @@ func getRetryBackoffWithNowTime(p *RetryPolicy, attempt int32, errReason string, return noRetryBackoff } - if p.MaximumAttempts > 0 && attempt > p.MaximumAttempts-1 { + if p.MaximumAttempts > 0 && attempt >= p.MaximumAttempts-1 { + // >=max-1 matches server behavior, which treats all this somewhat oddly, but it has been consistent for a long time. + // basically: + // - attempts means *retry attempts*, as it's only relevant for retries + // - max attempts means *total executions*, counting the first and all retries + // so e.g. max=3 means 3 calls, with attempt counts 0, 1, 2. + // + // first==0 makes the backoff interval below convenient (no coefficient), + // otherwise this feels confusing and it contributes to RetryPolicy's ambiguities, + // as some things apply to the *policy* (expiration, attempts displayed) and some to all executions (max attempts). + // + // we may be able to change this with a completely new API, but for now it must not change for backwards compat. return noRetryBackoff // max attempt reached } From 0e571da3f2ece39d63294f9cd29a8269042d925c Mon Sep 17 00:00:00 2001 From: Steven L Date: Mon, 5 May 2025 18:39:56 -0500 Subject: [PATCH 2/3] changelog entry --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a18dca3fb..9d9bab3b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Added worker.NewV2 with validation on decision poller count (#1370) - Upgraded internal tooling to support Go 1.24, should be no user-noticeable changes (#1421) +### Fixed +- **Significant** test-suite-only change: `RetryPolicy` handling in tests has been off by 1 for many years. + This is now fixed, and your tests will likely need to be updated. (#1414) + ## [v1.2.10] - 2024-07-10 ### Added - Revert "Handle panics while polling for tasks (#1352)" (#1357) From 51ed9ceb45268b04ccb125e48a33e22a037adceb Mon Sep 17 00:00:00 2001 From: Steven L Date: Mon, 5 May 2025 19:30:50 -0500 Subject: [PATCH 3/3] describe retry policy a bit more carefully --- internal/client.go | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/internal/client.go b/internal/client.go index c39382365..9918a50c1 100644 --- a/internal/client.go +++ b/internal/client.go @@ -493,13 +493,35 @@ type ( // This value is the cap of the interval. Default is 100x of initial interval. MaximumInterval time.Duration - // Maximum time to retry. Either ExpirationInterval or MaximumAttempts is required. + // Maximum time to attempt to retry. Either ExpirationInterval or MaximumAttempts is required. + // // When exceeded the retries stop even if maximum retries is not reached yet. + // + // Note that this applies to the retry policy itself, not the thing being retried - an + // ExpirationInterval of 1s will not cancel an activity that runs longer than one second. + // + // E.g. a 1-hour-long StartToClose activity with a 5 minute RetryPolicy.ExpirationInterval + // will lead to both this kind of behavior: + // - be called once + // - fail within 30 seconds + // - be retried (30 < ExpirationInterval) + // - fail again after 5 minutes + // - not be retried (5m30s > ExpirationInterval) + // and this: + // - be called once + // - fail after 10m + // - not be retried (10m > ExpirationInterval) ExpirationInterval time.Duration - // Maximum number of attempts. When exceeded the retries stop even if not expired yet. + // Maximum number of attempts to perform, including the initial call. + // When exceeded, no further retries will occur even if the expiration interval has not passed. + // // If not set or set to 0, it means unlimited, and rely on ExpirationInterval to stop. // Either MaximumAttempts or ExpirationInterval is required. + // + // CAUTION: MaximumAttempts of 1 does not allow any "retries", as the first attempt will occur + // even without a retry policy. E.g. if you want an activity to be able to fail and be retried once, + // set MaximumAttempts to 2. MaximumAttempts int32 // Non-Retriable errors. This is optional. Cadence server will stop retry if error reason matches this list.