From a70328dbbcd7156e3dbbe6ddd7167486aaa39c77 Mon Sep 17 00:00:00 2001 From: Eric Allam Date: Fri, 15 Aug 2025 16:48:12 +0100 Subject: [PATCH] fix(runner): prevent retry immediately race condition which can cause stuck runs that end up being system failures --- .changeset/loud-rules-dream.md | 5 ++ .../src/entryPoints/managed/execution.ts | 24 ++++++++- .../src/trigger/attemptFailures.ts | 50 +++++++++++++++++++ 3 files changed, 78 insertions(+), 1 deletion(-) create mode 100644 .changeset/loud-rules-dream.md create mode 100644 references/hello-world/src/trigger/attemptFailures.ts diff --git a/.changeset/loud-rules-dream.md b/.changeset/loud-rules-dream.md new file mode 100644 index 0000000000..ce6688c6cb --- /dev/null +++ b/.changeset/loud-rules-dream.md @@ -0,0 +1,5 @@ +--- +"trigger.dev": patch +--- + +fix(runner): prevent retry immediately race condition which can cause stuck runs that end up being system failures diff --git a/packages/cli-v3/src/entryPoints/managed/execution.ts b/packages/cli-v3/src/entryPoints/managed/execution.ts index dbf9448807..84a6b00d64 100644 --- a/packages/cli-v3/src/entryPoints/managed/execution.ts +++ b/packages/cli-v3/src/entryPoints/managed/execution.ts @@ -77,6 +77,7 @@ export class RunExecution { private shutdownReason?: string; private isCompletingRun = false; + private ignoreSnapshotChanges = false; private supervisorSocket: SupervisorSocket; private notifier?: RunNotifier; @@ -237,6 +238,16 @@ export class RunExecution { completedWaitpoints: completedWaitpoints.length, }; + if (this.ignoreSnapshotChanges) { + this.sendDebugLog("processSnapshotChange: ignoring snapshot change", { + incomingSnapshotId: snapshot.friendlyId, + completedWaitpoints: completedWaitpoints.length, + currentAttemptNumber: this.currentAttemptNumber, + newAttemptNumber: run.attemptNumber, + }); + return; + } + if (!this.snapshotManager) { this.sendDebugLog("handleSnapshotChange: missing snapshot manager", snapshotMetadata); return; @@ -808,7 +819,9 @@ export class RunExecution { } // Start and execute next attempt - const [startError, start] = await tryCatch(this.startAttempt({ isWarmStart: true })); + const [startError, start] = await tryCatch( + this.enableIgnoreSnapshotChanges(() => this.startAttempt({ isWarmStart: true })) + ); if (startError) { this.sendDebugLog("failed to start attempt for retry", { error: startError.message }); @@ -829,6 +842,15 @@ export class RunExecution { } } + private async enableIgnoreSnapshotChanges(fn: () => Promise): Promise { + this.ignoreSnapshotChanges = true; + try { + return await fn(); + } finally { + this.ignoreSnapshotChanges = false; + } + } + /** * Restores a suspended execution from PENDING_EXECUTING */ diff --git a/references/hello-world/src/trigger/attemptFailures.ts b/references/hello-world/src/trigger/attemptFailures.ts new file mode 100644 index 0000000000..b0a33e3f58 --- /dev/null +++ b/references/hello-world/src/trigger/attemptFailures.ts @@ -0,0 +1,50 @@ +import { task } from "@trigger.dev/sdk"; +import { setTimeout } from "timers/promises"; + +export const attemptFailures = task({ + id: "attempt-failures", + retry: { + maxAttempts: 3, + minTimeoutInMs: 500, + maxTimeoutInMs: 1000, + factor: 1.5, + }, + run: async (payload: any, { ctx }) => { + await setTimeout(5); + + await attemptFailureSubtask.triggerAndWait({}).unwrap(); + }, +}); + +export const attemptFailureSubtask = task({ + id: "attempt-failure-subtask", + retry: { + maxAttempts: 1, + }, + run: async (payload: any, { ctx }) => { + await setTimeout(20_000); + + throw new Error("Forced error to cause a retry"); + }, +}); + +export const attemptFailures2 = task({ + id: "attempt-failures-2", + retry: { + maxAttempts: 3, + minTimeoutInMs: 500, + maxTimeoutInMs: 1000, + factor: 1.5, + }, + run: async (payload: any, { ctx }) => { + if (ctx.attempt.number <= 2) { + throw new Error("Forced error to cause a retry"); + } + + await setTimeout(10_000); + + return { + success: true, + }; + }, +});