Skip to content

Commit 9b1877b

Browse files
authored
fix(run-engine): retry non-zero exit code errors (#2467)
We now also save the retryConfig from the BackgroundWorkerTask to TaskRun.lockedRetryConfig when the run is first locked to the version.
1 parent 0b2b73f commit 9b1877b

File tree

6 files changed

+90
-19
lines changed

6 files changed

+90
-19
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
-- AlterTable
2+
ALTER TABLE "public"."TaskRun" ADD COLUMN "lockedRetryConfig" JSONB;
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
# Please do not edit this file manually
22
# It should be added in your version-control system (i.e. Git)
3-
provider = "postgresql"
3+
provider = "postgresql"

internal-packages/database/prisma/schema.prisma

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ model OrganizationAccessToken {
151151
/// This is used to find the token in the database
152152
hashedToken String @unique
153153
154-
organization Organization @relation(fields: [organizationId], references: [id])
154+
organization Organization @relation(fields: [organizationId], references: [id])
155155
organizationId String
156156
157157
/// Optional expiration date for the token
@@ -648,11 +648,12 @@ model TaskRun {
648648
649649
concurrencyKey String?
650650
651-
delayUntil DateTime?
652-
queuedAt DateTime?
653-
ttl String?
654-
expiredAt DateTime?
655-
maxAttempts Int?
651+
delayUntil DateTime?
652+
queuedAt DateTime?
653+
ttl String?
654+
expiredAt DateTime?
655+
maxAttempts Int?
656+
lockedRetryConfig Json?
656657
657658
/// optional token that can be used to authenticate the task run
658659
oneTimeUseToken String?

internal-packages/run-engine/src/engine/retrying.ts

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import {
33
isOOMRunError,
44
RetryOptions,
55
sanitizeError,
6+
shouldLookupRetrySettings,
67
shouldRetryError,
78
TaskRunError,
89
taskRunErrorEnhancer,
@@ -72,13 +73,11 @@ export async function retryOutcomeFromCompletion(
7273
};
7374
}
7475

75-
// No retry settings
76-
if (!retrySettings) {
77-
return { outcome: "fail_run", sanitizedError };
78-
}
76+
const enhancedError = taskRunErrorEnhancer(error);
7977

8078
// Not a retriable error: fail
81-
const retriableError = shouldRetryError(taskRunErrorEnhancer(error));
79+
const retriableError = shouldRetryError(enhancedError);
80+
8281
if (!retriableError) {
8382
return { outcome: "fail_run", sanitizedError };
8483
}
@@ -95,6 +94,7 @@ export async function retryOutcomeFromCompletion(
9594
},
9695
select: {
9796
maxAttempts: true,
97+
lockedRetryConfig: true,
9898
},
9999
});
100100

@@ -112,6 +112,48 @@ export async function retryOutcomeFromCompletion(
112112
return { outcome: "fail_run", sanitizedError };
113113
}
114114

115+
// No retry settings
116+
if (!retrySettings) {
117+
const shouldLookup = shouldLookupRetrySettings(enhancedError);
118+
119+
if (!shouldLookup) {
120+
return { outcome: "fail_run", sanitizedError };
121+
}
122+
123+
const retryConfig = run.lockedRetryConfig;
124+
125+
if (!retryConfig) {
126+
return { outcome: "fail_run", sanitizedError };
127+
}
128+
129+
const parsedRetryConfig = RetryOptions.nullish().safeParse(retryConfig);
130+
131+
if (!parsedRetryConfig.success) {
132+
return { outcome: "fail_run", sanitizedError };
133+
}
134+
135+
if (!parsedRetryConfig.data) {
136+
return { outcome: "fail_run", sanitizedError };
137+
}
138+
139+
const nextDelay = calculateNextRetryDelay(parsedRetryConfig.data, attemptNumber ?? 1);
140+
141+
if (!nextDelay) {
142+
return { outcome: "fail_run", sanitizedError };
143+
}
144+
145+
const retrySettings = {
146+
timestamp: Date.now() + nextDelay,
147+
delay: nextDelay,
148+
};
149+
150+
return {
151+
outcome: "retry",
152+
method: "queue", // we'll always retry on the queue because usually having no settings means something bad happened
153+
settings: retrySettings,
154+
};
155+
}
156+
115157
return {
116158
outcome: "retry",
117159
method: retryUsingQueue ? "queue" : "immediate",
@@ -130,19 +172,15 @@ async function retryOOMOnMachine(
130172
},
131173
select: {
132174
machinePreset: true,
133-
lockedBy: {
134-
select: {
135-
retryConfig: true,
136-
},
137-
},
175+
lockedRetryConfig: true,
138176
},
139177
});
140178

141-
if (!run || !run.lockedBy || !run.machinePreset) {
179+
if (!run || !run.lockedRetryConfig || !run.machinePreset) {
142180
return;
143181
}
144182

145-
const retryConfig = run.lockedBy?.retryConfig;
183+
const retryConfig = run.lockedRetryConfig;
146184
const parsedRetryConfig = RetryOptions.nullish().safeParse(retryConfig);
147185

148186
if (!parsedRetryConfig.success) {

internal-packages/run-engine/src/engine/systems/dequeueSystem.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,9 @@ export class DequeueSystem {
403403
result.run.maxDurationInSeconds,
404404
result.task.maxDurationInSeconds
405405
);
406+
const lockedRetryConfig = result.run.lockedRetryConfig
407+
? undefined
408+
: result.task.retryConfig;
406409

407410
const lockedTaskRun = await prisma.taskRun.update({
408411
where: {
@@ -413,6 +416,7 @@ export class DequeueSystem {
413416
lockedById: result.task.id,
414417
lockedToVersionId: result.worker.id,
415418
lockedQueueId: result.queue.id,
419+
lockedRetryConfig: lockedRetryConfig ?? undefined,
416420
status: "DEQUEUED",
417421
startedAt,
418422
baseCostInCents: this.options.machines.baseCostInCents,

packages/core/src/v3/errors.ts

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -346,6 +346,32 @@ export function shouldRetryError(error: TaskRunError): boolean {
346346
}
347347
}
348348

349+
/**
 * Decides whether retry settings should be looked up for a failed run whose
 * completion carried no retry settings of its own.
 *
 * Returns `true` only for an `INTERNAL_ERROR` whose code is
 * `TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE`. Every other internal error code,
 * and every `STRING_ERROR`, `BUILT_IN_ERROR` and `CUSTOM_ERROR`, returns `false`.
 *
 * The final `default` branch passes the error to `assertExhaustive`
 * (defined elsewhere; presumably a compile-time exhaustiveness guard that
 * throws at runtime — confirm against its definition).
 *
 * @param error - The task run error to classify.
 * @returns `true` when the caller should look up stored retry settings for the run.
 */
export function shouldLookupRetrySettings(error: TaskRunError): boolean {
350+
switch (error.type) {
351+
case "INTERNAL_ERROR": {
352+
switch (error.code) {
353+
case "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE":
354+
return true;
355+
356+
default:
357+
return false;
358+
}
359+
}
360+
case "STRING_ERROR": {
361+
return false;
362+
}
363+
case "BUILT_IN_ERROR": {
364+
return false;
365+
}
366+
case "CUSTOM_ERROR": {
367+
return false;
368+
}
369+
default: {
370+
assertExhaustive(error);
371+
}
372+
}
373+
}
374+
349375
export function correctErrorStackTrace(
350376
stackTrace: string,
351377
projectDir?: string,

0 commit comments

Comments
 (0)