Skip to content

Commit 3c7ff85

Browse files
committed
improve retry span and add machine props
1 parent 83dff56 commit 3c7ff85

File tree

1 file changed

+35
-22
lines changed

1 file changed

+35
-22
lines changed

apps/webapp/app/v3/services/completeAttempt.server.ts

Lines changed: 35 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { Attributes } from "@opentelemetry/api";
22
import {
3+
MachinePresetName,
34
TaskRunContext,
45
TaskRunError,
56
TaskRunErrorCodes,
@@ -256,6 +257,7 @@ export class CompleteAttemptService extends BaseService {
256257
let isOOMRetry = false;
257258
let isOOMAttempt = isOOMError(completion.error);
258259
let isOnMaxOOMMachine = false;
260+
let oomMachine: MachinePresetName | undefined;
259261

260262
//OOM errors should retry (if an OOM machine is specified, and we're not already on it)
261263
if (isOOMAttempt) {
@@ -268,10 +270,10 @@ export class CompleteAttemptService extends BaseService {
268270
execution,
269271
});
270272

271-
isOnMaxOOMMachine =
272-
retryConfig?.outOfMemory?.machine === taskRunAttempt.taskRun.machinePreset;
273+
oomMachine = retryConfig?.outOfMemory?.machine;
274+
isOnMaxOOMMachine = oomMachine === taskRunAttempt.taskRun.machinePreset;
273275

274-
if (retryConfig?.outOfMemory?.machine && !isOnMaxOOMMachine) {
276+
if (oomMachine && !isOnMaxOOMMachine) {
275277
//we will retry
276278
isOOMRetry = true;
277279
retriableError = true;
@@ -290,7 +292,7 @@ export class CompleteAttemptService extends BaseService {
290292
id: taskRunAttempt.taskRunId,
291293
},
292294
data: {
293-
machinePreset: retryConfig.outOfMemory.machine,
295+
machinePreset: oomMachine,
294296
},
295297
});
296298
}
@@ -309,6 +311,7 @@ export class CompleteAttemptService extends BaseService {
309311
environment,
310312
checkpoint,
311313
forceRequeue: isOOMRetry,
314+
oomMachine,
312315
});
313316
}
314317

@@ -554,6 +557,7 @@ export class CompleteAttemptService extends BaseService {
554557
environment,
555558
checkpoint,
556559
forceRequeue = false,
560+
oomMachine,
557561
}: {
558562
execution: TaskRunExecution;
559563
executionRetry: TaskRunExecutionRetry;
@@ -562,29 +566,38 @@ export class CompleteAttemptService extends BaseService {
562566
environment: AuthenticatedEnvironment;
563567
checkpoint?: CheckpointData;
564568
forceRequeue?: boolean;
569+
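/** Machine preset configured for OOM retries, when the caller supplies one. When set, the retry span message gains an " after OOM" suffix and the span properties record `previousMachine` and `nextMachine`. */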
570+
oomMachine?: MachinePresetName;
565571
}) {
566572
const retryAt = new Date(executionRetry.timestamp);
567573

568574
// Retry the task run
569-
await eventRepository.recordEvent(`Retry #${execution.attempt.number} delay`, {
570-
taskSlug: taskRunAttempt.taskRun.taskIdentifier,
571-
environment,
572-
attributes: {
573-
metadata: this.#generateMetadataAttributesForNextAttempt(execution),
574-
properties: {
575-
retryAt: retryAt.toISOString(),
576-
},
577-
runId: taskRunAttempt.taskRun.friendlyId,
578-
style: {
579-
icon: "schedule-attempt",
575+
await eventRepository.recordEvent(
576+
`Retry #${execution.attempt.number} delay${oomMachine ? " after OOM" : ""}`,
577+
{
578+
taskSlug: taskRunAttempt.taskRun.taskIdentifier,
579+
environment,
580+
attributes: {
581+
metadata: this.#generateMetadataAttributesForNextAttempt(execution),
582+
properties: {
583+
retryAt: retryAt.toISOString(),
584+
previousMachine: oomMachine
585+
? taskRunAttempt.taskRun.machinePreset ?? undefined
586+
: undefined,
587+
nextMachine: oomMachine,
588+
},
589+
runId: taskRunAttempt.taskRun.friendlyId,
590+
style: {
591+
icon: "schedule-attempt",
592+
},
593+
queueId: taskRunAttempt.queueId,
594+
queueName: taskRunAttempt.taskRun.queue,
580595
},
581-
queueId: taskRunAttempt.queueId,
582-
queueName: taskRunAttempt.taskRun.queue,
583-
},
584-
context: taskRunAttempt.taskRun.traceContext as Record<string, string | undefined>,
585-
spanIdSeed: `retry-${taskRunAttempt.number + 1}`,
586-
endTime: retryAt,
587-
});
596+
context: taskRunAttempt.taskRun.traceContext as Record<string, string | undefined>,
597+
spanIdSeed: `retry-${taskRunAttempt.number + 1}`,
598+
endTime: retryAt,
599+
}
600+
);
588601

589602
logger.debug("[CompleteAttemptService] Retrying", {
590603
taskRun: taskRunAttempt.taskRun.friendlyId,

0 commit comments

Comments
 (0)