diff --git a/.cursor/mcp.json b/.cursor/mcp.json index 96a6f73e4e..9b3221784d 100644 --- a/.cursor/mcp.json +++ b/.cursor/mcp.json @@ -4,4 +4,4 @@ "url": "http://localhost:3333/sse" } } -} +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index ab091cb534..e044183922 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -138,15 +138,15 @@ "type": "node-terminal", "request": "launch", "name": "Debug RunEngine tests", - "command": "pnpm run test --filter @internal/run-engine", - "cwd": "${workspaceFolder}", + "command": "pnpm run test ./src/engine/tests/releaseConcurrencyQueue.test.ts -t 'Should manage token bucket and queue correctly'", + "cwd": "${workspaceFolder}/internal-packages/run-engine", "sourceMaps": true }, { "type": "node-terminal", "request": "launch", "name": "Debug RunQueue tests", - "command": "pnpm run test ./src/engine/tests/waitpoints.test.ts", + "command": "pnpm run test ./src/run-queue/index.test.ts", "cwd": "${workspaceFolder}/internal-packages/run-engine", "sourceMaps": true } diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 4ff67c0439..e6c209c814 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -565,6 +565,13 @@ const EnvironmentSchema = z.object({ RUN_ENGINE_RATE_LIMIT_REJECTION_LOGS_ENABLED: z.string().default("1"), RUN_ENGINE_RATE_LIMIT_LIMITER_LOGS_ENABLED: z.string().default("0"), + RUN_ENGINE_RELEASE_CONCURRENCY_ENABLED: z.string().default("0"), + RUN_ENGINE_RELEASE_CONCURRENCY_MAX_TOKENS_RATIO: z.coerce.number().default(1), + RUN_ENGINE_RELEASE_CONCURRENCY_MAX_RETRIES: z.coerce.number().int().default(3), + RUN_ENGINE_RELEASE_CONCURRENCY_CONSUMERS_COUNT: z.coerce.number().int().default(1), + RUN_ENGINE_RELEASE_CONCURRENCY_POLL_INTERVAL: z.coerce.number().int().default(500), + RUN_ENGINE_RELEASE_CONCURRENCY_BATCH_SIZE: z.coerce.number().int().default(10), + /** How long should the presence ttl last */ DEV_PRESENCE_TTL_MS: 
z.coerce.number().int().default(30_000), DEV_PRESENCE_POLL_INTERVAL_MS: z.coerce.number().int().default(5_000), diff --git a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts index 3bb2ecf664..f448c5b5ac 100644 --- a/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts +++ b/apps/webapp/app/routes/admin.api.v1.environments.$environmentId.ts @@ -2,7 +2,7 @@ import { ActionFunctionArgs, json, LoaderFunctionArgs } from "@remix-run/server- import { z } from "zod"; import { prisma } from "~/db.server"; import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server"; -import { marqs } from "~/v3/marqs/index.server"; +import { engine } from "~/v3/runEngine.server"; import { updateEnvConcurrencyLimits } from "~/v3/runQueue.server"; const ParamsSchema = z.object({ @@ -113,20 +113,15 @@ export async function loader({ request, params }: LoaderFunctionArgs) { Object.fromEntries(requestUrl.searchParams.entries()) ); - const concurrencyLimit = await marqs.getEnvConcurrencyLimit(environment); - const currentConcurrency = await marqs.currentConcurrencyOfEnvironment(environment); - const reserveConcurrency = await marqs.reserveConcurrencyOfEnvironment(environment); + const concurrencyLimit = await engine.runQueue.getEnvConcurrencyLimit(environment); + const currentConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment(environment); if (searchParams.queue) { - const queueConcurrencyLimit = await marqs.getQueueConcurrencyLimit( + const queueConcurrencyLimit = await engine.runQueue.getQueueConcurrencyLimit( environment, searchParams.queue ); - const queueCurrentConcurrency = await marqs.currentConcurrencyOfQueue( - environment, - searchParams.queue - ); - const queueReserveConcurrency = await marqs.reserveConcurrencyOfQueue( + const queueCurrentConcurrency = await engine.runQueue.currentConcurrencyOfQueue( environment, 
searchParams.queue ); @@ -135,12 +130,10 @@ export async function loader({ request, params }: LoaderFunctionArgs) { id: environment.id, concurrencyLimit, currentConcurrency, - reserveConcurrency, queueConcurrencyLimit, queueCurrentConcurrency, - queueReserveConcurrency, }); } - return json({ id: environment.id, concurrencyLimit, currentConcurrency, reserveConcurrency }); + return json({ id: environment.id, concurrencyLimit, currentConcurrency }); } diff --git a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts index 8e11d4d626..24aa181404 100644 --- a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts +++ b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.wait.duration.ts @@ -48,12 +48,9 @@ const { action } = createActionApiRoute( const waitResult = await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: waitpoint.id, - environmentId: authentication.environment.id, projectId: authentication.environment.project.id, organizationId: authentication.environment.organization.id, - releaseConcurrency: { - releaseQueue: true, - }, + releaseConcurrency: body.releaseConcurrency, }); return json({ diff --git a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts index b20d0fd22d..e9bd27d693 100644 --- a/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts +++ b/apps/webapp/app/routes/engine.v1.runs.$runFriendlyId.waitpoints.tokens.$waitpointFriendlyId.wait.ts @@ -34,10 +34,10 @@ const { action } = createActionApiRoute( throw json({ error: "Waitpoint not found" }, { status: 404 }); } + // TODO: Add releaseConcurrency from the body const result = await engine.blockRunWithWaitpoint({ runId, waitpoints: [waitpointId], - environmentId: authentication.environment.id, 
projectId: authentication.environment.project.id, organizationId: authentication.environment.organization.id, }); diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts index 382302ff16..4c70511e02 100644 --- a/apps/webapp/app/v3/runEngine.server.ts +++ b/apps/webapp/app/v3/runEngine.server.ts @@ -1,10 +1,10 @@ import { RunEngine } from "@internal/run-engine"; +import { defaultMachine } from "@trigger.dev/platform/v3"; import { prisma } from "~/db.server"; import { env } from "~/env.server"; -import { tracer } from "./tracer.server"; import { singleton } from "~/utils/singleton"; -import { defaultMachine, machines } from "@trigger.dev/platform/v3"; import { allMachines } from "./machinePresets.server"; +import { tracer } from "./tracer.server"; export const engine = singleton("RunEngine", createRunEngine); @@ -73,6 +73,23 @@ function createRunEngine() { EXECUTING: env.RUN_ENGINE_TIMEOUT_EXECUTING, EXECUTING_WITH_WAITPOINTS: env.RUN_ENGINE_TIMEOUT_EXECUTING_WITH_WAITPOINTS, }, + releaseConcurrency: { + disabled: env.RUN_ENGINE_RELEASE_CONCURRENCY_ENABLED === "0", + maxTokensRatio: env.RUN_ENGINE_RELEASE_CONCURRENCY_MAX_TOKENS_RATIO, + maxRetries: env.RUN_ENGINE_RELEASE_CONCURRENCY_MAX_RETRIES, + consumersCount: env.RUN_ENGINE_RELEASE_CONCURRENCY_CONSUMERS_COUNT, + pollInterval: env.RUN_ENGINE_RELEASE_CONCURRENCY_POLL_INTERVAL, + batchSize: env.RUN_ENGINE_RELEASE_CONCURRENCY_BATCH_SIZE, + redis: { + keyPrefix: "engine:", + port: env.RUN_ENGINE_RUN_QUEUE_REDIS_PORT ?? undefined, + host: env.RUN_ENGINE_RUN_QUEUE_REDIS_HOST ?? undefined, + username: env.RUN_ENGINE_RUN_QUEUE_REDIS_USERNAME ?? undefined, + password: env.RUN_ENGINE_RUN_QUEUE_REDIS_PASSWORD ?? undefined, + enableAutoPipelining: true, + ...(env.RUN_ENGINE_RUN_QUEUE_REDIS_TLS_DISABLED === "true" ? 
{} : { tls: {} }), + }, + }, }); return engine; diff --git a/apps/webapp/app/v3/services/triggerTaskV2.server.ts b/apps/webapp/app/v3/services/triggerTaskV2.server.ts index 1d1f6cb558..4a3c5efe57 100644 --- a/apps/webapp/app/v3/services/triggerTaskV2.server.ts +++ b/apps/webapp/app/v3/services/triggerTaskV2.server.ts @@ -4,6 +4,9 @@ import { packetRequiresOffloading, QueueOptions, SemanticInternalAttributes, + TaskRunError, + taskRunErrorEnhancer, + taskRunErrorToString, TriggerTaskRequestBody, } from "@trigger.dev/core/v3"; import { @@ -164,10 +167,10 @@ export class TriggerTaskServiceV2 extends WithRunEngine { index: options.batchIndex ?? 0, } : undefined, - environmentId: environment.id, projectId: environment.projectId, organizationId: environment.organizationId, tx: this._prisma, + releaseConcurrency: body.options?.releaseConcurrency, }); } ); @@ -271,7 +274,7 @@ export class TriggerTaskServiceV2 extends WithRunEngine { immediate: true, }, async (event, traceContext, traceparent) => { - const run = await autoIncrementCounter.incrementInTransaction( + const result = await autoIncrementCounter.incrementInTransaction( `v3-run:${environment.id}:${taskId}`, async (num, tx) => { const lockedToBackgroundWorker = body.options?.lockToVersion @@ -370,11 +373,18 @@ export class TriggerTaskServiceV2 extends WithRunEngine { : undefined, machine: body.options?.machine, priorityMs: body.options?.priority ? body.options.priority * 1_000 : undefined, + releaseConcurrency: body.options?.releaseConcurrency, }, this._prisma ); - return { run: taskRun, isCached: false }; + const error = taskRun.error ? 
TaskRunError.parse(taskRun.error) : undefined; + + if (error) { + event.failWithError(error); + } + + return { run: taskRun, error, isCached: false }; }, async (_, tx) => { const counter = await tx.taskRunNumberCounter.findFirst({ @@ -390,7 +400,13 @@ export class TriggerTaskServiceV2 extends WithRunEngine { this._prisma ); - return run; + if (result?.error) { + throw new ServiceValidationError( + taskRunErrorToString(taskRunErrorEnhancer(result.error)) + ); + } + + return result; } ); } catch (error) { diff --git a/internal-packages/database/prisma/migrations/20250314133612_add_queued_executing_status_to_snapshots/migration.sql b/internal-packages/database/prisma/migrations/20250314133612_add_queued_executing_status_to_snapshots/migration.sql new file mode 100644 index 0000000000..307831bc06 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250314133612_add_queued_executing_status_to_snapshots/migration.sql @@ -0,0 +1,4 @@ +-- AlterEnum +ALTER TYPE "TaskRunExecutionStatus" +ADD + VALUE 'QUEUED_EXECUTING'; \ No newline at end of file diff --git a/internal-packages/database/prisma/migrations/20250318163201_add_previous_snapshot_id_to_task_run_execution_snapshot/migration.sql b/internal-packages/database/prisma/migrations/20250318163201_add_previous_snapshot_id_to_task_run_execution_snapshot/migration.sql new file mode 100644 index 0000000000..1f979fd2e0 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250318163201_add_previous_snapshot_id_to_task_run_execution_snapshot/migration.sql @@ -0,0 +1,8 @@ +-- DropIndex +DROP INDEX "SecretStore_key_idx"; + +-- AlterTable +ALTER TABLE "TaskRunExecutionSnapshot" ADD COLUMN "previousSnapshotId" TEXT; + +-- CreateIndex +CREATE INDEX "SecretStore_key_idx" ON "SecretStore"("key" text_pattern_ops); diff --git a/internal-packages/database/prisma/migrations/20250319103257_add_release_concurrency_on_waitpoint_to_task_queue/migration.sql 
b/internal-packages/database/prisma/migrations/20250319103257_add_release_concurrency_on_waitpoint_to_task_queue/migration.sql new file mode 100644 index 0000000000..66cea8acdd --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250319103257_add_release_concurrency_on_waitpoint_to_task_queue/migration.sql @@ -0,0 +1,5 @@ +-- AlterTable +ALTER TABLE + "TaskQueue" +ADD + COLUMN "releaseConcurrencyOnWaitpoint" BOOLEAN NOT NULL DEFAULT false; \ No newline at end of file diff --git a/internal-packages/database/prisma/migrations/20250319110754_add_org_and_project_to_execution_snapshots/migration.sql b/internal-packages/database/prisma/migrations/20250319110754_add_org_and_project_to_execution_snapshots/migration.sql new file mode 100644 index 0000000000..afdb979e87 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250319110754_add_org_and_project_to_execution_snapshots/migration.sql @@ -0,0 +1,26 @@ +/* + Warnings: + + - Added the required column `organizationId` to the `TaskRunExecutionSnapshot` table without a default value. This is not possible if the table is not empty. + - Added the required column `projectId` to the `TaskRunExecutionSnapshot` table without a default value. This is not possible if the table is not empty. 
+ + */ +-- AlterTable +ALTER TABLE + "TaskRunExecutionSnapshot" +ADD + COLUMN "organizationId" TEXT NOT NULL, +ADD + COLUMN "projectId" TEXT NOT NULL; + +-- AddForeignKey +ALTER TABLE + "TaskRunExecutionSnapshot" +ADD + CONSTRAINT "TaskRunExecutionSnapshot_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE RESTRICT ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE + "TaskRunExecutionSnapshot" +ADD + CONSTRAINT "TaskRunExecutionSnapshot_organizationId_fkey" FOREIGN KEY ("organizationId") REFERENCES "Organization"("id") ON DELETE RESTRICT ON UPDATE CASCADE; \ No newline at end of file diff --git a/internal-packages/database/prisma/migrations/20250319114436_add_metadata_to_task_run_execution_snapshots/migration.sql b/internal-packages/database/prisma/migrations/20250319114436_add_metadata_to_task_run_execution_snapshots/migration.sql new file mode 100644 index 0000000000..d4121ed929 --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250319114436_add_metadata_to_task_run_execution_snapshots/migration.sql @@ -0,0 +1,5 @@ +-- AlterTable +ALTER TABLE + "TaskRunExecutionSnapshot" +ADD + COLUMN "metadata" JSONB; \ No newline at end of file diff --git a/internal-packages/database/prisma/migrations/20250319131807_add_locked_queue_id_to_task_run/migration.sql b/internal-packages/database/prisma/migrations/20250319131807_add_locked_queue_id_to_task_run/migration.sql new file mode 100644 index 0000000000..b1bf829b7e --- /dev/null +++ b/internal-packages/database/prisma/migrations/20250319131807_add_locked_queue_id_to_task_run/migration.sql @@ -0,0 +1,5 @@ +-- AlterTable +ALTER TABLE + "TaskRun" +ADD + COLUMN "lockedQueueId" TEXT; \ No newline at end of file diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index 84d8c55702..8b92c6448b 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -164,6 +164,7 @@ model 
Organization { organizationIntegrations OrganizationIntegration[] workerGroups WorkerInstanceGroup[] workerInstances WorkerInstance[] + executionSnapshots TaskRunExecutionSnapshot[] } model ExternalAccount { @@ -504,6 +505,7 @@ model Project { waitpoints Waitpoint[] taskRunWaitpoints TaskRunWaitpoint[] taskRunCheckpoints TaskRunCheckpoint[] + executionSnapshots TaskRunExecutionSnapshot[] } enum ProjectVersion { @@ -1724,7 +1726,9 @@ model TaskRun { projectId String // The specific queue this run is in - queue String + queue String + // The queueId is set when the run is locked to a specific queue + lockedQueueId String? /// The main queue that this run is part of masterQueue String @default("main") @@ -1965,6 +1969,9 @@ model TaskRunExecutionSnapshot { isValid Boolean @default(true) error String? + /// The previous snapshot ID + previousSnapshotId String? + /// Run runId String run TaskRun @relation(fields: [runId], references: [id]) @@ -1982,6 +1989,12 @@ model TaskRunExecutionSnapshot { environment RuntimeEnvironment @relation(fields: [environmentId], references: [id]) environmentType RuntimeEnvironmentType + projectId String + project Project @relation(fields: [projectId], references: [id]) + + organizationId String + organization Organization @relation(fields: [organizationId], references: [id]) + /// Waitpoints that have been completed for this execution completedWaitpoints Waitpoint[] @relation("completedWaitpoints") @@ -2003,6 +2016,9 @@ model TaskRunExecutionSnapshot { lastHeartbeatAt DateTime? + /// Metadata used by various systems in the run engine + metadata Json? + /// Used to get the latest valid snapshot quickly @@index([runId, isValid, createdAt(sort: Desc)]) } @@ -2012,6 +2028,8 @@ enum TaskRunExecutionStatus { RUN_CREATED /// Run is in the RunQueue QUEUED + /// Run is in the RunQueue, and is also executing. 
This happens when a run is continued cannot reacquire concurrency + QUEUED_EXECUTING /// Run has been pulled from the queue, but isn't executing yet PENDING_EXECUTING /// Run is executing on a worker @@ -2526,6 +2544,9 @@ model TaskQueue { paused Boolean @default(false) + /// If true, when a run is paused and waiting for waitpoints to be completed, the run will release the concurrency capacity. + releaseConcurrencyOnWaitpoint Boolean @default(false) + createdAt DateTime @default(now()) updatedAt DateTime @updatedAt diff --git a/internal-packages/redis-worker/src/worker.test.ts b/internal-packages/redis-worker/src/worker.test.ts index 2a138b49a0..1768f39107 100644 --- a/internal-packages/redis-worker/src/worker.test.ts +++ b/internal-packages/redis-worker/src/worker.test.ts @@ -36,25 +36,21 @@ describe("Worker", () => { logger: new Logger("test", "log"), }).start(); - try { - // Enqueue 10 items - for (let i = 0; i < 10; i++) { - await worker.enqueue({ - id: `item-${i}`, - job: "testJob", - payload: { value: i }, - visibilityTimeoutMs: 5000, - }); - } + // Enqueue 10 items + for (let i = 0; i < 10; i++) { + await worker.enqueue({ + id: `item-${i}`, + job: "testJob", + payload: { value: i }, + visibilityTimeoutMs: 5000, + }); + } - // Wait for items to be processed - await new Promise((resolve) => setTimeout(resolve, 2000)); + // Wait for items to be processed + await new Promise((resolve) => setTimeout(resolve, 2000)); - expect(processedItems.length).toBe(10); - expect(new Set(processedItems).size).toBe(10); // Ensure all items were processed uniquely - } finally { - worker.stop(); - } + expect(processedItems.length).toBe(10); + expect(new Set(processedItems).size).toBe(10); // Ensure all items were processed uniquely }); redisTest( @@ -97,25 +93,21 @@ describe("Worker", () => { logger: new Logger("test", "error"), }).start(); - try { - // Enqueue 10 items - for (let i = 0; i < 10; i++) { - await worker.enqueue({ - id: `item-${i}`, - job: "testJob", - payload: { 
value: i }, - visibilityTimeoutMs: 5000, - }); - } + // Enqueue 10 items + for (let i = 0; i < 10; i++) { + await worker.enqueue({ + id: `item-${i}`, + job: "testJob", + payload: { value: i }, + visibilityTimeoutMs: 5000, + }); + } - // Wait for items to be processed - await new Promise((resolve) => setTimeout(resolve, 500)); + // Wait for items to be processed + await new Promise((resolve) => setTimeout(resolve, 500)); - expect(processedItems.length).toBe(10); - expect(new Set(processedItems).size).toBe(10); // Ensure all items were processed uniquely - } finally { - worker.stop(); - } + expect(processedItems.length).toBe(10); + expect(new Set(processedItems).size).toBe(10); // Ensure all items were processed uniquely } ); @@ -156,33 +148,29 @@ describe("Worker", () => { logger: new Logger("test", "error"), }).start(); - try { - // Enqueue the item that will permanently fail - await worker.enqueue({ - id: failedItemId, - job: "testJob", - payload: { value: 999 }, - }); - - // Enqueue a normal item - await worker.enqueue({ - id: "normal-item", - job: "testJob", - payload: { value: 1 }, - }); - - // Wait for items to be processed and retried - await new Promise((resolve) => setTimeout(resolve, 1000)); - - // Check that the normal item was processed - expect(processedItems).toEqual([1]); - - // Check that the failed item is in the DLQ - const dlqSize = await worker.queue.sizeOfDeadLetterQueue(); - expect(dlqSize).toBe(1); - } finally { - worker.stop(); - } + // Enqueue the item that will permanently fail + await worker.enqueue({ + id: failedItemId, + job: "testJob", + payload: { value: 999 }, + }); + + // Enqueue a normal item + await worker.enqueue({ + id: "normal-item", + job: "testJob", + payload: { value: 1 }, + }); + + // Wait for items to be processed and retried + await new Promise((resolve) => setTimeout(resolve, 1000)); + + // Check that the normal item was processed + expect(processedItems).toEqual([1]); + + // Check that the failed item is in the DLQ + 
const dlqSize = await worker.queue.sizeOfDeadLetterQueue(); + expect(dlqSize).toBe(1); } ); @@ -225,45 +213,41 @@ describe("Worker", () => { logger: new Logger("test", "error"), }).start(); - try { - // Enqueue the item that will fail 3 times - await worker.enqueue({ - id: failedItemId, - job: "testJob", - payload: { value: 999 }, - }); + // Enqueue the item that will fail 3 times + await worker.enqueue({ + id: failedItemId, + job: "testJob", + payload: { value: 999 }, + }); - // Wait for the item to be processed and moved to DLQ - await new Promise((resolve) => setTimeout(resolve, 1000)); + // Wait for the item to be processed and moved to DLQ + await new Promise((resolve) => setTimeout(resolve, 1000)); - // Check that the item is in the DLQ - let dlqSize = await worker.queue.sizeOfDeadLetterQueue(); - expect(dlqSize).toBe(1); + // Check that the item is in the DLQ + let dlqSize = await worker.queue.sizeOfDeadLetterQueue(); + expect(dlqSize).toBe(1); - // Create a Redis client to publish the redrive message - const redisClient = createRedisClient({ - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }); + // Create a Redis client to publish the redrive message + const redisClient = createRedisClient({ + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }); - // Publish redrive message - await redisClient.publish("test-worker:redrive", JSON.stringify({ id: failedItemId })); + // Publish redrive message + await redisClient.publish("test-worker:redrive", JSON.stringify({ id: failedItemId })); - // Wait for the item to be redrived and processed - await new Promise((resolve) => setTimeout(resolve, 1000)); + // Wait for the item to be redrived and processed + await new Promise((resolve) => setTimeout(resolve, 1000)); - // Check that the item was processed successfully - expect(processedItems).toEqual([999]); + // Check that the item was processed 
successfully + expect(processedItems).toEqual([999]); - // Check that the DLQ is now empty - dlqSize = await worker.queue.sizeOfDeadLetterQueue(); - expect(dlqSize).toBe(0); + // Check that the DLQ is now empty + dlqSize = await worker.queue.sizeOfDeadLetterQueue(); + expect(dlqSize).toBe(0); - await redisClient.quit(); - } finally { - worker.stop(); - } + await redisClient.quit(); } ); }); diff --git a/internal-packages/redis-worker/src/worker.ts b/internal-packages/redis-worker/src/worker.ts index d4fca68c6d..c4a5a2edc0 100644 --- a/internal-packages/redis-worker/src/worker.ts +++ b/internal-packages/redis-worker/src/worker.ts @@ -365,7 +365,7 @@ class Worker { if (err) { this.logger.error(`Failed to subscribe to ${channel}`, { error: err }); } else { - this.logger.log(`Subscribed to ${channel}`); + this.logger.debug(`Subscribed to ${channel}`); } }); diff --git a/internal-packages/redis/src/index.ts b/internal-packages/redis/src/index.ts index 13264773b9..546e1c1b29 100644 --- a/internal-packages/redis/src/index.ts +++ b/internal-packages/redis/src/index.ts @@ -8,7 +8,7 @@ const defaultOptions: Partial = { const delay = Math.min(times * 50, 1000); return delay; }, - maxRetriesPerRequest: 20, + maxRetriesPerRequest: process.env.VITEST ? 1 : 20, }; const logger = new Logger("Redis", "debug"); diff --git a/internal-packages/run-engine/README.md b/internal-packages/run-engine/README.md index a2ca8fda22..499242da21 100644 --- a/internal-packages/run-engine/README.md +++ b/internal-packages/run-engine/README.md @@ -24,6 +24,7 @@ It is responsible for: Many operations on the run are "atomic" in the sense that only a single operation can mutate them at a time. We use RedLock to create a distributed lock to ensure this. Postgres locking is not enough on its own because we have multiple API instances and Redis is used for the queue. There are race conditions we need to deal with: + - When checkpointing the run continues to execute until the checkpoint has been stored. 
At the same time the run continues and the checkpoint can become irrelevant if the waitpoint is completed. Both can happen at the same time, so we must lock the run and protect against outdated checkpoints. ## Run execution @@ -41,6 +42,7 @@ We can also store invalid states by setting an error. These invalid states are p ## Workers A worker is a server that runs tasks. There are two types of workers: + - Hosted workers (serverless, managed and cloud-only) - Self-hosted workers @@ -67,6 +69,7 @@ If there's only a `workerGroup`, we can just `dequeueFromMasterQueue()` to get r This is a fair multi-tenant queue. It is designed to fairly select runs, respect concurrency limits, and have high throughput. It provides visibility into the current concurrency for the env, org, etc. It has built-in reliability features: + - When nacking we increment the `attempt` and if it continually fails we will move it to a Dead Letter Queue (DLQ). - If a run is in the DLQ you can redrive it. @@ -87,23 +90,26 @@ A single Waitpoint can block many runs, the same waitpoint can only block a run They can have output data associated with them, e.g. the finished run payload. That includes an error, e.g. a failed run. There are currently three types: - - `RUN` which gets completed when the associated run completes. Every run has an `associatedWaitpoint` that matches the lifetime of the run. - - `DATETIME` which gets completed when the datetime is reached. - - `MANUAL` which gets completed when that event occurs. + +- `RUN` which gets completed when the associated run completes. Every run has an `associatedWaitpoint` that matches the lifetime of the run. +- `DATETIME` which gets completed when the datetime is reached. +- `MANUAL` which gets completed when that event occurs. Waitpoints can have an idempotencyKey which allows stops them from being created multiple times. This is especially useful for event waitpoints, where you don't want to create a new waitpoint for the same event twice. 
### `wait.for()` or `wait.until()` + Wait for a future time, then continue. We should add the option to pass an `idempotencyKey` so a second attempt doesn't wait again. By default it would wait again. ```ts //Note if the idempotency key is a string, it will get prefixed with the run id. //you can explicitly pass in an idempotency key created with the the global scope. -await wait.until(new Date('2022-01-01T00:00:00Z'), { idempotencyKey: "first-wait" }); -await wait.until(new Date('2022-01-01T00:00:00Z'), { idempotencyKey: "second-wait" }); +await wait.until(new Date("2022-01-01T00:00:00Z"), { idempotencyKey: "first-wait" }); +await wait.until(new Date("2022-01-01T00:00:00Z"), { idempotencyKey: "second-wait" }); ``` ### `triggerAndWait()` or `batchTriggerAndWait()` + Trigger and then wait for run(s) to finish. If the run fails it will still continue but with the errors so the developer can decide what to do. ### The `trigger` `delay` option @@ -111,6 +117,7 @@ Trigger and then wait for run(s) to finish. If the run fails it will still conti When triggering a run and passing the `delay` option, we use a `DATETIME` waitpoint to block the run from starting. ### `wait.forRequest()` + Wait until a request has been received at the URL that you are given. This is useful for pausing a run and then continuing it again when some external event occurs on another service. For example, Replicate have an API where they will callback when their work is complete. ### `wait.forWaitpoint(waitpointId)` @@ -155,6 +162,7 @@ When `trigger` is called the run is added to the queue. We only dequeue when the When `trigger` is called, we check if the rate limit has been exceeded. If it has then we ignore the trigger. The run is thrown away and an appropriate error is returned. This is useful: + - To prevent abuse. - To control how many executions a user can do (using a `key` with rate limiting). 
@@ -163,6 +171,7 @@ This is useful: When `trigger` is called, we prevent too many runs happening in a period by collapsing into a single run. This is done by discarding some runs in a period. This is useful: + - To prevent too many runs happening in a short period. We should mark the run as `"DELAYED"` with the correct `delayUntil` time. This will allow the user to see that the run is delayed and why. @@ -172,6 +181,7 @@ We should mark the run as `"DELAYED"` with the correct `delayUntil` time. This w When `trigger` is called the run is added to the queue. We only run them when they don't exceed the limit in that time period, by controlling the timing of when they are dequeued. This is useful: + - To prevent too many runs happening in a short period. - To control how many executions a user can do (using a `key` with throttling). - When you need to execute every run but not too many in a short period, e.g. avoiding rate limits. @@ -181,9 +191,140 @@ This is useful: When `trigger` is called, we batch the runs together. This means the payload of the run is an array of items, each being a single payload. This is useful: + - For performance, as it reduces the number of runs in the system. - It can be useful when using 3rd party APIs that support batching. ## Emitting events The Run Engine emits events using its `eventBus`. This is used for runs completing, failing, or things that any workers should be aware of. + +# RunEngine System Architecture + +The RunEngine is composed of several specialized systems that handle different aspects of task execution and management. Below is a diagram showing the relationships between these systems. 
+ +```mermaid +graph TD + RE[RunEngine] + DS[DequeueSystem] + RAS[RunAttemptSystem] + ESS[ExecutionSnapshotSystem] + WS[WaitpointSystem] + BS[BatchSystem] + ES[EnqueueSystem] + CS[CheckpointSystem] + DRS[DelayedRunSystem] + TS[TtlSystem] + WFS[WaitingForWorkerSystem] + + %% Core Dependencies + RE --> DS + RE --> RAS + RE --> ESS + RE --> WS + RE --> BS + RE --> ES + RE --> CS + RE --> DRS + RE --> TS + RE --> WFS + + %% System Dependencies + DS --> ESS + DS --> RAS + + RAS --> ESS + RAS --> WS + RAS --> BS + + WS --> ESS + WS --> ES + + ES --> ESS + + CS --> ESS + CS --> ES + + DRS --> ES + + WFS --> ES + + TS --> WS + + %% Shared Resources + subgraph Resources + PRI[(Prisma)] + LOG[Logger] + TRC[Tracer] + RQ[RunQueue] + RL[RunLocker] + EB[EventBus] + WRK[Worker] + RCQ[ReleaseConcurrencyQueue] + end + + %% Resource Dependencies + RE -.-> Resources + DS & RAS & ESS & WS & BS & ES & CS & DRS & TS & WFS -.-> Resources +``` + +## System Responsibilities + +### DequeueSystem + +- Handles dequeuing of tasks from master queues +- Manages resource allocation and constraints +- Handles task deployment verification + +### RunAttemptSystem + +- Manages run attempt lifecycle +- Handles success/failure scenarios +- Manages retries and cancellations +- Coordinates with other systems for run completion + +### ExecutionSnapshotSystem + +- Creates and manages execution snapshots +- Tracks run state and progress +- Manages heartbeats for active runs +- Maintains execution history + +### WaitpointSystem + +- Manages waitpoints for task synchronization +- Handles waitpoint completion +- Coordinates blocked runs +- Manages concurrency release + +### BatchSystem + +- Manages batch operations +- Handles batch completion +- Coordinates batch-related task runs + +### EnqueueSystem + +- Handles enqueueing of runs +- Manages run scheduling +- Coordinates with execution snapshots + +## Shared Resources + +- **Prisma**: Database access +- **Logger**: Logging functionality +- **Tracer**: 
Tracing and monitoring +- **RunQueue**: Task queue management +- **RunLocker**: Run locking mechanism +- **EventBus**: Event communication +- **Worker**: Background task execution +- **ReleaseConcurrencyQueue**: Manages concurrency token release + +## Key Interactions + +1. **RunEngine** orchestrates all systems and manages shared resources +2. **DequeueSystem** works with **RunAttemptSystem** for task execution +3. **RunAttemptSystem** coordinates with **WaitpointSystem** and **BatchSystem** +4. **WaitpointSystem** uses **EnqueueSystem** for run scheduling +5. **ExecutionSnapshotSystem** is used by all other systems to track state +6. All systems share common resources through the `SystemResources` interface diff --git a/internal-packages/run-engine/execution-states.png b/internal-packages/run-engine/execution-states.png index cc156dd7de..7bc7e0d0f9 100644 Binary files a/internal-packages/run-engine/execution-states.png and b/internal-packages/run-engine/execution-states.png differ diff --git a/internal-packages/run-engine/package.json b/internal-packages/run-engine/package.json index 450ea1cb06..ebf084dd53 100644 --- a/internal-packages/run-engine/package.json +++ b/internal-packages/run-engine/package.json @@ -19,23 +19,25 @@ "@internal/tracing": "workspace:*", "@trigger.dev/core": "workspace:*", "@trigger.dev/database": "workspace:*", + "@unkey/cache": "^1.5.0", "assert-never": "^1.2.1", "nanoid": "^3.3.4", "redlock": "5.0.0-beta.2", - "zod": "3.23.8", - "@unkey/cache": "^1.5.0", - "seedrandom": "^3.0.5" + "seedrandom": "^3.0.5", + "zod": "3.23.8" }, "devDependencies": { "@internal/testcontainers": "workspace:*", - "vitest": "^1.4.0", "@types/seedrandom": "^3.0.8", - "rimraf": "6.0.1" + "@vitest/coverage-v8": "^3.0.8", + "rimraf": "6.0.1", + "vitest": "^3.0.8" }, "scripts": { "clean": "rimraf dist", "typecheck": "tsc --noEmit -p tsconfig.build.json", "test": "vitest --sequence.concurrent=false --no-file-parallelism", + "test:coverage": "vitest 
--sequence.concurrent=false --no-file-parallelism --coverage.enabled", "build": "pnpm run clean && tsc -p tsconfig.build.json", "dev": "tsc --watch -p tsconfig.build.json" } diff --git a/internal-packages/run-engine/src/engine/errors.ts b/internal-packages/run-engine/src/engine/errors.ts index 33d9be6961..81e3b598bd 100644 --- a/internal-packages/run-engine/src/engine/errors.ts +++ b/internal-packages/run-engine/src/engine/errors.ts @@ -56,3 +56,27 @@ export function runStatusFromError(error: TaskRunError): TaskRunStatus { assertExhaustive(error.code); } } + +export class ServiceValidationError extends Error { + constructor( + message: string, + public status?: number + ) { + super(message); + this.name = "ServiceValidationError"; + } +} + +export class NotImplementedError extends Error { + constructor(message: string) { + console.error("This isn't implemented", { message }); + super(message); + } +} + +export class RunDuplicateIdempotencyKeyError extends Error { + constructor(message: string) { + super(message); + this.name = "RunDuplicateIdempotencyKeyError"; + } +} diff --git a/internal-packages/run-engine/src/engine/eventBus.ts b/internal-packages/run-engine/src/engine/eventBus.ts index 0ad687f27c..c64d0b2c11 100644 --- a/internal-packages/run-engine/src/engine/eventBus.ts +++ b/internal-packages/run-engine/src/engine/eventBus.ts @@ -1,6 +1,7 @@ import { TaskRunExecutionStatus, TaskRunStatus } from "@trigger.dev/database"; import { AuthenticatedEnvironment } from "../shared/index.js"; import { FlushedRunMetadata, TaskRunError } from "@trigger.dev/core/v3"; +import { EventEmitter } from "events"; export type EventBusEvents = { runAttemptStarted: [ @@ -178,3 +179,33 @@ export type EventBusEvents = { }; export type EventBusEventArgs = EventBusEvents[T]; + +export type EventBus = EventEmitter; + +/** + * Sends a notification that a run has changed and we need to fetch the latest run state. 
+ * The worker will call `getRunExecutionData` via the API and act accordingly. + */ +export async function sendNotificationToWorker({ + runId, + snapshot, + eventBus, +}: { + runId: string; + snapshot: { + id: string; + executionStatus: TaskRunExecutionStatus; + }; + eventBus: EventBus; +}) { + eventBus.emit("workerNotification", { + time: new Date(), + run: { + id: runId, + }, + snapshot: { + id: snapshot.id, + executionStatus: snapshot.executionStatus, + }, + }); +} diff --git a/internal-packages/run-engine/src/engine/executionSnapshots.ts b/internal-packages/run-engine/src/engine/executionSnapshots.ts deleted file mode 100644 index 3f2cec09a3..0000000000 --- a/internal-packages/run-engine/src/engine/executionSnapshots.ts +++ /dev/null @@ -1,131 +0,0 @@ -import { CompletedWaitpoint, ExecutionResult } from "@trigger.dev/core/v3"; -import { BatchId, RunId, SnapshotId } from "@trigger.dev/core/v3/isomorphic"; -import { - PrismaClientOrTransaction, - TaskRunCheckpoint, - TaskRunExecutionSnapshot, -} from "@trigger.dev/database"; - -interface LatestExecutionSnapshot extends TaskRunExecutionSnapshot { - friendlyId: string; - runFriendlyId: string; - checkpoint: TaskRunCheckpoint | null; - completedWaitpoints: CompletedWaitpoint[]; -} - -/* Gets the most recent valid snapshot for a run */ -export async function getLatestExecutionSnapshot( - prisma: PrismaClientOrTransaction, - runId: string -): Promise { - const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({ - where: { runId, isValid: true }, - include: { - completedWaitpoints: true, - checkpoint: true, - }, - orderBy: { createdAt: "desc" }, - }); - - if (!snapshot) { - throw new Error(`No execution snapshot found for TaskRun ${runId}`); - } - - return { - ...snapshot, - friendlyId: SnapshotId.toFriendlyId(snapshot.id), - runFriendlyId: RunId.toFriendlyId(snapshot.runId), - completedWaitpoints: snapshot.completedWaitpoints.flatMap((w) => { - //get all indexes of the waitpoint in the 
completedWaitpointOrder - //we do this because the same run can be in a batch multiple times (i.e. same idempotencyKey) - let indexes: (number | undefined)[] = []; - for (let i = 0; i < snapshot.completedWaitpointOrder.length; i++) { - if (snapshot.completedWaitpointOrder[i] === w.id) { - indexes.push(i); - } - } - - if (indexes.length === 0) { - indexes.push(undefined); - } - - return indexes.map((index) => { - return { - id: w.id, - index: index === -1 ? undefined : index, - friendlyId: w.friendlyId, - type: w.type, - completedAt: w.completedAt ?? new Date(), - idempotencyKey: - w.userProvidedIdempotencyKey && !w.inactiveIdempotencyKey - ? w.idempotencyKey - : undefined, - completedByTaskRun: w.completedByTaskRunId - ? { - id: w.completedByTaskRunId, - friendlyId: RunId.toFriendlyId(w.completedByTaskRunId), - batch: snapshot.batchId - ? { - id: snapshot.batchId, - friendlyId: BatchId.toFriendlyId(snapshot.batchId), - } - : undefined, - } - : undefined, - completedAfter: w.completedAfter ?? undefined, - completedByBatch: w.completedByBatchId - ? { - id: w.completedByBatchId, - friendlyId: BatchId.toFriendlyId(w.completedByBatchId), - } - : undefined, - output: w.output ?? undefined, - outputType: w.outputType, - outputIsError: w.outputIsError, - } satisfies CompletedWaitpoint; - }); - }), - }; -} - -export async function getExecutionSnapshotCompletedWaitpoints( - prisma: PrismaClientOrTransaction, - snapshotId: string -) { - const waitpoints = await prisma.taskRunExecutionSnapshot.findFirst({ - where: { id: snapshotId }, - include: { - completedWaitpoints: true, - }, - }); - - //deduplicate waitpoints - const waitpointIds = new Set(); - return ( - waitpoints?.completedWaitpoints.filter((waitpoint) => { - if (waitpointIds.has(waitpoint.id)) { - return false; - } else { - waitpointIds.add(waitpoint.id); - return true; - } - }) ?? 
[] - ); -} - -export function executionResultFromSnapshot(snapshot: TaskRunExecutionSnapshot): ExecutionResult { - return { - snapshot: { - id: snapshot.id, - friendlyId: SnapshotId.toFriendlyId(snapshot.id), - executionStatus: snapshot.executionStatus, - description: snapshot.description, - }, - run: { - id: snapshot.runId, - friendlyId: RunId.toFriendlyId(snapshot.runId), - status: snapshot.runStatus, - attemptNumber: snapshot.attemptNumber, - }, - }; -} diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 46556b9588..1e5a949349 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1,7 +1,6 @@ import { createRedisClient, Redis } from "@internal/redis"; import { Worker } from "@internal/redis-worker"; -import { Attributes, Span, SpanKind, trace, Tracer } from "@internal/tracing"; -import { assertExhaustive } from "@trigger.dev/core"; +import { startSpan, trace, Tracer } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { CheckpointInput, @@ -10,119 +9,49 @@ import { DequeuedMessage, ExecutionResult, MachineResources, - parsePacket, - RetryOptions, RunExecutionData, - sanitizeError, - shouldRetryError, StartRunAttemptResult, - TaskRunError, - taskRunErrorEnhancer, - TaskRunExecution, TaskRunExecutionResult, - TaskRunFailedExecutionResult, - TaskRunInternalError, - TaskRunSuccessfulExecutionResult, - timeoutError, } from "@trigger.dev/core/v3"; +import { BatchId, QueueId, RunId, WaitpointId } from "@trigger.dev/core/v3/isomorphic"; import { - BatchId, - CheckpointId, - getMaxDuration, - parseNaturalLanguageDuration, - QueueId, - RunId, - sanitizeQueueName, - SnapshotId, - WaitpointId, -} from "@trigger.dev/core/v3/isomorphic"; -import { - $transaction, Prisma, PrismaClient, PrismaClientOrTransaction, - RuntimeEnvironmentType, TaskRun, TaskRunExecutionSnapshot, - TaskRunExecutionStatus, - TaskRunStatus, 
Waitpoint, } from "@trigger.dev/database"; import { assertNever } from "assert-never"; -import { nanoid } from "nanoid"; import { EventEmitter } from "node:events"; -import { z } from "zod"; -import { RunQueue } from "../run-queue/index.js"; import { FairQueueSelectionStrategy } from "../run-queue/fairQueueSelectionStrategy.js"; +import { RunQueue } from "../run-queue/index.js"; +import { RunQueueFullKeyProducer } from "../run-queue/keyProducer.js"; import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; -import { MAX_TASK_RUN_ATTEMPTS } from "./consts.js"; -import { getRunWithBackgroundWorkerTasks } from "./db/worker.js"; -import { runStatusFromError } from "./errors.js"; -import { EventBusEvents } from "./eventBus.js"; -import { executionResultFromSnapshot, getLatestExecutionSnapshot } from "./executionSnapshots.js"; +import { + NotImplementedError, + RunDuplicateIdempotencyKeyError, + ServiceValidationError, +} from "./errors.js"; +import { EventBus, EventBusEvents } from "./eventBus.js"; import { RunLocker } from "./locking.js"; -import { getMachinePreset } from "./machinePresets.js"; +import { BatchSystem } from "./systems/batchSystem.js"; +import { CheckpointSystem } from "./systems/checkpointSystem.js"; +import { DelayedRunSystem } from "./systems/delayedRunSystem.js"; +import { DequeueSystem } from "./systems/dequeueSystem.js"; +import { EnqueueSystem } from "./systems/enqueueSystem.js"; import { - isCheckpointable, - isDequeueableExecutionStatus, - isExecuting, - isFinalRunStatus, - isPendingExecuting, -} from "./statuses.js"; -import { HeartbeatTimeouts, RunEngineOptions, TriggerParams } from "./types.js"; -import { RunQueueFullKeyProducer } from "../run-queue/keyProducer.js"; -import { retryOutcomeFromCompletion } from "./retrying.js"; - -const workerCatalog = { - finishWaitpoint: { - schema: z.object({ - waitpointId: z.string(), - error: z.string().optional(), - }), - visibilityTimeoutMs: 5000, - }, - heartbeatSnapshot: { - schema: 
z.object({ - runId: z.string(), - snapshotId: z.string(), - }), - visibilityTimeoutMs: 5000, - }, - expireRun: { - schema: z.object({ - runId: z.string(), - }), - visibilityTimeoutMs: 5000, - }, - cancelRun: { - schema: z.object({ - runId: z.string(), - completedAt: z.coerce.date(), - reason: z.string().optional(), - }), - visibilityTimeoutMs: 5000, - }, - queueRunsWaitingForWorker: { - schema: z.object({ - backgroundWorkerId: z.string(), - }), - visibilityTimeoutMs: 5000, - }, - tryCompleteBatch: { - schema: z.object({ - batchId: z.string(), - }), - visibilityTimeoutMs: 10_000, - }, - continueRunIfUnblocked: { - schema: z.object({ - runId: z.string(), - }), - visibilityTimeoutMs: 10_000, - }, -}; - -type EngineWorker = Worker; + ExecutionSnapshotSystem, + getLatestExecutionSnapshot, +} from "./systems/executionSnapshotSystem.js"; +import { ReleaseConcurrencySystem } from "./systems/releaseConcurrencySystem.js"; +import { RunAttemptSystem } from "./systems/runAttemptSystem.js"; +import { SystemResources } from "./systems/systems.js"; +import { TtlSystem } from "./systems/ttlSystem.js"; +import { WaitingForWorkerSystem } from "./systems/waitingForWorkerSystem.js"; +import { WaitpointSystem } from "./systems/waitpointSystem.js"; +import { EngineWorker, HeartbeatTimeouts, RunEngineOptions, TriggerParams } from "./types.js"; +import { workerCatalog } from "./workerCatalog.js"; export class RunEngine { private runLockRedis: Redis; @@ -133,7 +62,18 @@ export class RunEngine { private logger = new Logger("RunEngine", "debug"); private tracer: Tracer; private heartbeatTimeouts: HeartbeatTimeouts; - eventBus = new EventEmitter(); + eventBus: EventBus = new EventEmitter(); + executionSnapshotSystem: ExecutionSnapshotSystem; + runAttemptSystem: RunAttemptSystem; + dequeueSystem: DequeueSystem; + waitpointSystem: WaitpointSystem; + batchSystem: BatchSystem; + enqueueSystem: EnqueueSystem; + checkpointSystem: CheckpointSystem; + delayedRunSystem: DelayedRunSystem; + ttlSystem: 
TtlSystem; + waitingForWorkerSystem: WaitingForWorkerSystem; + releaseConcurrencySystem: ReleaseConcurrencySystem; constructor(private readonly options: RunEngineOptions) { this.prisma = options.prisma; @@ -183,7 +123,7 @@ export class RunEngine { logger: new Logger("RunEngineWorker", "debug"), jobs: { finishWaitpoint: async ({ payload }) => { - await this.completeWaitpoint({ + await this.waitpointSystem.completeWaitpoint({ id: payload.waitpointId, output: payload.error ? { @@ -197,26 +137,31 @@ export class RunEngine { await this.#handleStalledSnapshot(payload); }, expireRun: async ({ payload }) => { - await this.#expireRun({ runId: payload.runId }); + await this.ttlSystem.expireRun({ runId: payload.runId }); }, cancelRun: async ({ payload }) => { - await this.cancelRun({ + await this.runAttemptSystem.cancelRun({ runId: payload.runId, completedAt: payload.completedAt, reason: payload.reason, }); }, queueRunsWaitingForWorker: async ({ payload }) => { - await this.#queueRunsWaitingForWorker({ backgroundWorkerId: payload.backgroundWorkerId }); + await this.waitingForWorkerSystem.enqueueRunsWaitingForWorker({ + backgroundWorkerId: payload.backgroundWorkerId, + }); }, tryCompleteBatch: async ({ payload }) => { - await this.#tryCompleteBatch({ batchId: payload.batchId }); + await this.batchSystem.performCompleteBatch({ batchId: payload.batchId }); }, continueRunIfUnblocked: async ({ payload }) => { - await this.#continueRunIfUnblocked({ + await this.waitpointSystem.continueRunIfUnblocked({ runId: payload.runId, }); }, + enqueueDelayedRun: async ({ payload }) => { + await this.delayedRunSystem.enqueueDelayedRun({ runId: payload.runId }); + }, }, }).start(); @@ -232,6 +177,128 @@ export class RunEngine { ...defaultHeartbeatTimeouts, ...(options.heartbeatTimeoutsMs ?? 
{}), }; + + const resources: SystemResources = { + prisma: this.prisma, + worker: this.worker, + eventBus: this.eventBus, + logger: this.logger, + tracer: this.tracer, + runLock: this.runLock, + runQueue: this.runQueue, + }; + + this.releaseConcurrencySystem = new ReleaseConcurrencySystem({ + resources, + queueOptions: + typeof options.releaseConcurrency?.disabled === "boolean" && + options.releaseConcurrency.disabled + ? undefined + : { + redis: { + ...options.queue.redis, // Use base queue redis options + ...options.releaseConcurrency?.redis, // Allow overrides + keyPrefix: `${options.queue.redis.keyPrefix ?? ""}release-concurrency:`, + }, + retry: { + maxRetries: options.releaseConcurrency?.maxRetries ?? 5, + backoff: { + minDelay: options.releaseConcurrency?.backoff?.minDelay ?? 1000, + maxDelay: options.releaseConcurrency?.backoff?.maxDelay ?? 10000, + factor: options.releaseConcurrency?.backoff?.factor ?? 2, + }, + }, + consumersCount: options.releaseConcurrency?.consumersCount ?? 1, + pollInterval: options.releaseConcurrency?.pollInterval ?? 1000, + batchSize: options.releaseConcurrency?.batchSize ?? 10, + executor: async (descriptor, snapshotId) => { + await this.releaseConcurrencySystem.executeReleaseConcurrencyForSnapshot( + snapshotId + ); + }, + maxTokens: async (descriptor) => { + const environment = await this.prisma.runtimeEnvironment.findFirstOrThrow({ + where: { id: descriptor.envId }, + select: { + maximumConcurrencyLimit: true, + }, + }); + + return ( + environment.maximumConcurrencyLimit * + (options.releaseConcurrency?.maxTokensRatio ?? 
1.0) + ); + }, + keys: { + fromDescriptor: (descriptor) => + `org:${descriptor.orgId}:proj:${descriptor.projectId}:env:${descriptor.envId}`, + toDescriptor: (name) => ({ + orgId: name.split(":")[1], + projectId: name.split(":")[3], + envId: name.split(":")[5], + }), + }, + tracer: this.tracer, + }, + }); + + this.executionSnapshotSystem = new ExecutionSnapshotSystem({ + resources, + heartbeatTimeouts: this.heartbeatTimeouts, + }); + + this.enqueueSystem = new EnqueueSystem({ + resources, + executionSnapshotSystem: this.executionSnapshotSystem, + }); + + this.checkpointSystem = new CheckpointSystem({ + resources, + releaseConcurrencySystem: this.releaseConcurrencySystem, + executionSnapshotSystem: this.executionSnapshotSystem, + enqueueSystem: this.enqueueSystem, + }); + + this.delayedRunSystem = new DelayedRunSystem({ + resources, + enqueueSystem: this.enqueueSystem, + }); + + this.waitingForWorkerSystem = new WaitingForWorkerSystem({ + resources, + enqueueSystem: this.enqueueSystem, + }); + + this.waitpointSystem = new WaitpointSystem({ + resources, + executionSnapshotSystem: this.executionSnapshotSystem, + enqueueSystem: this.enqueueSystem, + releaseConcurrencySystem: this.releaseConcurrencySystem, + }); + + this.ttlSystem = new TtlSystem({ + resources, + waitpointSystem: this.waitpointSystem, + }); + + this.batchSystem = new BatchSystem({ + resources, + }); + + this.runAttemptSystem = new RunAttemptSystem({ + resources, + executionSnapshotSystem: this.executionSnapshotSystem, + batchSystem: this.batchSystem, + waitpointSystem: this.waitpointSystem, + machines: this.options.machines, + }); + + this.dequeueSystem = new DequeueSystem({ + resources, + executionSnapshotSystem: this.executionSnapshotSystem, + runAttemptSystem: this.runAttemptSystem, + machines: this.options.machines, + }); } //MARK: - Run functions @@ -282,19 +349,15 @@ export class RunEngine { machine, workerId, runnerId, + releaseConcurrency, }: TriggerParams, tx?: PrismaClientOrTransaction ): 
Promise { const prisma = tx ?? this.prisma; - return this.#trace( + return startSpan( + this.tracer, "trigger", - { - friendlyId, - environmentId: environment.id, - projectId: environment.project.id, - taskIdentifier, - }, async (span) => { const status = delayUntil ? "DELAYED" : "PENDING"; @@ -378,6 +441,8 @@ export class RunEngine { runStatus: status, environmentId: environment.id, environmentType: environment.type, + projectId: environment.project.id, + organizationId: environment.organization.id, workerId, runnerId, }, @@ -417,43 +482,28 @@ export class RunEngine { await this.runLock.lock([taskRun.id], 5000, async (signal) => { //create associated waitpoint (this completes when the run completes) - const associatedWaitpoint = await this.#createRunAssociatedWaitpoint(prisma, { - projectId: environment.project.id, - environmentId: environment.id, - completedByTaskRunId: taskRun.id, - }); + const associatedWaitpoint = await this.waitpointSystem.createRunAssociatedWaitpoint( + prisma, + { + projectId: environment.project.id, + environmentId: environment.id, + completedByTaskRunId: taskRun.id, + } + ); //triggerAndWait or batchTriggerAndWait if (resumeParentOnCompletion && parentTaskRunId) { //this will block the parent run from continuing until this waitpoint is completed (and removed) - await this.blockRunWithWaitpoint({ + await this.waitpointSystem.blockRunWithWaitpoint({ runId: parentTaskRunId, waitpoints: associatedWaitpoint.id, - environmentId: associatedWaitpoint.environmentId, projectId: associatedWaitpoint.projectId, - organizationId: environment.organization.id, batch, workerId, runnerId, tx: prisma, + releaseConcurrency, }); - - //release the concurrency - //if the queue is the same then it's recursive and we need to release that too otherwise we could have a deadlock - const parentRun = await prisma.taskRun.findUnique({ - select: { - queue: true, - }, - where: { - id: parentTaskRunId, - }, - }); - const releaseRunConcurrency = parentRun?.queue === 
taskRun.queue; - await this.runQueue.releaseConcurrency( - environment.organization.id, - parentTaskRunId, - releaseRunConcurrency - ); } //Make sure lock extension succeeded @@ -516,42 +566,17 @@ export class RunEngine { } } - if (taskRun.delayUntil) { - const delayWaitpointResult = await this.createDateTimeWaitpoint({ - projectId: environment.project.id, - environmentId: environment.id, - completedAfter: taskRun.delayUntil, - tx: prisma, - }); - - await prisma.taskRunWaitpoint.create({ - data: { - taskRunId: taskRun.id, - waitpointId: delayWaitpointResult.waitpoint.id, - projectId: delayWaitpointResult.waitpoint.projectId, - }, - }); - } - - if (!taskRun.delayUntil && taskRun.ttl) { - const expireAt = parseNaturalLanguageDuration(taskRun.ttl); - - if (expireAt) { - await this.worker.enqueue({ - id: `expireRun:${taskRun.id}`, - job: "expireRun", - payload: { runId: taskRun.id }, - availableAt: expireAt, - }); - } - } - //Make sure lock extension succeeded signal.throwIfAborted(); - //enqueue the run if it's not delayed - if (!taskRun.delayUntil) { - await this.#enqueueRun({ + if (taskRun.delayUntil) { + // Schedule the run to be enqueued at the delayUntil time + await this.delayedRunSystem.scheduleDelayedRunEnqueuing({ + runId: taskRun.id, + delayUntil: taskRun.delayUntil, + }); + } else { + await this.enqueueSystem.enqueueRun({ run: taskRun, env: environment, timestamp: Date.now() - taskRun.priorityMs, @@ -559,10 +584,22 @@ export class RunEngine { runnerId, tx: prisma, }); + + if (taskRun.ttl) { + await this.ttlSystem.scheduleExpireRun({ runId: taskRun.id, ttl: taskRun.ttl }); + } } }); return taskRun; + }, + { + attributes: { + friendlyId, + environmentId: environment.id, + projectId: environment.project.id, + taskIdentifier, + }, } ); } @@ -592,437 +629,15 @@ export class RunEngine { runnerId?: string; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? 
this.prisma; - return this.#trace("dequeueFromMasterQueue", { consumerId, masterQueue }, async (span) => { - //gets multiple runs from the queue - const messages = await this.runQueue.dequeueMessageFromMasterQueue( - consumerId, - masterQueue, - maxRunCount - ); - if (messages.length === 0) { - return []; - } - - //we can't send more than the max resources - const consumedResources: MachineResources = { - cpu: 0, - memory: 0, - }; - - const dequeuedRuns: DequeuedMessage[] = []; - - for (const message of messages) { - const orgId = message.message.orgId; - const runId = message.messageId; - - span.setAttribute("runId", runId); - - //lock the run so nothing else can modify it - try { - const dequeuedRun = await this.runLock.lock([runId], 5000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - if (!isDequeueableExecutionStatus(snapshot.executionStatus)) { - //create a failed snapshot - await this.#createExecutionSnapshot(prisma, { - run: { - id: snapshot.runId, - status: snapshot.runStatus, - }, - snapshot: { - executionStatus: snapshot.executionStatus, - description: - "Tried to dequeue a run that is not in a valid state to be dequeued.", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - checkpointId: snapshot.checkpointId ?? undefined, - completedWaitpoints: snapshot.completedWaitpoints, - error: `Tried to dequeue a run that is not in a valid state to be dequeued.`, - workerId, - runnerId, - }); - - //todo is there a way to recover this, so the run can be retried? - //for example should we update the status to a dequeuable status and nack it? 
- //then at least it has a chance of succeeding and we have the error log above - await this.#systemFailure({ - runId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_DEQUEUED_INVALID_STATE", - message: `Task was in the ${snapshot.executionStatus} state when it was dequeued for execution.`, - }, - tx: prisma, - }); - this.logger.error( - `RunEngine.dequeueFromMasterQueue(): Run is not in a valid state to be dequeued: ${runId}\n ${snapshot.id}:${snapshot.executionStatus}` - ); - return null; - } - - const result = await getRunWithBackgroundWorkerTasks(prisma, runId, backgroundWorkerId); - - if (!result.success) { - switch (result.code) { - case "NO_RUN": { - //this should not happen, the run is unrecoverable so we'll ack it - this.logger.error("RunEngine.dequeueFromMasterQueue(): No run found", { - runId, - latestSnapshot: snapshot.id, - }); - await this.runQueue.acknowledgeMessage(orgId, runId); - return null; - } - case "NO_WORKER": - case "TASK_NEVER_REGISTERED": - case "TASK_NOT_IN_LATEST": { - this.logger.warn(`RunEngine.dequeueFromMasterQueue(): ${result.code}`, { - runId, - latestSnapshot: snapshot.id, - result, - }); - - //not deployed yet, so we'll wait for the deploy - await this.#waitingForDeploy({ - orgId, - runId, - reason: result.message, - tx: prisma, - }); - return null; - } - case "BACKGROUND_WORKER_MISMATCH": { - this.logger.warn( - "RunEngine.dequeueFromMasterQueue(): Background worker mismatch", - { - runId, - latestSnapshot: snapshot.id, - result, - } - ); - - //worker mismatch so put it back in the queue - await this.runQueue.nackMessage({ orgId, messageId: runId }); - - return null; - } - default: { - assertExhaustive(result); - } - } - } - - //check for a valid deployment if it's not a development environment - if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") { - if (!result.deployment || !result.deployment.imageReference) { - this.logger.warn("RunEngine.dequeueFromMasterQueue(): No deployment found", { - runId, - latestSnapshot: 
snapshot.id, - result, - }); - //not deployed yet, so we'll wait for the deploy - await this.#waitingForDeploy({ - orgId, - runId, - reason: "No deployment or deployment image reference found for deployed run", - tx: prisma, - }); - - return null; - } - } - - const machinePreset = getMachinePreset({ - machines: this.options.machines.machines, - defaultMachine: this.options.machines.defaultMachine, - config: result.task.machineConfig ?? {}, - run: result.run, - }); - - //increment the consumed resources - consumedResources.cpu += machinePreset.cpu; - consumedResources.memory += machinePreset.memory; - - //are we under the limit? - if (maxResources) { - if ( - consumedResources.cpu > maxResources.cpu || - consumedResources.memory > maxResources.memory - ) { - this.logger.debug( - "RunEngine.dequeueFromMasterQueue(): Consumed resources over limit, nacking", - { - runId, - consumedResources, - maxResources, - } - ); - - //put it back in the queue where it was - await this.runQueue.nackMessage({ - orgId, - messageId: runId, - incrementAttemptCount: false, - retryAt: result.run.createdAt.getTime() - result.run.priorityMs, - }); - return null; - } - } - - // Check max attempts that can optionally be set when triggering a run - let maxAttempts: number | null | undefined = result.run.maxAttempts; - - // If it's not set, we'll grab it from the task's retry config - if (!maxAttempts) { - const retryConfig = result.task.retryConfig; - - this.logger.debug( - "RunEngine.dequeueFromMasterQueue(): maxAttempts not set, using task's retry config", - { - runId, - task: result.task.id, - rawRetryConfig: retryConfig, - } - ); - - const parsedConfig = RetryOptions.nullable().safeParse(retryConfig); - - if (!parsedConfig.success) { - this.logger.error("RunEngine.dequeueFromMasterQueue(): Invalid retry config", { - runId, - task: result.task.id, - rawRetryConfig: retryConfig, - }); - - await this.#systemFailure({ - runId, - error: { - type: "INTERNAL_ERROR", - code: 
"TASK_DEQUEUED_INVALID_RETRY_CONFIG", - message: `Invalid retry config: ${retryConfig}`, - }, - tx: prisma, - }); - - return null; - } - - if (!parsedConfig.data) { - this.logger.error("RunEngine.dequeueFromMasterQueue(): No retry config", { - runId, - task: result.task.id, - rawRetryConfig: retryConfig, - }); - - await this.#systemFailure({ - runId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_DEQUEUED_NO_RETRY_CONFIG", - message: `No retry config found`, - }, - tx: prisma, - }); - - return null; - } - - maxAttempts = parsedConfig.data.maxAttempts; - } - - //update the run - const lockedTaskRun = await prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - lockedAt: new Date(), - lockedById: result.task.id, - lockedToVersionId: result.worker.id, - startedAt: result.run.startedAt ?? new Date(), - baseCostInCents: this.options.machines.baseCostInCents, - machinePreset: machinePreset.name, - taskVersion: result.worker.version, - sdkVersion: result.worker.sdkVersion, - cliVersion: result.worker.cliVersion, - maxDurationInSeconds: getMaxDuration( - result.run.maxDurationInSeconds, - result.task.maxDurationInSeconds - ), - maxAttempts: maxAttempts ?? 
undefined, - }, - include: { - runtimeEnvironment: true, - tags: true, - }, - }); - - if (!lockedTaskRun) { - this.logger.error("RunEngine.dequeueFromMasterQueue(): Failed to lock task run", { - taskRun: result.run.id, - taskIdentifier: result.run.taskIdentifier, - deployment: result.deployment?.id, - worker: result.worker.id, - task: result.task.id, - runId, - }); - - await this.runQueue.acknowledgeMessage(orgId, runId); - return null; - } - - const queue = await prisma.taskQueue.findUnique({ - where: { - runtimeEnvironmentId_name: { - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - name: sanitizeQueueName(lockedTaskRun.queue), - }, - }, - }); - - if (!queue) { - this.logger.debug( - "RunEngine.dequeueFromMasterQueue(): queue not found, so nacking message", - { - queueMessage: message, - taskRunQueue: lockedTaskRun.queue, - runtimeEnvironmentId: lockedTaskRun.runtimeEnvironmentId, - } - ); - - //will auto-retry - const gotRequeued = await this.runQueue.nackMessage({ orgId, messageId: runId }); - if (!gotRequeued) { - await this.#systemFailure({ - runId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_DEQUEUED_QUEUE_NOT_FOUND", - message: `Tried to dequeue the run but the queue doesn't exist: ${lockedTaskRun.queue}`, - }, - tx: prisma, - }); - } - - return null; - } - - const currentAttemptNumber = lockedTaskRun.attemptNumber ?? 0; - const nextAttemptNumber = currentAttemptNumber + 1; - - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run: { - id: runId, - status: snapshot.runStatus, - attemptNumber: lockedTaskRun.attemptNumber, - }, - snapshot: { - executionStatus: "PENDING_EXECUTING", - description: "Run was dequeued for execution", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - checkpointId: snapshot.checkpointId ?? 
undefined, - completedWaitpoints: snapshot.completedWaitpoints, - workerId, - runnerId, - }); - - return { - version: "1" as const, - dequeuedAt: new Date(), - snapshot: { - id: newSnapshot.id, - friendlyId: newSnapshot.friendlyId, - executionStatus: newSnapshot.executionStatus, - description: newSnapshot.description, - }, - image: result.deployment?.imageReference ?? undefined, - checkpoint: newSnapshot.checkpoint ?? undefined, - completedWaitpoints: snapshot.completedWaitpoints, - backgroundWorker: { - id: result.worker.id, - friendlyId: result.worker.friendlyId, - version: result.worker.version, - }, - deployment: { - id: result.deployment?.id, - friendlyId: result.deployment?.friendlyId, - }, - run: { - id: lockedTaskRun.id, - friendlyId: lockedTaskRun.friendlyId, - isTest: lockedTaskRun.isTest, - machine: machinePreset, - attemptNumber: nextAttemptNumber, - masterQueue: lockedTaskRun.masterQueue, - traceContext: lockedTaskRun.traceContext as Record, - }, - environment: { - id: lockedTaskRun.runtimeEnvironment.id, - type: lockedTaskRun.runtimeEnvironment.type, - }, - organization: { - id: orgId, - }, - project: { - id: lockedTaskRun.projectId, - }, - } satisfies DequeuedMessage; - }); - - if (dequeuedRun !== null) { - dequeuedRuns.push(dequeuedRun); - } - } catch (error) { - this.logger.error( - "RunEngine.dequeueFromMasterQueue(): Thrown error while preparing run to be run", - { - error, - runId, - } - ); - - const run = await prisma.taskRun.findFirst({ - where: { id: runId }, - include: { - runtimeEnvironment: true, - }, - }); - - if (!run) { - //this isn't ideal because we're not creating a snapshot… but we can't do much else - this.logger.error( - "RunEngine.dequeueFromMasterQueue(): Thrown error, then run not found. 
Nacking.", - { - runId, - orgId, - } - ); - await this.runQueue.nackMessage({ orgId, messageId: runId }); - continue; - } - - //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ) - const gotRequeued = await this.#tryNackAndRequeue({ - run, - environment: run.runtimeEnvironment, - orgId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_RUN_DEQUEUED_MAX_RETRIES", - message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, - }, - tx: prisma, - }); - //we don't need this, but it makes it clear we're in a loop here - continue; - } - } - - return dequeuedRuns; + return this.dequeueSystem.dequeueFromMasterQueue({ + consumerId, + masterQueue, + maxRunCount, + maxResources, + backgroundWorkerId, + workerId, + runnerId, + tx, }); } @@ -1101,317 +716,38 @@ export class RunEngine { isWarmStart?: boolean; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? this.prisma; - - return this.#trace("startRunAttempt", { runId, snapshotId }, async (span) => { - return this.runLock.lock([runId], 5000, async (signal) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + return this.runAttemptSystem.startRunAttempt({ + runId, + snapshotId, + workerId, + runnerId, + isWarmStart, + tx, + }); + } - if (latestSnapshot.id !== snapshotId) { - //if there is a big delay between the snapshot and the attempt, the snapshot might have changed - //we just want to log because elsewhere it should have been put back into a state where it can be attempted - this.logger.warn( - "RunEngine.createRunAttempt(): snapshot has changed since the attempt was created, ignoring." 
- ); - throw new ServiceValidationError("Snapshot changed", 409); - } - - const environment = await this.#getAuthenticatedEnvironmentFromRun(runId, prisma); - if (!environment) { - throw new ServiceValidationError("Environment not found", 404); - } - - const taskRun = await prisma.taskRun.findFirst({ - where: { - id: runId, - }, - include: { - tags: true, - lockedBy: { - include: { - worker: { - select: { - id: true, - version: true, - sdkVersion: true, - cliVersion: true, - supportsLazyAttempts: true, - }, - }, - }, - }, - batchItems: { - include: { - batchTaskRun: true, - }, - }, - }, - }); - - this.logger.debug("Creating a task run attempt", { taskRun }); - - if (!taskRun) { - throw new ServiceValidationError("Task run not found", 404); - } - - span.setAttribute("projectId", taskRun.projectId); - span.setAttribute("environmentId", taskRun.runtimeEnvironmentId); - span.setAttribute("taskRunId", taskRun.id); - span.setAttribute("taskRunFriendlyId", taskRun.friendlyId); - - if (taskRun.status === "CANCELED") { - throw new ServiceValidationError("Task run is cancelled", 400); - } - - if (!taskRun.lockedBy) { - throw new ServiceValidationError("Task run is not locked", 400); - } - - const queue = await prisma.taskQueue.findUnique({ - where: { - runtimeEnvironmentId_name: { - runtimeEnvironmentId: environment.id, - name: taskRun.queue, - }, - }, - }); - - if (!queue) { - throw new ServiceValidationError("Queue not found", 404); - } - - //increment the attempt number (start at 1) - const nextAttemptNumber = (taskRun.attemptNumber ?? 
0) + 1; - - if (nextAttemptNumber > MAX_TASK_RUN_ATTEMPTS) { - await this.#attemptFailed({ - runId: taskRun.id, - snapshotId, - completion: { - ok: false, - id: taskRun.id, - error: { - type: "INTERNAL_ERROR", - code: "TASK_RUN_CRASHED", - message: "Max attempts reached.", - }, - }, - tx: prisma, - }); - throw new ServiceValidationError("Max attempts reached", 400); - } - - this.eventBus.emit("runAttemptStarted", { - time: new Date(), - run: { - id: taskRun.id, - attemptNumber: nextAttemptNumber, - baseCostInCents: taskRun.baseCostInCents, - }, - organization: { - id: environment.organization.id, - }, - }); - - const result = await $transaction( - prisma, - async (tx) => { - const run = await tx.taskRun.update({ - where: { - id: taskRun.id, - }, - data: { - status: "EXECUTING", - attemptNumber: nextAttemptNumber, - executedAt: taskRun.attemptNumber === null ? new Date() : undefined, - }, - include: { - tags: true, - lockedBy: { - include: { worker: true }, - }, - }, - }); - - const newSnapshot = await this.#createExecutionSnapshot(tx, { - run, - snapshot: { - executionStatus: "EXECUTING", - description: `Attempt created, starting execution${ - isWarmStart ? 
" (warm start)" : "" - }`, - }, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - workerId, - runnerId, - }); - - if (taskRun.ttl) { - //don't expire the run, it's going to execute - await this.worker.ack(`expireRun:${taskRun.id}`); - } - - return { run, snapshot: newSnapshot }; - }, - (error) => { - this.logger.error("RunEngine.createRunAttempt(): prisma.$transaction error", { - code: error.code, - meta: error.meta, - stack: error.stack, - message: error.message, - name: error.name, - }); - throw new ServiceValidationError( - "Failed to update task run and execution snapshot", - 500 - ); - } - ); - - if (!result) { - this.logger.error("RunEngine.createRunAttempt(): failed to create task run attempt", { - runId: taskRun.id, - nextAttemptNumber, - }); - throw new ServiceValidationError("Failed to create task run attempt", 500); - } - - const { run, snapshot } = result; - - const machinePreset = getMachinePreset({ - machines: this.options.machines.machines, - defaultMachine: this.options.machines.defaultMachine, - config: taskRun.lockedBy.machineConfig ?? {}, - run: taskRun, - }); - - const metadata = await parsePacket({ - data: taskRun.metadata ?? undefined, - dataType: taskRun.metadataType, - }); - - const execution: TaskRunExecution = { - task: { - id: run.lockedBy!.slug, - filePath: run.lockedBy!.filePath, - exportName: run.lockedBy!.exportName, - }, - attempt: { - number: nextAttemptNumber, - startedAt: latestSnapshot.updatedAt, - /** @deprecated */ - id: "deprecated", - /** @deprecated */ - backgroundWorkerId: "deprecated", - /** @deprecated */ - backgroundWorkerTaskId: "deprecated", - /** @deprecated */ - status: "deprecated", - }, - run: { - id: run.friendlyId, - payload: run.payload, - payloadType: run.payloadType, - createdAt: run.createdAt, - tags: run.tags.map((tag) => tag.name), - isTest: run.isTest, - idempotencyKey: run.idempotencyKey ?? undefined, - startedAt: run.startedAt ?? 
run.createdAt, - maxAttempts: run.maxAttempts ?? undefined, - version: run.lockedBy!.worker.version, - metadata, - maxDuration: run.maxDurationInSeconds ?? undefined, - /** @deprecated */ - context: undefined, - /** @deprecated */ - durationMs: run.usageDurationMs, - /** @deprecated */ - costInCents: run.costInCents, - /** @deprecated */ - baseCostInCents: run.baseCostInCents, - traceContext: run.traceContext as Record, - priority: run.priorityMs === 0 ? undefined : run.priorityMs / 1_000, - }, - queue: { - id: queue.friendlyId, - name: queue.name, - }, - environment: { - id: environment.id, - slug: environment.slug, - type: environment.type, - }, - organization: { - id: environment.organization.id, - slug: environment.organization.slug, - name: environment.organization.title, - }, - project: { - id: environment.project.id, - ref: environment.project.externalRef, - slug: environment.project.slug, - name: environment.project.name, - }, - batch: - taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun - ? 
{ id: taskRun.batchItems[0].batchTaskRun.friendlyId } - : undefined, - machine: machinePreset, - }; - - return { run, snapshot, execution }; - }); - }); - } - - /** How a run is completed */ - async completeRunAttempt({ - runId, - snapshotId, - completion, - workerId, - runnerId, - }: { - runId: string; - snapshotId: string; - completion: TaskRunExecutionResult; - workerId?: string; - runnerId?: string; - }): Promise { - if (completion.metadata) { - this.eventBus.emit("runMetadataUpdated", { - time: new Date(), - run: { - id: runId, - metadata: completion.metadata, - }, - }); - } - - switch (completion.ok) { - case true: { - return this.#attemptSucceeded({ - runId, - snapshotId, - completion, - tx: this.prisma, - workerId, - runnerId, - }); - } - case false: { - return this.#attemptFailed({ - runId, - snapshotId, - completion, - tx: this.prisma, - workerId, - runnerId, - }); - } - } - } + /** How a run is completed */ + async completeRunAttempt({ + runId, + snapshotId, + completion, + workerId, + runnerId, + }: { + runId: string; + snapshotId: string; + completion: TaskRunExecutionResult; + workerId?: string; + runnerId?: string; + }): Promise { + return this.runAttemptSystem.completeRunAttempt({ + runId, + snapshotId, + completion, + workerId, + runnerId, + }); + } /** Call this to cancel a run. @@ -1436,138 +772,14 @@ export class RunEngine { finalizeRun?: boolean; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? this.prisma; - reason = reason ?? 
"Cancelled by user"; - - return this.#trace("cancelRun", { runId }, async (span) => { - return this.runLock.lock([runId], 5_000, async (signal) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - //already finished, do nothing - if (latestSnapshot.executionStatus === "FINISHED") { - return executionResultFromSnapshot(latestSnapshot); - } - - //is pending cancellation and we're not finalizing, alert the worker again - if (latestSnapshot.executionStatus === "PENDING_CANCEL" && !finalizeRun) { - await this.#sendNotificationToWorker({ runId, snapshot: latestSnapshot }); - return executionResultFromSnapshot(latestSnapshot); - } - - //set the run to cancelled immediately - const error: TaskRunError = { - type: "STRING_ERROR", - raw: reason, - }; - - const run = await prisma.taskRun.update({ - where: { id: runId }, - data: { - status: "CANCELED", - completedAt: finalizeRun ? completedAt ?? new Date() : completedAt, - error, - }, - select: { - id: true, - friendlyId: true, - status: true, - attemptNumber: true, - spanId: true, - batchId: true, - createdAt: true, - completedAt: true, - taskEventStore: true, - runtimeEnvironment: { - select: { - organizationId: true, - }, - }, - associatedWaitpoint: { - select: { - id: true, - }, - }, - childRuns: { - select: { - id: true, - }, - }, - }, - }); - - //remove it from the queue and release concurrency - await this.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); - - //if executing, we need to message the worker to cancel the run and put it into `PENDING_CANCEL` status - if (isExecuting(latestSnapshot.executionStatus)) { - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "PENDING_CANCEL", - description: "Run was cancelled", - }, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - workerId, - runnerId, - }); - - //the worker needs to be notified so it can kill the run and 
complete the attempt - await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot }); - return executionResultFromSnapshot(newSnapshot); - } - - //not executing, so we will actually finish the run - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "FINISHED", - description: "Run was cancelled, not finished", - }, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - workerId, - runnerId, - }); - - if (!run.associatedWaitpoint) { - throw new ServiceValidationError("No associated waitpoint found", 400); - } - - //complete the waitpoint so the parent run can continue - await this.completeWaitpoint({ - id: run.associatedWaitpoint.id, - output: { value: JSON.stringify(error), isError: true }, - }); - - this.eventBus.emit("runCancelled", { - time: new Date(), - run: { - id: run.id, - friendlyId: run.friendlyId, - spanId: run.spanId, - taskEventStore: run.taskEventStore, - createdAt: run.createdAt, - completedAt: run.completedAt, - error, - }, - }); - - //schedule the cancellation of all the child runs - //it will call this function for each child, - //which will recursively cancel all children if they need to be - if (run.childRuns.length > 0) { - for (const childRun of run.childRuns) { - await this.worker.enqueue({ - id: `cancelRun:${childRun.id}`, - job: "cancelRun", - payload: { runId: childRun.id, completedAt: run.completedAt ?? 
new Date(), reason }, - }); - } - } - - return executionResultFromSnapshot(newSnapshot); - }); + return this.runAttemptSystem.cancelRun({ + runId, + workerId, + runnerId, + completedAt, + reason, + finalizeRun, + tx, }); } @@ -1576,17 +788,15 @@ export class RunEngine { }: { backgroundWorkerId: string; }): Promise { - //we want this to happen in the background - await this.worker.enqueue({ - job: "queueRunsWaitingForWorker", - payload: { backgroundWorkerId }, + return this.waitingForWorkerSystem.enqueueRunsWaitingForWorker({ + backgroundWorkerId, }); } /** * Reschedules a delayed run where the run hasn't been queued yet */ - async rescheduleRun({ + async rescheduleDelayedRun({ runId, delayUntil, tx, @@ -1595,56 +805,10 @@ export class RunEngine { delayUntil: Date; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? this.prisma; - return this.#trace("rescheduleRun", { runId }, async (span) => { - return await this.runLock.lock([runId], 5_000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - //if the run isn't just created then we can't reschedule it - if (snapshot.executionStatus !== "RUN_CREATED") { - throw new ServiceValidationError("Cannot reschedule a run that is not delayed"); - } - - const updatedRun = await prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - delayUntil: delayUntil, - executionSnapshots: { - create: { - engine: "V2", - executionStatus: "RUN_CREATED", - description: "Delayed run was rescheduled to a future date", - runStatus: "EXPIRED", - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - }, - }, - }, - include: { - blockedByWaitpoints: true, - }, - }); - - if (updatedRun.blockedByWaitpoints.length === 0) { - throw new ServiceValidationError( - "Cannot reschedule a run that is not blocked by a waitpoint" - ); - } - - const result = await this.#rescheduleDateTimeWaitpoint( - prisma, - updatedRun.blockedByWaitpoints[0].waitpointId, - 
delayUntil - ); - - if (!result.success) { - throw new ServiceValidationError("Failed to reschedule waitpoint, too late.", 400); - } - - return updatedRun; - }); + return this.delayedRunSystem.rescheduleDelayedRun({ + runId, + delayUntil, + tx, }); } @@ -1689,70 +853,14 @@ export class RunEngine { idempotencyKeyExpiresAt?: Date; tx?: PrismaClientOrTransaction; }) { - const prisma = tx ?? this.prisma; - - const existingWaitpoint = idempotencyKey - ? await prisma.waitpoint.findUnique({ - where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey, - }, - }, - }) - : undefined; - - if (existingWaitpoint) { - if ( - existingWaitpoint.idempotencyKeyExpiresAt && - new Date() > existingWaitpoint.idempotencyKeyExpiresAt - ) { - //the idempotency key has expired - //remove the waitpoint idempotencyKey - await prisma.waitpoint.update({ - where: { - id: existingWaitpoint.id, - }, - data: { - idempotencyKey: nanoid(24), - inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, - }, - }); - - //let it fall through to create a new waitpoint - } else { - return { waitpoint: existingWaitpoint, isCached: true }; - } - } - - const waitpoint = await prisma.waitpoint.upsert({ - where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey: idempotencyKey ?? nanoid(24), - }, - }, - create: { - ...WaitpointId.generate(), - type: "DATETIME", - idempotencyKey: idempotencyKey ?? 
nanoid(24), - idempotencyKeyExpiresAt, - userProvidedIdempotencyKey: !!idempotencyKey, - environmentId, - projectId, - completedAfter, - }, - update: {}, - }); - - await this.worker.enqueue({ - id: `finishWaitpoint.${waitpoint.id}`, - job: "finishWaitpoint", - payload: { waitpointId: waitpoint.id }, - availableAt: completedAfter, + return this.waitpointSystem.createDateTimeWaitpoint({ + projectId, + environmentId, + completedAfter, + idempotencyKey, + idempotencyKeyExpiresAt, + tx, }); - - return { waitpoint, isCached: false }; } /** This creates a MANUAL waitpoint, that can be explicitly completed (or failed). @@ -1771,74 +879,13 @@ export class RunEngine { idempotencyKeyExpiresAt?: Date; timeout?: Date; }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> { - const existingWaitpoint = idempotencyKey - ? await this.prisma.waitpoint.findUnique({ - where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey, - }, - }, - }) - : undefined; - - if (existingWaitpoint) { - if ( - existingWaitpoint.idempotencyKeyExpiresAt && - new Date() > existingWaitpoint.idempotencyKeyExpiresAt - ) { - //the idempotency key has expired - //remove the waitpoint idempotencyKey - await this.prisma.waitpoint.update({ - where: { - id: existingWaitpoint.id, - }, - data: { - idempotencyKey: nanoid(24), - inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, - }, - }); - - //let it fall through to create a new waitpoint - } else { - return { waitpoint: existingWaitpoint, isCached: true }; - } - } - - const waitpoint = await this.prisma.waitpoint.upsert({ - where: { - environmentId_idempotencyKey: { - environmentId, - idempotencyKey: idempotencyKey ?? nanoid(24), - }, - }, - create: { - ...WaitpointId.generate(), - type: "MANUAL", - idempotencyKey: idempotencyKey ?? 
nanoid(24), - idempotencyKeyExpiresAt, - userProvidedIdempotencyKey: !!idempotencyKey, - environmentId, - projectId, - completedAfter: timeout, - }, - update: {}, + return this.waitpointSystem.createManualWaitpoint({ + environmentId, + projectId, + idempotencyKey, + idempotencyKeyExpiresAt, + timeout, }); - - //schedule the timeout - if (timeout) { - await this.worker.enqueue({ - id: `finishWaitpoint.${waitpoint.id}`, - job: "finishWaitpoint", - payload: { - waitpointId: waitpoint.id, - error: JSON.stringify(timeoutError(timeout)), - }, - availableAt: timeout, - }); - } - - return { waitpoint, isCached: false }; } /** This block a run with a BATCH waitpoint. @@ -1877,7 +924,6 @@ export class RunEngine { await this.blockRunWithWaitpoint({ runId, waitpoints: waitpoint.id, - environmentId, projectId, organizationId, batch: { id: batchId }, @@ -1905,8 +951,6 @@ export class RunEngine { async unblockRunForCreatedBatch({ runId, batchId, - environmentId, - projectId, tx, }: { runId: string; @@ -1938,20 +982,12 @@ export class RunEngine { } async tryCompleteBatch({ batchId }: { batchId: string }): Promise { - await this.worker.enqueue({ - //this will debounce the call - id: `tryCompleteBatch:${batchId}`, - job: "tryCompleteBatch", - payload: { batchId: batchId }, - //2s in the future - availableAt: new Date(Date.now() + 2_000), - }); + return this.batchSystem.scheduleCompleteBatch({ batchId }); } async getWaitpoint({ waitpointId, environmentId, - projectId, }: { environmentId: string; projectId: string; @@ -1986,7 +1022,6 @@ export class RunEngine { runId, waitpoints, projectId, - organizationId, releaseConcurrency, timeout, spanIdToComplete, @@ -1997,12 +1032,9 @@ export class RunEngine { }: { runId: string; waitpoints: string | string[]; - environmentId: string; projectId: string; organizationId: string; - releaseConcurrency?: { - releaseQueue: boolean; - }; + releaseConcurrency?: boolean; timeout?: Date; spanIdToComplete?: string; batch?: { id: string; index?: number }; 
@@ -2010,1683 +1042,184 @@ export class RunEngine { runnerId?: string; tx?: PrismaClientOrTransaction; }): Promise { - const prisma = tx ?? this.prisma; + return this.waitpointSystem.blockRunWithWaitpoint({ + runId, + waitpoints, + projectId, + releaseConcurrency, + timeout, + spanIdToComplete, + batch, + workerId, + runnerId, + tx, + }); + } - let $waitpoints = typeof waitpoints === "string" ? [waitpoints] : waitpoints; - - return await this.runLock.lock([runId], 5000, async (signal) => { - let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - //block the run with the waitpoints, returning how many waitpoints are pending - const insert = await prisma.$queryRaw<{ pending_count: BigInt }[]>` - WITH inserted AS ( - INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") - SELECT - gen_random_uuid(), - ${runId}, - w.id, - ${projectId}, - NOW(), - NOW(), - ${spanIdToComplete ?? null}, - ${batch?.id ?? null}, - ${batch?.index ?? null} - FROM "Waitpoint" w - WHERE w.id IN (${Prisma.join($waitpoints)}) - ON CONFLICT DO NOTHING - RETURNING "waitpointId" - ) - SELECT COUNT(*) as pending_count - FROM inserted i - JOIN "Waitpoint" w ON w.id = i."waitpointId" - WHERE w.status = 'PENDING';`; - - const pendingCount = Number(insert.at(0)?.pending_count ?? 
0); - - let newStatus: TaskRunExecutionStatus = "SUSPENDED"; - if ( - snapshot.executionStatus === "EXECUTING" || - snapshot.executionStatus === "EXECUTING_WITH_WAITPOINTS" - ) { - newStatus = "EXECUTING_WITH_WAITPOINTS"; - } - - //if the state has changed, create a new snapshot - if (newStatus !== snapshot.executionStatus) { - snapshot = await this.#createExecutionSnapshot(prisma, { - run: { - id: snapshot.runId, - status: snapshot.runStatus, - attemptNumber: snapshot.attemptNumber, - }, - snapshot: { - executionStatus: newStatus, - description: "Run was blocked by a waitpoint.", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - batchId: batch?.id ?? snapshot.batchId ?? undefined, - workerId, - runnerId, - }); - - // Let the worker know immediately, so it can suspend the run - await this.#sendNotificationToWorker({ runId, snapshot }); - } - - if (timeout) { - for (const waitpoint of $waitpoints) { - await this.worker.enqueue({ - id: `finishWaitpoint.${waitpoint}`, - job: "finishWaitpoint", - payload: { - waitpointId: waitpoint, - error: JSON.stringify(timeoutError(timeout)), - }, - availableAt: timeout, - }); - } - } - - //no pending waitpoint, schedule unblocking the run - //debounce if we're rapidly adding waitpoints - if (pendingCount === 0) { - await this.worker.enqueue({ - //this will debounce the call - id: `continueRunIfUnblocked:${runId}`, - job: "continueRunIfUnblocked", - payload: { runId: runId }, - //in the near future - availableAt: new Date(Date.now() + 50), - }); - } else { - if (releaseConcurrency) { - //release concurrency - await this.runQueue.releaseConcurrency( - organizationId, - runId, - releaseConcurrency.releaseQueue === true - ); - } - } - - return snapshot; - }); - } - - /** This completes a waitpoint and updates all entries so the run isn't blocked, - * if they're no longer blocked. This doesn't suffer from race conditions. 
*/ - async completeWaitpoint({ - id, - output, - }: { - id: string; - output?: { - value: string; - type?: string; - isError: boolean; - }; - }): Promise { - const result = await $transaction( - this.prisma, - async (tx) => { - // 1. Find the TaskRuns blocked by this waitpoint - const affectedTaskRuns = await tx.taskRunWaitpoint.findMany({ - where: { waitpointId: id }, - select: { taskRunId: true, spanIdToComplete: true, createdAt: true }, - }); - - if (affectedTaskRuns.length === 0) { - this.logger.warn(`completeWaitpoint: No TaskRunWaitpoints found for waitpoint`, { - waitpointId: id, - }); - } - - // 2. Update the waitpoint to completed (only if it's pending) - let waitpoint: Waitpoint | null = null; - try { - waitpoint = await tx.waitpoint.update({ - where: { id, status: "PENDING" }, - data: { - status: "COMPLETED", - completedAt: new Date(), - output: output?.value, - outputType: output?.type, - outputIsError: output?.isError, - }, - }); - } catch (error) { - if (error instanceof Prisma.PrismaClientKnownRequestError && error.code === "P2025") { - waitpoint = await tx.waitpoint.findFirst({ - where: { id }, - }); - } else { - this.logger.log("completeWaitpoint: error updating waitpoint:", { error }); - throw error; - } - } - - return { waitpoint, affectedTaskRuns }; - }, - (error) => { - this.logger.error(`completeWaitpoint: Error completing waitpoint ${id}, retrying`, { - error, - }); - throw error; - } - ); - - if (!result) { - throw new Error(`Waitpoint couldn't be updated`); - } - - if (!result.waitpoint) { - throw new Error(`Waitpoint ${id} not found`); - } - - //schedule trying to continue the runs - for (const run of result.affectedTaskRuns) { - await this.worker.enqueue({ - //this will debounce the call - id: `continueRunIfUnblocked:${run.taskRunId}`, - job: "continueRunIfUnblocked", - payload: { runId: run.taskRunId }, - //50ms in the future - availableAt: new Date(Date.now() + 50), - }); - - // emit an event to complete associated cached runs - if 
(run.spanIdToComplete) { - this.eventBus.emit("cachedRunCompleted", { - time: new Date(), - span: { - id: run.spanIdToComplete, - createdAt: run.createdAt, - }, - blockedRunId: run.taskRunId, - hasError: output?.isError ?? false, - }); - } - } - - return result.waitpoint; - } - - async createCheckpoint({ - runId, - snapshotId, - checkpoint, - workerId, - runnerId, - tx, - }: { - runId: string; - snapshotId: string; - checkpoint: CheckpointInput; - workerId?: string; - runnerId?: string; - tx?: PrismaClientOrTransaction; - }): Promise { - const prisma = tx ?? this.prisma; - - return await this.runLock.lock([runId], 5_000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - if (snapshot.id !== snapshotId) { - this.eventBus.emit("incomingCheckpointDiscarded", { - time: new Date(), - run: { - id: runId, - }, - checkpoint: { - discardReason: "Not the latest snapshot", - metadata: checkpoint, - }, - snapshot: { - id: snapshot.id, - executionStatus: snapshot.executionStatus, - }, - }); - - return { - ok: false as const, - error: "Not the latest snapshot", - }; - } - - if (!isCheckpointable(snapshot.executionStatus)) { - this.logger.error("Tried to createCheckpoint on a run in an invalid state", { - snapshot, - }); - - this.eventBus.emit("incomingCheckpointDiscarded", { - time: new Date(), - run: { - id: runId, - }, - checkpoint: { - discardReason: `Status ${snapshot.executionStatus} is not checkpointable`, - metadata: checkpoint, - }, - snapshot: { - id: snapshot.id, - executionStatus: snapshot.executionStatus, - }, - }); - - return { - ok: false as const, - error: `Status ${snapshot.executionStatus} is not checkpointable`, - }; - } - - // Get the run and update the status - const run = await this.prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - status: "WAITING_TO_RESUME", - }, - select: { - id: true, - status: true, - attemptNumber: true, - runtimeEnvironment: { - select: { - id: true, - projectId: true, - }, - }, - 
}, - }); - - if (!run) { - this.logger.error("Run not found for createCheckpoint", { - snapshot, - }); - - throw new ServiceValidationError("Run not found", 404); - } - - // Create the checkpoint - const taskRunCheckpoint = await prisma.taskRunCheckpoint.create({ - data: { - ...CheckpointId.generate(), - type: checkpoint.type, - location: checkpoint.location, - imageRef: checkpoint.imageRef, - reason: checkpoint.reason, - runtimeEnvironmentId: run.runtimeEnvironment.id, - projectId: run.runtimeEnvironment.projectId, - }, - }); - - //create a new execution snapshot, with the checkpoint - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "SUSPENDED", - description: "Run was suspended after creating a checkpoint.", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - checkpointId: taskRunCheckpoint.id, - workerId, - runnerId, - }); - - return { - ok: true as const, - ...executionResultFromSnapshot(newSnapshot), - checkpoint: taskRunCheckpoint, - } satisfies CreateCheckpointResult; - }); - } - - async continueRunExecution({ - runId, - snapshotId, - workerId, - runnerId, - tx, - }: { - runId: string; - snapshotId: string; - workerId?: string; - runnerId?: string; - tx?: PrismaClientOrTransaction; - }): Promise { - const prisma = tx ?? 
this.prisma; - - return await this.runLock.lock([runId], 5_000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - if (snapshot.id !== snapshotId) { - throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); - } - - if (!isPendingExecuting(snapshot.executionStatus)) { - throw new ServiceValidationError("Snapshot is not in a valid state to continue", 400); - } - - // Get the run and update the status - const run = await this.prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - status: "EXECUTING", - }, - select: { - id: true, - status: true, - attemptNumber: true, - }, - }); - - if (!run) { - this.logger.error("Run not found for createCheckpoint", { - snapshot, - }); - - throw new ServiceValidationError("Run not found", 404); - } - - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "EXECUTING", - description: "Run was continued after being suspended", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - completedWaitpoints: snapshot.completedWaitpoints, - workerId, - runnerId, - }); - - // Let worker know about the new snapshot so it can continue the run - await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot }); - - return { - ...executionResultFromSnapshot(newSnapshot), - } satisfies ExecutionResult; - }); - } - - /** - Send a heartbeat to signal the the run is still executing. - If a heartbeat isn't received, after a while the run is considered "stalled" - and some logic will be run to try recover it. - @returns The ExecutionResult, which could be a different snapshot. - */ - async heartbeatRun({ - runId, - snapshotId, - workerId, - runnerId, - tx, - }: { - runId: string; - snapshotId: string; - workerId?: string; - runnerId?: string; - tx?: PrismaClientOrTransaction; - }): Promise { - const prisma = tx ?? 
this.prisma; - - //we don't need to acquire a run lock for any of this, it's not critical if it happens on an older version - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); - if (latestSnapshot.id !== snapshotId) { - this.logger.log("heartbeatRun: no longer the latest snapshot, stopping the heartbeat.", { - runId, - snapshotId, - latestSnapshot, - workerId, - runnerId, - }); - - await this.worker.ack(`heartbeatSnapshot.${runId}`); - return executionResultFromSnapshot(latestSnapshot); - } - - if (latestSnapshot.workerId !== workerId) { - this.logger.debug("heartbeatRun: worker ID does not match the latest snapshot", { - runId, - snapshotId, - latestSnapshot, - workerId, - runnerId, - }); - } - - //update the snapshot heartbeat time - await prisma.taskRunExecutionSnapshot.update({ - where: { id: latestSnapshot.id }, - data: { - lastHeartbeatAt: new Date(), - }, - }); - - //extending the heartbeat - const intervalMs = this.#getHeartbeatIntervalMs(latestSnapshot.executionStatus); - if (intervalMs !== null) { - await this.worker.reschedule(`heartbeatSnapshot.${runId}`, new Date(Date.now() + intervalMs)); - } - - return executionResultFromSnapshot(latestSnapshot); - } - - /** Get required data to execute the run */ - async getRunExecutionData({ - runId, - tx, - }: { - runId: string; - tx?: PrismaClientOrTransaction; - }): Promise { - const prisma = tx ?? this.prisma; - try { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - const executionData: RunExecutionData = { - version: "1" as const, - snapshot: { - id: snapshot.id, - friendlyId: snapshot.friendlyId, - executionStatus: snapshot.executionStatus, - description: snapshot.description, - }, - run: { - id: snapshot.runId, - friendlyId: snapshot.runFriendlyId, - status: snapshot.runStatus, - attemptNumber: snapshot.attemptNumber ?? undefined, - }, - batch: snapshot.batchId - ? 
{ - id: snapshot.batchId, - friendlyId: BatchId.toFriendlyId(snapshot.batchId), - } - : undefined, - checkpoint: snapshot.checkpoint - ? { - id: snapshot.checkpoint.id, - friendlyId: snapshot.checkpoint.friendlyId, - type: snapshot.checkpoint.type, - location: snapshot.checkpoint.location, - imageRef: snapshot.checkpoint.imageRef, - reason: snapshot.checkpoint.reason ?? undefined, - } - : undefined, - completedWaitpoints: snapshot.completedWaitpoints, - }; - - return executionData; - } catch (e) { - this.logger.error("Failed to getRunExecutionData", { - message: e instanceof Error ? e.message : e, - }); - return null; - } - } - - async quit() { - try { - //stop the run queue - await this.runQueue.quit(); - await this.worker.stop(); - await this.runLock.quit(); - - // This is just a failsafe - await this.runLockRedis.quit(); - } catch (error) { - // And should always throw - } - } - - async #systemFailure({ - runId, - error, - tx, - }: { - runId: string; - error: TaskRunInternalError; - tx?: PrismaClientOrTransaction; - }): Promise { - const prisma = tx ?? this.prisma; - return this.#trace("#systemFailure", { runId }, async (span) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - //already finished - if (latestSnapshot.executionStatus === "FINISHED") { - //todo check run is in the correct state - return { - attemptStatus: "RUN_FINISHED", - snapshot: latestSnapshot, - run: { - id: runId, - friendlyId: latestSnapshot.runFriendlyId, - status: latestSnapshot.runStatus, - attemptNumber: latestSnapshot.attemptNumber, - }, - }; - } - - const result = await this.#attemptFailed({ - runId, - snapshotId: latestSnapshot.id, - completion: { - ok: false, - id: runId, - error, - }, - tx: prisma, - }); - - return result; - }); - } - - async #expireRun({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) { - const prisma = tx ?? 
this.prisma; - await this.runLock.lock([runId], 5_000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - //if we're executing then we won't expire the run - if (isExecuting(snapshot.executionStatus)) { - return; - } - - //only expire "PENDING" runs - const run = await prisma.taskRun.findUnique({ where: { id: runId } }); - - if (!run) { - this.logger.debug("Could not find enqueued run to expire", { - runId, - }); - return; - } - - if (run.status !== "PENDING") { - this.logger.debug("Run cannot be expired because it's not in PENDING status", { - run, - }); - return; - } - - if (run.lockedAt) { - this.logger.debug("Run cannot be expired because it's locked, so will run", { - run, - }); - return; - } - - const error: TaskRunError = { - type: "STRING_ERROR", - raw: `Run expired because the TTL (${run.ttl}) was reached`, - }; - - const updatedRun = await prisma.taskRun.update({ - where: { id: runId }, - data: { - status: "EXPIRED", - completedAt: new Date(), - expiredAt: new Date(), - error, - executionSnapshots: { - create: { - engine: "V2", - executionStatus: "FINISHED", - description: "Run was expired because the TTL was reached", - runStatus: "EXPIRED", - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - }, - }, - }, - select: { - id: true, - spanId: true, - ttl: true, - associatedWaitpoint: { - select: { - id: true, - }, - }, - runtimeEnvironment: { - select: { - organizationId: true, - }, - }, - createdAt: true, - completedAt: true, - taskEventStore: true, - }, - }); - - await this.runQueue.acknowledgeMessage(updatedRun.runtimeEnvironment.organizationId, runId); - - if (!updatedRun.associatedWaitpoint) { - throw new ServiceValidationError("No associated waitpoint found", 400); - } - - await this.completeWaitpoint({ - id: updatedRun.associatedWaitpoint.id, - output: { value: JSON.stringify(error), isError: true }, - }); - - this.eventBus.emit("runExpired", { run: updatedRun, time: new 
Date() }); - }); - } - - async #waitingForDeploy({ - orgId, - runId, - workerId, - runnerId, - reason, - tx, - }: { - orgId: string; - runId: string; - workerId?: string; - runnerId?: string; - reason?: string; - tx?: PrismaClientOrTransaction; - }) { - const prisma = tx ?? this.prisma; - - return this.#trace("#waitingForDeploy", { runId }, async (span) => { - return this.runLock.lock([runId], 5_000, async (signal) => { - //mark run as waiting for deploy - const run = await prisma.taskRun.update({ - where: { id: runId }, - data: { - status: "WAITING_FOR_DEPLOY", - }, - select: { - id: true, - status: true, - attemptNumber: true, - runtimeEnvironment: { - select: { id: true, type: true }, - }, - }, - }); - - await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "RUN_CREATED", - description: - reason ?? - "The run doesn't have a background worker, so we're going to ack it for now.", - }, - environmentId: run.runtimeEnvironment.id, - environmentType: run.runtimeEnvironment.type, - workerId, - runnerId, - }); - - //we ack because when it's deployed it will be requeued - await this.runQueue.acknowledgeMessage(orgId, runId); - }); - }); - } - - async #attemptSucceeded({ - runId, - snapshotId, - completion, - tx, - workerId, - runnerId, - }: { - runId: string; - snapshotId: string; - completion: TaskRunSuccessfulExecutionResult; - tx: PrismaClientOrTransaction; - workerId?: string; - runnerId?: string; - }): Promise { - const prisma = tx ?? 
this.prisma; - return this.#trace("#completeRunAttemptSuccess", { runId, snapshotId }, async (span) => { - return this.runLock.lock([runId], 5_000, async (signal) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - if (latestSnapshot.id !== snapshotId) { - throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); - } - - span.setAttribute("completionStatus", completion.ok); - - const completedAt = new Date(); - - const run = await prisma.taskRun.update({ - where: { id: runId }, - data: { - status: "COMPLETED_SUCCESSFULLY", - completedAt, - output: completion.output, - outputType: completion.outputType, - executionSnapshots: { - create: { - executionStatus: "FINISHED", - description: "Task completed successfully", - runStatus: "COMPLETED_SUCCESSFULLY", - attemptNumber: latestSnapshot.attemptNumber, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - workerId, - runnerId, - }, - }, - }, - select: { - id: true, - friendlyId: true, - status: true, - attemptNumber: true, - spanId: true, - associatedWaitpoint: { - select: { - id: true, - }, - }, - project: { - select: { - organizationId: true, - }, - }, - batchId: true, - createdAt: true, - completedAt: true, - taskEventStore: true, - }, - }); - const newSnapshot = await getLatestExecutionSnapshot(prisma, runId); - await this.runQueue.acknowledgeMessage(run.project.organizationId, runId); - - // We need to manually emit this as we created the final snapshot as part of the task run update - this.eventBus.emit("executionSnapshotCreated", { - time: newSnapshot.createdAt, - run: { - id: newSnapshot.runId, - }, - snapshot: { - ...newSnapshot, - completedWaitpointIds: newSnapshot.completedWaitpoints.map((wp) => wp.id), - }, - }); - - if (!run.associatedWaitpoint) { - throw new ServiceValidationError("No associated waitpoint found", 400); - } - - await this.completeWaitpoint({ - id: run.associatedWaitpoint.id, - output: 
completion.output - ? { value: completion.output, type: completion.outputType, isError: false } - : undefined, - }); - - this.eventBus.emit("runSucceeded", { - time: completedAt, - run: { - id: runId, - spanId: run.spanId, - output: completion.output, - outputType: completion.outputType, - createdAt: run.createdAt, - completedAt: run.completedAt, - taskEventStore: run.taskEventStore, - }, - }); - - await this.#finalizeRun(run); - - return { - attemptStatus: "RUN_FINISHED", - snapshot: newSnapshot, - run, - }; - }); - }); - } - - async #attemptFailed({ - runId, - snapshotId, - workerId, - runnerId, - completion, - forceRequeue, - tx, - }: { - runId: string; - snapshotId: string; - workerId?: string; - runnerId?: string; - completion: TaskRunFailedExecutionResult; - forceRequeue?: boolean; - tx: PrismaClientOrTransaction; - }): Promise { - const prisma = this.prisma; - - return this.#trace("completeRunAttemptFailure", { runId, snapshotId }, async (span) => { - return this.runLock.lock([runId], 5_000, async (signal) => { - const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); - - if (latestSnapshot.id !== snapshotId) { - throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); - } - - span.setAttribute("completionStatus", completion.ok); - - //remove waitpoints blocking the run - const deletedCount = await this.#clearBlockingWaitpoints({ runId, tx }); - if (deletedCount > 0) { - this.logger.debug("Cleared blocking waitpoints", { runId, deletedCount }); - } - - const failedAt = new Date(); - - const retryResult = await retryOutcomeFromCompletion(prisma, { - runId, - error: completion.error, - retryUsingQueue: forceRequeue ?? 
false, - retrySettings: completion.retry, - attemptNumber: latestSnapshot.attemptNumber, - }); - - // Force requeue means it was crashed so the attempt span needs to be closed - if (forceRequeue) { - const minimalRun = await prisma.taskRun.findFirst({ - where: { - id: runId, - }, - select: { - status: true, - spanId: true, - maxAttempts: true, - runtimeEnvironment: { - select: { - organizationId: true, - }, - }, - taskEventStore: true, - createdAt: true, - completedAt: true, - }, - }); - - if (!minimalRun) { - throw new ServiceValidationError("Run not found", 404); - } - - this.eventBus.emit("runAttemptFailed", { - time: failedAt, - run: { - id: runId, - status: minimalRun.status, - spanId: minimalRun.spanId, - error: completion.error, - attemptNumber: latestSnapshot.attemptNumber ?? 0, - createdAt: minimalRun.createdAt, - completedAt: minimalRun.completedAt, - taskEventStore: minimalRun.taskEventStore, - }, - }); - } - - switch (retryResult.outcome) { - case "cancel_run": { - const result = await this.cancelRun({ - runId, - completedAt: failedAt, - reason: retryResult.reason, - finalizeRun: true, - tx: prisma, - }); - return { - attemptStatus: - result.snapshot.executionStatus === "PENDING_CANCEL" - ? "RUN_PENDING_CANCEL" - : "RUN_FINISHED", - ...result, - }; - } - case "fail_run": { - return await this.#permanentlyFailRun({ - runId, - snapshotId, - failedAt, - error: retryResult.sanitizedError, - workerId, - runnerId, - }); - } - case "retry": { - const retryAt = new Date(retryResult.settings.timestamp); - - const run = await prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - status: "RETRYING_AFTER_FAILURE", - machinePreset: retryResult.machine, - }, - include: { - runtimeEnvironment: { - include: { - project: true, - organization: true, - orgMember: true, - }, - }, - }, - }); - - const nextAttemptNumber = - latestSnapshot.attemptNumber === null ? 
1 : latestSnapshot.attemptNumber + 1; - - if (retryResult.wasOOMError) { - this.eventBus.emit("runAttemptFailed", { - time: failedAt, - run: { - id: runId, - status: run.status, - spanId: run.spanId, - error: completion.error, - attemptNumber: latestSnapshot.attemptNumber ?? 0, - createdAt: run.createdAt, - completedAt: run.completedAt, - taskEventStore: run.taskEventStore, - }, - }); - } - - this.eventBus.emit("runRetryScheduled", { - time: failedAt, - run: { - id: run.id, - friendlyId: run.friendlyId, - attemptNumber: nextAttemptNumber, - queue: run.queue, - taskIdentifier: run.taskIdentifier, - traceContext: run.traceContext as Record, - baseCostInCents: run.baseCostInCents, - spanId: run.spanId, - }, - organization: { - id: run.runtimeEnvironment.organizationId, - }, - environment: run.runtimeEnvironment, - retryAt, - }); - - //if it's a long delay and we support checkpointing, put it back in the queue - if ( - forceRequeue || - retryResult.method === "queue" || - (this.options.retryWarmStartThresholdMs !== undefined && - retryResult.settings.delay >= this.options.retryWarmStartThresholdMs) - ) { - //we nack the message, requeuing it for later - const nackResult = await this.#tryNackAndRequeue({ - run, - environment: run.runtimeEnvironment, - orgId: run.runtimeEnvironment.organizationId, - timestamp: retryAt.getTime(), - error: { - type: "INTERNAL_ERROR", - code: "TASK_RUN_DEQUEUED_MAX_RETRIES", - message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, - }, - tx: prisma, - }); - - if (!nackResult.wasRequeued) { - return { - attemptStatus: "RUN_FINISHED", - ...nackResult, - }; - } else { - return { attemptStatus: "RETRY_QUEUED", ...nackResult }; - } - } - - //it will continue running because the retry delay is short - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "PENDING_EXECUTING", - description: "Attempt failed with a short delay, starting a new attempt", - 
}, - environmentId: latestSnapshot.environmentId, - environmentType: latestSnapshot.environmentType, - workerId, - runnerId, - }); - //the worker can fetch the latest snapshot and should create a new attempt - await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot }); - - return { - attemptStatus: "RETRY_IMMEDIATELY", - ...executionResultFromSnapshot(newSnapshot), - }; - } - } - }); - }); - } - - async #permanentlyFailRun({ - runId, - snapshotId, - failedAt, - error, - workerId, - runnerId, - }: { - runId: string; - snapshotId: string; - failedAt: Date; - error: TaskRunError; - workerId?: string; - runnerId?: string; - }): Promise { - const prisma = this.prisma; - - return this.#trace("permanentlyFailRun", { runId, snapshotId }, async (span) => { - const status = runStatusFromError(error); - - //run permanently failed - const run = await prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - status, - completedAt: failedAt, - error, - }, - select: { - id: true, - friendlyId: true, - status: true, - attemptNumber: true, - spanId: true, - batchId: true, - associatedWaitpoint: { - select: { - id: true, - }, - }, - runtimeEnvironment: { - select: { - id: true, - type: true, - organizationId: true, - }, - }, - taskEventStore: true, - createdAt: true, - completedAt: true, - }, - }); - - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "FINISHED", - description: "Run failed", - }, - environmentId: run.runtimeEnvironment.id, - environmentType: run.runtimeEnvironment.type, - workerId, - runnerId, - }); - - if (!run.associatedWaitpoint) { - throw new ServiceValidationError("No associated waitpoint found", 400); - } - - await this.completeWaitpoint({ - id: run.associatedWaitpoint.id, - output: { value: JSON.stringify(error), isError: true }, - }); - - await this.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); - - this.eventBus.emit("runFailed", { - time: failedAt, - run: 
{ - id: runId, - status: run.status, - spanId: run.spanId, - error, - taskEventStore: run.taskEventStore, - createdAt: run.createdAt, - completedAt: run.completedAt, - }, - }); - - await this.#finalizeRun(run); - - return { - attemptStatus: "RUN_FINISHED", - snapshot: newSnapshot, - run, - }; - }); - } - - //MARK: RunQueue - - /** The run can be added to the queue. When it's pulled from the queue it will be executed. */ - async #enqueueRun({ - run, - env, - timestamp, - tx, - snapshot, - batchId, - checkpointId, - completedWaitpoints, - workerId, - runnerId, - }: { - run: TaskRun; - env: MinimalAuthenticatedEnvironment; - timestamp: number; - tx?: PrismaClientOrTransaction; - snapshot?: { - description?: string; - }; - batchId?: string; - checkpointId?: string; - completedWaitpoints?: { - id: string; - index?: number; - }[]; - workerId?: string; - runnerId?: string; - }) { - const prisma = tx ?? this.prisma; - - await this.runLock.lock([run.id], 5000, async (signal) => { - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run: run, - snapshot: { - executionStatus: "QUEUED", - description: snapshot?.description ?? "Run was QUEUED", - }, - batchId, - environmentId: env.id, - environmentType: env.type, - checkpointId, - completedWaitpoints, - workerId, - runnerId, - }); - - const masterQueues = [run.masterQueue]; - if (run.secondaryMasterQueue) { - masterQueues.push(run.secondaryMasterQueue); - } - - await this.runQueue.enqueueMessage({ - env, - masterQueues, - message: { - runId: run.id, - taskIdentifier: run.taskIdentifier, - orgId: env.organization.id, - projectId: env.project.id, - environmentId: env.id, - environmentType: env.type, - queue: run.queue, - concurrencyKey: run.concurrencyKey ?? 
undefined, - timestamp, - attempt: 0, - }, - }); - }); - } - - async #tryNackAndRequeue({ - run, - environment, - orgId, - timestamp, - error, - workerId, - runnerId, - tx, - }: { - run: TaskRun; - environment: { - id: string; - type: RuntimeEnvironmentType; - }; - orgId: string; - timestamp?: number; - error: TaskRunInternalError; - workerId?: string; - runnerId?: string; - tx?: PrismaClientOrTransaction; - }): Promise<{ wasRequeued: boolean } & ExecutionResult> { - const prisma = tx ?? this.prisma; - - return await this.runLock.lock([run.id], 5000, async (signal) => { - //we nack the message, this allows another work to pick up the run - const gotRequeued = await this.runQueue.nackMessage({ - orgId, - messageId: run.id, - retryAt: timestamp, - }); - - if (!gotRequeued) { - const result = await this.#systemFailure({ - runId: run.id, - error, - tx: prisma, - }); - return { wasRequeued: false, ...result }; - } - - const newSnapshot = await this.#createExecutionSnapshot(prisma, { - run: run, - snapshot: { - executionStatus: "QUEUED", - description: "Requeued the run after a failure", - }, - environmentId: environment.id, - environmentType: environment.type, - workerId, - runnerId, - }); - - return { - wasRequeued: true, - snapshot: { - id: newSnapshot.id, - friendlyId: newSnapshot.friendlyId, - executionStatus: newSnapshot.executionStatus, - description: newSnapshot.description, - }, - run: { - id: newSnapshot.runId, - friendlyId: newSnapshot.runFriendlyId, - status: newSnapshot.runStatus, - attemptNumber: newSnapshot.attemptNumber, - }, - }; - }); - } - - async #continueRunIfUnblocked({ runId }: { runId: string }) { - // 1. Get the any blocking waitpoints - const blockingWaitpoints = await this.prisma.taskRunWaitpoint.findMany({ - where: { taskRunId: runId }, - select: { - batchId: true, - batchIndex: true, - waitpoint: { - select: { id: true, status: true }, - }, - }, - }); - - // 2. 
There are blockers still, so do nothing - if (blockingWaitpoints.some((w) => w.waitpoint.status !== "COMPLETED")) { - return; - } - - // 3. Get the run with environment - const run = await this.prisma.taskRun.findFirst({ - where: { - id: runId, - }, - include: { - runtimeEnvironment: { - select: { - id: true, - type: true, - maximumConcurrencyLimit: true, - project: { select: { id: true } }, - organization: { select: { id: true } }, - }, - }, - }, - }); - - if (!run) { - throw new Error(`#continueRunIfUnblocked: run not found: ${runId}`); - } - - //4. Continue the run whether it's executing or not - await this.runLock.lock([runId], 5000, async (signal) => { - const snapshot = await getLatestExecutionSnapshot(this.prisma, runId); - - //run is still executing, send a message to the worker - if (isExecuting(snapshot.executionStatus)) { - const newSnapshot = await this.#createExecutionSnapshot(this.prisma, { - run: { - id: runId, - status: snapshot.runStatus, - attemptNumber: snapshot.attemptNumber, - }, - snapshot: { - executionStatus: "EXECUTING", - description: "Run was continued, whilst still executing.", - }, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - batchId: snapshot.batchId ?? undefined, - completedWaitpoints: blockingWaitpoints.map((b) => ({ - id: b.waitpoint.id, - index: b.batchIndex ?? 
undefined, - })), - }); - - //we reacquire the concurrency if it's still running because we're not going to be dequeuing (which also does this) - await this.runQueue.reacquireConcurrency(run.runtimeEnvironment.organization.id, runId); - - await this.#sendNotificationToWorker({ runId, snapshot: newSnapshot }); - } else { - if (snapshot.executionStatus !== "RUN_CREATED" && !snapshot.checkpointId) { - // TODO: We're screwed, should probably fail the run immediately - throw new Error(`#continueRunIfUnblocked: run has no checkpoint: ${run.id}`); - } - - //put it back in the queue, with the original timestamp (w/ priority) - //this prioritizes dequeuing waiting runs over new runs - await this.#enqueueRun({ - run, - env: run.runtimeEnvironment, - timestamp: run.createdAt.getTime() - run.priorityMs, - snapshot: { - description: "Run was QUEUED, because all waitpoints are completed", - }, - batchId: snapshot.batchId ?? undefined, - completedWaitpoints: blockingWaitpoints.map((b) => ({ - id: b.waitpoint.id, - index: b.batchIndex ?? undefined, - })), - checkpointId: snapshot.checkpointId ?? undefined, - }); - } - }); - - //5. Remove the blocking waitpoints - await this.prisma.taskRunWaitpoint.deleteMany({ - where: { - taskRunId: runId, - }, - }); - } - - async #queueRunsWaitingForWorker({ backgroundWorkerId }: { backgroundWorkerId: string }) { - //It could be a lot of runs, so we will process them in a batch - //if there are still more to process we will enqueue this function again - const maxCount = this.options.queueRunsWaitingForWorkerBatchSize ?? 
200; - - const backgroundWorker = await this.prisma.backgroundWorker.findFirst({ - where: { - id: backgroundWorkerId, - }, - include: { - runtimeEnvironment: { - include: { - project: true, - organization: true, - }, - }, - tasks: true, - }, - }); - - if (!backgroundWorker) { - this.logger.error("#queueRunsWaitingForWorker: background worker not found", { - id: backgroundWorkerId, - }); - return; - } - - const runsWaitingForDeploy = await this.prisma.taskRun.findMany({ - where: { - runtimeEnvironmentId: backgroundWorker.runtimeEnvironmentId, - projectId: backgroundWorker.projectId, - status: "WAITING_FOR_DEPLOY", - taskIdentifier: { - in: backgroundWorker.tasks.map((task) => task.slug), - }, - }, - orderBy: { - createdAt: "asc", - }, - take: maxCount + 1, - }); - - //none to process - if (!runsWaitingForDeploy.length) return; - - for (const run of runsWaitingForDeploy) { - await this.prisma.$transaction(async (tx) => { - const updatedRun = await tx.taskRun.update({ - where: { - id: run.id, - }, - data: { - status: "PENDING", - }, - }); - await this.#enqueueRun({ - run: updatedRun, - env: backgroundWorker.runtimeEnvironment, - //add to the queue using the original run created time - //this should ensure they're in the correct order in the queue - timestamp: updatedRun.createdAt.getTime() - updatedRun.priorityMs, - tx, - }); - }); - } - - //enqueue more if needed - if (runsWaitingForDeploy.length > maxCount) { - await this.queueRunsWaitingForWorker({ backgroundWorkerId }); - } - } - - //MARK: - Waitpoints - async #createRunAssociatedWaitpoint( - tx: PrismaClientOrTransaction, - { - projectId, - environmentId, - completedByTaskRunId, - }: { projectId: string; environmentId: string; completedByTaskRunId: string } - ) { - return tx.waitpoint.create({ - data: { - ...WaitpointId.generate(), - type: "RUN", - status: "PENDING", - idempotencyKey: nanoid(24), - userProvidedIdempotencyKey: false, - projectId, - environmentId, - completedByTaskRunId, - }, - }); + /** This 
completes a waitpoint and updates all entries so the run isn't blocked, + * if they're no longer blocked. This doesn't suffer from race conditions. */ + async completeWaitpoint({ + id, + output, + }: { + id: string; + output?: { + value: string; + type?: string; + isError: boolean; + }; + }): Promise { + return this.waitpointSystem.completeWaitpoint({ id, output }); } - async #rescheduleDateTimeWaitpoint( - tx: PrismaClientOrTransaction, - waitpointId: string, - completedAfter: Date - ): Promise<{ success: true } | { success: false; error: string }> { - try { - const updatedWaitpoint = await tx.waitpoint.update({ - where: { id: waitpointId, status: "PENDING" }, - data: { - completedAfter, - }, - }); - } catch (error) { - if (error instanceof Prisma.PrismaClientKnownRequestError && error.code === "P2025") { - return { - success: false, - error: "Waitpoint doesn't exist or is already completed", - }; - } - - this.logger.error("Error rescheduling waitpoint", { error }); - - return { - success: false, - error: "An unknown error occurred", - }; - } - - //reschedule completion - await this.worker.enqueue({ - id: `finishWaitpoint.${waitpointId}`, - job: "finishWaitpoint", - payload: { waitpointId: waitpointId }, - availableAt: completedAfter, + /** + * This gets called AFTER the checkpoint has been created + * The CPU/Memory checkpoint at this point exists in our snapshot storage + */ + async createCheckpoint({ + runId, + snapshotId, + checkpoint, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + checkpoint: CheckpointInput; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + return this.checkpointSystem.createCheckpoint({ + runId, + snapshotId, + checkpoint, + workerId, + runnerId, + tx, }); - - return { - success: true, - }; } - async #clearBlockingWaitpoints({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) { - const prisma = tx ?? 
this.prisma; - const deleted = await prisma.taskRunWaitpoint.deleteMany({ - where: { - taskRunId: runId, - }, + /** + * This is called when a run has been restored from a checkpoint and is ready to start executing again + */ + async continueRunExecution({ + runId, + snapshotId, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + return this.checkpointSystem.continueRunExecution({ + runId, + snapshotId, + workerId, + runnerId, + tx, }); - - return deleted.count; } - //#region TaskRunExecutionSnapshots - async #createExecutionSnapshot( - prisma: PrismaClientOrTransaction, - { - run, - snapshot, - batchId, - environmentId, - environmentType, - checkpointId, + /** + Send a heartbeat to signal the the run is still executing. + If a heartbeat isn't received, after a while the run is considered "stalled" + and some logic will be run to try recover it. + @returns The ExecutionResult, which could be a different snapshot. 
+ */ + async heartbeatRun({ + runId, + snapshotId, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + return this.executionSnapshotSystem.heartbeatRun({ + runId, + snapshotId, workerId, runnerId, - completedWaitpoints, - error, - }: { - run: { id: string; status: TaskRunStatus; attemptNumber?: number | null }; - snapshot: { - executionStatus: TaskRunExecutionStatus; - description: string; - }; - batchId?: string; - environmentId: string; - environmentType: RuntimeEnvironmentType; - checkpointId?: string; - workerId?: string; - runnerId?: string; - completedWaitpoints?: { - id: string; - index?: number; - }[]; - error?: string; - } - ) { - const newSnapshot = await prisma.taskRunExecutionSnapshot.create({ - data: { - engine: "V2", - executionStatus: snapshot.executionStatus, - description: snapshot.description, - runId: run.id, - runStatus: run.status, - attemptNumber: run.attemptNumber ?? undefined, - batchId, - environmentId, - environmentType, - checkpointId, - workerId, - runnerId, - completedWaitpoints: { - connect: completedWaitpoints?.map((w) => ({ id: w.id })), - }, - completedWaitpointOrder: completedWaitpoints - ?.filter((c) => c.index !== undefined) - .sort((a, b) => a.index! - b.index!) - .map((w) => w.id), - isValid: error ? 
false : true, - error, - }, - include: { - checkpoint: true, - }, + tx, }); + } - if (!error) { - //set heartbeat (if relevant) - const intervalMs = this.#getHeartbeatIntervalMs(newSnapshot.executionStatus); - if (intervalMs !== null) { - await this.worker.enqueue({ - id: `heartbeatSnapshot.${run.id}`, - job: "heartbeatSnapshot", - payload: { snapshotId: newSnapshot.id, runId: run.id }, - availableAt: new Date(Date.now() + intervalMs), - }); - } - } + /** Get required data to execute the run */ + async getRunExecutionData({ + runId, + tx, + }: { + runId: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.prisma; + try { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); - this.eventBus.emit("executionSnapshotCreated", { - time: newSnapshot.createdAt, - run: { - id: newSnapshot.runId, - }, - snapshot: { - ...newSnapshot, - completedWaitpointIds: completedWaitpoints?.map((w) => w.id) ?? [], - }, - }); + const executionData: RunExecutionData = { + version: "1" as const, + snapshot: { + id: snapshot.id, + friendlyId: snapshot.friendlyId, + executionStatus: snapshot.executionStatus, + description: snapshot.description, + }, + run: { + id: snapshot.runId, + friendlyId: snapshot.runFriendlyId, + status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber ?? undefined, + }, + batch: snapshot.batchId + ? { + id: snapshot.batchId, + friendlyId: BatchId.toFriendlyId(snapshot.batchId), + } + : undefined, + checkpoint: snapshot.checkpoint + ? { + id: snapshot.checkpoint.id, + friendlyId: snapshot.checkpoint.friendlyId, + type: snapshot.checkpoint.type, + location: snapshot.checkpoint.location, + imageRef: snapshot.checkpoint.imageRef, + reason: snapshot.checkpoint.reason ?? 
undefined, + } + : undefined, + completedWaitpoints: snapshot.completedWaitpoints, + }; - return { - ...newSnapshot, - friendlyId: SnapshotId.toFriendlyId(newSnapshot.id), - runFriendlyId: RunId.toFriendlyId(newSnapshot.runId), - }; + return executionData; + } catch (e) { + this.logger.error("Failed to getRunExecutionData", { + message: e instanceof Error ? e.message : e, + }); + return null; + } } - #getHeartbeatIntervalMs(status: TaskRunExecutionStatus): number | null { - switch (status) { - case "PENDING_EXECUTING": { - return this.heartbeatTimeouts.PENDING_EXECUTING; - } - case "PENDING_CANCEL": { - return this.heartbeatTimeouts.PENDING_CANCEL; - } - case "EXECUTING": { - return this.heartbeatTimeouts.EXECUTING; - } - case "EXECUTING_WITH_WAITPOINTS": { - return this.heartbeatTimeouts.EXECUTING_WITH_WAITPOINTS; - } - default: { - return null; - } + async quit() { + try { + //stop the run queue + await this.releaseConcurrencySystem.quit(); + await this.runQueue.quit(); + await this.worker.stop(); + await this.runLock.quit(); + + // This is just a failsafe + await this.runLockRedis.quit(); + } catch (error) { + // And should always throw } } @@ -3703,7 +1236,7 @@ export class RunEngine { tx?: PrismaClientOrTransaction; }) { const prisma = tx ?? 
this.prisma; - return await this.runLock.lock([runId], 5_000, async (signal) => { + return await this.runLock.lock([runId], 5_000, async () => { const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); if (latestSnapshot.id !== snapshotId) { this.logger.log( @@ -3748,6 +1281,9 @@ export class RunEngine { case "QUEUED": { throw new NotImplementedError("There shouldn't be a heartbeat for QUEUED"); } + case "QUEUED_EXECUTING": { + throw new NotImplementedError("There shouldn't be a heartbeat for QUEUED_EXECUTING"); + } case "PENDING_EXECUTING": { //the run didn't start executing, we need to requeue it const run = await prisma.taskRun.findFirst({ @@ -3774,13 +1310,14 @@ export class RunEngine { } //it will automatically be requeued X times depending on the queue retry settings - const gotRequeued = await this.#tryNackAndRequeue({ + await this.runAttemptSystem.tryNackAndRequeue({ run, environment: { id: latestSnapshot.environmentId, type: latestSnapshot.environmentType, }, - orgId: run.runtimeEnvironment.organizationId, + orgId: latestSnapshot.organizationId, + projectId: latestSnapshot.projectId, error: { type: "INTERNAL_ERROR", code: "TASK_RUN_DEQUEUED_MAX_RETRIES", @@ -3795,7 +1332,7 @@ export class RunEngine { const retryDelay = 250; //todo call attemptFailed and force requeuing - await this.#attemptFailed({ + await this.runAttemptSystem.attemptFailed({ runId, snapshotId: latestSnapshot.id, completion: { @@ -3845,128 +1382,6 @@ export class RunEngine { }); } - //#endregion - - /** - * Sends a notification that a run has changed and we need to fetch the latest run state. - * The worker will call `getRunExecutionData` via the API and act accordingly. 
- */ - async #sendNotificationToWorker({ - runId, - snapshot, - }: { - runId: string; - snapshot: { - id: string; - executionStatus: TaskRunExecutionStatus; - }; - }) { - this.eventBus.emit("workerNotification", { - time: new Date(), - run: { - id: runId, - }, - snapshot: { - id: snapshot.id, - executionStatus: snapshot.executionStatus, - }, - }); - } - - /* - * Whether the run succeeds, fails, is cancelled… we need to run these operations - */ - async #finalizeRun({ id, batchId }: { id: string; batchId: string | null }) { - if (batchId) { - await this.tryCompleteBatch({ batchId }); - } - - //cancel the heartbeats - await this.worker.ack(`heartbeatSnapshot.${id}`); - } - - /** - * Checks to see if all runs for a BatchTaskRun are completed, if they are then update the status. - * This isn't used operationally, but it's used for the Batches dashboard page. - */ - async #tryCompleteBatch({ batchId }: { batchId: string }) { - return this.#trace( - "#tryCompleteBatch", - { - batchId, - }, - async (span) => { - const batch = await this.prisma.batchTaskRun.findUnique({ - select: { - status: true, - runtimeEnvironmentId: true, - }, - where: { - id: batchId, - }, - }); - - if (!batch) { - this.logger.error("#tryCompleteBatch batch doesn't exist", { batchId }); - return; - } - - if (batch.status === "COMPLETED") { - this.logger.debug("#tryCompleteBatch: Batch already completed", { batchId }); - return; - } - - const runs = await this.prisma.taskRun.findMany({ - select: { - id: true, - status: true, - }, - where: { - batchId, - runtimeEnvironmentId: batch.runtimeEnvironmentId, - }, - }); - - if (runs.every((r) => isFinalRunStatus(r.status))) { - this.logger.debug("#tryCompleteBatch: All runs are completed", { batchId }); - await this.prisma.batchTaskRun.update({ - where: { - id: batchId, - }, - data: { - status: "COMPLETED", - }, - }); - } else { - this.logger.debug("#tryCompleteBatch: Not all runs are completed", { batchId }); - } - } - ); - } - - async 
#getAuthenticatedEnvironmentFromRun(runId: string, tx?: PrismaClientOrTransaction) { - const prisma = tx ?? this.prisma; - const taskRun = await prisma.taskRun.findUnique({ - where: { - id: runId, - }, - include: { - runtimeEnvironment: { - include: { - organization: true, - project: true, - }, - }, - }, - }); - - if (!taskRun) { - return; - } - - return taskRun?.runtimeEnvironment; - } - #environmentMasterQueueKey(environmentId: string) { return `master-env:${environmentId}`; } @@ -3974,58 +1389,4 @@ export class RunEngine { #backgroundWorkerQueueKey(backgroundWorkerId: string) { return `master-background-worker:${backgroundWorkerId}`; } - - async #trace( - trace: string, - attributes: Attributes | undefined, - fn: (span: Span) => Promise - ): Promise { - return this.tracer.startActiveSpan( - `${this.constructor.name}.${trace}`, - { attributes, kind: SpanKind.SERVER }, - async (span) => { - try { - return await fn(span); - } catch (e) { - if (e instanceof ServiceValidationError) { - throw e; - } - - if (e instanceof Error) { - span.recordException(e); - } else { - span.recordException(new Error(String(e))); - } - - throw e; - } finally { - span.end(); - } - } - ); - } -} - -export class ServiceValidationError extends Error { - constructor( - message: string, - public status?: number - ) { - super(message); - this.name = "ServiceValidationError"; - } -} - -class NotImplementedError extends Error { - constructor(message: string) { - console.error("This isn't implemented", { message }); - super(message); - } -} - -export class RunDuplicateIdempotencyKeyError extends Error { - constructor(message: string) { - super(message); - this.name = "RunDuplicateIdempotencyKeyError"; - } } diff --git a/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts b/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts new file mode 100644 index 0000000000..fcdfb774e3 --- /dev/null +++ 
b/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts @@ -0,0 +1,633 @@ +import { Callback, createRedisClient, Redis, Result, type RedisOptions } from "@internal/redis"; +import { Tracer } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { setInterval } from "node:timers/promises"; +import { z } from "zod"; + +export type ReleaseConcurrencyQueueRetryOptions = { + maxRetries?: number; + backoff?: { + minDelay?: number; // Defaults to 1000 + maxDelay?: number; // Defaults to 60000 + factor?: number; // Defaults to 2 + }; +}; + +export type ReleaseConcurrencyQueueOptions = { + redis: RedisOptions; + executor: (releaseQueue: T, releaserId: string) => Promise; + keys: { + fromDescriptor: (releaseQueue: T) => string; + toDescriptor: (releaseQueue: string) => T; + }; + maxTokens: (descriptor: T) => Promise; + consumersCount?: number; + masterQueuesKey?: string; + tracer?: Tracer; + logger?: Logger; + pollInterval?: number; + batchSize?: number; + retry?: ReleaseConcurrencyQueueRetryOptions; +}; + +const QueueItemMetadata = z.object({ + retryCount: z.number(), + lastAttempt: z.number(), +}); + +type QueueItemMetadata = z.infer; + +export class ReleaseConcurrencyTokenBucketQueue { + private redis: Redis; + private logger: Logger; + private abortController: AbortController; + private consumers: ReleaseConcurrencyQueueConsumer[]; + + private keyPrefix: string; + private masterQueuesKey: string; + private consumersCount: number; + private pollInterval: number; + private keys: ReleaseConcurrencyQueueOptions["keys"]; + private maxTokens: ReleaseConcurrencyQueueOptions["maxTokens"]; + private batchSize: number; + private maxRetries: number; + private backoff: NonNullable>; + + constructor(private readonly options: ReleaseConcurrencyQueueOptions) { + this.redis = createRedisClient(options.redis); + this.keyPrefix = options.redis.keyPrefix ?? "re2:release-concurrency-queue:"; + this.logger = options.logger ?? 
new Logger("ReleaseConcurrencyQueue"); + this.abortController = new AbortController(); + this.consumers = []; + + this.masterQueuesKey = options.masterQueuesKey ?? "master-queue"; + this.consumersCount = options.consumersCount ?? 1; + this.pollInterval = options.pollInterval ?? 1000; + this.keys = options.keys; + this.maxTokens = options.maxTokens; + this.batchSize = options.batchSize ?? 5; + this.maxRetries = options.retry?.maxRetries ?? 3; + this.backoff = { + minDelay: options.retry?.backoff?.minDelay ?? 1000, + maxDelay: options.retry?.backoff?.maxDelay ?? 60000, + factor: options.retry?.backoff?.factor ?? 2, + }; + + this.#registerCommands(); + this.#startConsumers(); + } + + public async quit() { + this.abortController.abort(); + await this.redis.quit(); + } + + /** + * Attempt to release concurrency for a run. + * + * If there is a token available, then immediately release the concurrency + * If there is no token available, then we'll add the operation to a queue + * and wait until the token is available. + */ + public async attemptToRelease(releaseQueueDescriptor: T, releaserId: string) { + const maxTokens = await this.#callMaxTokens(releaseQueueDescriptor); + + if (maxTokens === 0) { + return; + } + + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + + const result = await this.redis.consumeToken( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + this.#queueKey(releaseQueue), + this.#metadataKey(releaseQueue), + releaseQueue, + releaserId, + String(maxTokens), + String(Date.now()) + ); + + if (!!result) { + await this.#callExecutor(releaseQueueDescriptor, releaserId, { + retryCount: 0, + lastAttempt: Date.now(), + }); + } else { + this.logger.info("No token available, adding to queue", { + releaseQueueDescriptor, + releaserId, + maxTokens, + }); + } + } + + /** + * Consume a token from the token bucket for a release queue. 
+ * + * This is mainly used for testing purposes + */ + public async consumeToken(releaseQueueDescriptor: T, releaserId: string) { + const maxTokens = await this.#callMaxTokens(releaseQueueDescriptor); + + if (maxTokens === 0) { + return; + } + + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + + await this.redis.consumeToken( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + this.#queueKey(releaseQueue), + this.#metadataKey(releaseQueue), + releaseQueue, + releaserId, + String(maxTokens), + String(Date.now()) + ); + } + + /** + * Return a token to the token bucket for a release queue. + * + * This is mainly used for testing purposes + */ + public async returnToken(releaseQueueDescriptor: T, releaserId: string) { + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + + await this.redis.returnTokenOnly( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + this.#queueKey(releaseQueue), + this.#metadataKey(releaseQueue), + releaseQueue, + releaserId + ); + } + + /** + * Refill the token bucket for a release queue. + * + * This will add the amount of tokens to the token bucket. 
+ */ + public async refillTokens(releaseQueueDescriptor: T, amount: number = 1) { + const maxTokens = await this.#callMaxTokens(releaseQueueDescriptor); + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + + if (amount < 0) { + throw new Error("Cannot refill with negative tokens"); + } + + if (amount === 0) { + return []; + } + + await this.redis.refillTokens( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + this.#queueKey(releaseQueue), + releaseQueue, + String(amount), + String(maxTokens) + ); + } + + /** + * Get the next queue that has available capacity and process one item from it + * Returns true if an item was processed, false if no items were available + */ + public async processNextAvailableQueue(): Promise { + const result = await this.redis.processMasterQueue( + this.masterQueuesKey, + this.keyPrefix, + this.batchSize, + String(Date.now()) + ); + + if (!result || result.length === 0) { + return false; + } + + await Promise.all( + result.map(([queue, releaserId, metadata]) => { + const itemMetadata = QueueItemMetadata.parse(JSON.parse(metadata)); + const releaseQueueDescriptor = this.keys.toDescriptor(queue); + return this.#callExecutor(releaseQueueDescriptor, releaserId, itemMetadata); + }) + ); + + return true; + } + + async #callExecutor(releaseQueueDescriptor: T, releaserId: string, metadata: QueueItemMetadata) { + try { + this.logger.info("Executing run:", { releaseQueueDescriptor, releaserId }); + + await this.options.executor(releaseQueueDescriptor, releaserId); + } catch (error) { + this.logger.error("Error executing run:", { error }); + + if (metadata.retryCount >= this.maxRetries) { + this.logger.error("Max retries reached:", { + releaseQueueDescriptor, + releaserId, + retryCount: metadata.retryCount, + }); + + // Return the token but don't requeue + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + await this.redis.returnTokenOnly( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + 
this.#queueKey(releaseQueue), + this.#metadataKey(releaseQueue), + releaseQueue, + releaserId + ); + + this.logger.info("Returned token:", { releaseQueueDescriptor, releaserId }); + + return; + } + + const updatedMetadata: QueueItemMetadata = { + ...metadata, + retryCount: metadata.retryCount + 1, + lastAttempt: Date.now(), + }; + + const releaseQueue = this.keys.fromDescriptor(releaseQueueDescriptor); + + await this.redis.returnTokenAndRequeue( + this.masterQueuesKey, + this.#bucketKey(releaseQueue), + this.#queueKey(releaseQueue), + this.#metadataKey(releaseQueue), + releaseQueue, + releaserId, + JSON.stringify(updatedMetadata), + this.#calculateBackoffScore(updatedMetadata) + ); + } + } + + // Make sure maxTokens is an integer (round down) + // And if it throws, return 0 + async #callMaxTokens(releaseQueueDescriptor: T) { + try { + const maxTokens = await this.maxTokens(releaseQueueDescriptor); + return Math.floor(maxTokens); + } catch (error) { + return 0; + } + } + + #bucketKey(releaseQueue: string) { + return `${releaseQueue}:bucket`; + } + + #queueKey(releaseQueue: string) { + return `${releaseQueue}:queue`; + } + + #metadataKey(releaseQueue: string) { + return `${releaseQueue}:metadata`; + } + + #startConsumers() { + const consumerCount = this.consumersCount; + + for (let i = 0; i < consumerCount; i++) { + const consumer = new ReleaseConcurrencyQueueConsumer( + this, + this.pollInterval, + this.abortController.signal, + this.logger + ); + this.consumers.push(consumer); + // Start the consumer and don't await it + consumer.start().catch((error) => { + this.logger.error("Consumer failed to start:", { error, consumerId: i }); + }); + } + } + + #calculateBackoffScore(item: QueueItemMetadata): string { + const delay = Math.min( + this.backoff.maxDelay, + this.backoff.minDelay * Math.pow(this.backoff.factor, item.retryCount) + ); + return String(Date.now() + delay); + } + + #registerCommands() { + this.redis.defineCommand("consumeToken", { + numberOfKeys: 4, + 
lua: ` +local masterQueuesKey = KEYS[1] +local bucketKey = KEYS[2] +local queueKey = KEYS[3] +local metadataKey = KEYS[4] + +local releaseQueue = ARGV[1] +local releaserId = ARGV[2] +local maxTokens = tonumber(ARGV[3]) +local score = ARGV[4] + +-- Get the current token count +local currentTokens = tonumber(redis.call("GET", bucketKey) or maxTokens) + +-- If we have enough tokens, then consume them +if currentTokens >= 1 then + redis.call("SET", bucketKey, currentTokens - 1) + redis.call("ZREM", queueKey, releaserId) + + -- Clean up metadata when successfully consuming + redis.call("HDEL", metadataKey, releaserId) + + -- Get queue length after removing the item + local queueLength = redis.call("ZCARD", queueKey) + + -- If we still have tokens and items in queue, update available queues + if currentTokens > 0 and queueLength > 0 then + redis.call("ZADD", masterQueuesKey, currentTokens, releaseQueue) + else + redis.call("ZREM", masterQueuesKey, releaseQueue) + end + + return true +end + +-- If we don't have enough tokens, then we need to add the operation to the queue +redis.call("ZADD", queueKey, score, releaserId) + +-- Initialize or update metadata +local metadata = cjson.encode({ + retryCount = 0, + lastAttempt = tonumber(score) +}) +redis.call("HSET", metadataKey, releaserId, metadata) + +-- Remove from the master queue +redis.call("ZREM", masterQueuesKey, releaseQueue) + +return false + `, + }); + + this.redis.defineCommand("refillTokens", { + numberOfKeys: 3, + lua: ` +local masterQueuesKey = KEYS[1] +local bucketKey = KEYS[2] +local queueKey = KEYS[3] + +local releaseQueue = ARGV[1] +local amount = tonumber(ARGV[2]) +local maxTokens = tonumber(ARGV[3]) + +local currentTokens = tonumber(redis.call("GET", bucketKey) or maxTokens) + +-- Add the amount of tokens to the token bucket +local newTokens = currentTokens + amount + +-- If we have more tokens than the max, then set the token bucket to the max +if newTokens > maxTokens then + newTokens = maxTokens +end + 
+redis.call("SET", bucketKey, newTokens) + +-- Get the number of items in the queue +local queueLength = redis.call("ZCARD", queueKey) + +-- If we have tokens available and items in the queue, add to available queues +if newTokens > 0 and queueLength > 0 then + redis.call("ZADD", masterQueuesKey, newTokens, releaseQueue) +else + redis.call("ZREM", masterQueuesKey, releaseQueue) +end + `, + }); + + this.redis.defineCommand("processMasterQueue", { + numberOfKeys: 1, + lua: ` +local masterQueuesKey = KEYS[1] + +local keyPrefix = ARGV[1] +local batchSize = tonumber(ARGV[2]) +local currentTime = tonumber(ARGV[3]) +-- Get the queue with the highest number of available tokens +local queues = redis.call("ZREVRANGE", masterQueuesKey, 0, 0, "WITHSCORES") +if #queues == 0 then + return nil +end + +local queueName = queues[1] +local availableTokens = tonumber(queues[2]) + +local bucketKey = keyPrefix .. queueName .. ":bucket" +local queueKey = keyPrefix .. queueName .. ":queue" +local metadataKey = keyPrefix .. queueName .. 
":metadata" + +-- Get the oldest item from the queue +local items = redis.call("ZRANGEBYSCORE", queueKey, 0, currentTime, "LIMIT", 0, batchSize) +if #items == 0 then +-- No items ready to be processed yet + return nil +end + +-- Calculate how many items we can actually process +local itemsToProcess = math.min(#items, availableTokens) +local results = {} + +-- Consume tokens and collect results +local currentTokens = tonumber(redis.call("GET", bucketKey)) +redis.call("SET", bucketKey, currentTokens - itemsToProcess) + +-- Remove the items from the queue and add to results +for i = 1, itemsToProcess do + local releaserId = items[i] + redis.call("ZREM", queueKey, releaserId) + + -- Get metadata before removing it + local metadata = redis.call("HGET", metadataKey, releaserId) + redis.call("HDEL", metadataKey, releaserId) + + table.insert(results, { queueName, releaserId, metadata }) +end + +-- Get remaining queue length +local queueLength = redis.call("ZCARD", queueKey) + +-- Update available queues score or remove if no more tokens +local remainingTokens = currentTokens - itemsToProcess +if remainingTokens > 0 and queueLength > 0 then + redis.call("ZADD", masterQueuesKey, remainingTokens, queueName) +else + redis.call("ZREM", masterQueuesKey, queueName) +end + +return results + `, + }); + + this.redis.defineCommand("returnTokenAndRequeue", { + numberOfKeys: 4, + lua: ` +local masterQueuesKey = KEYS[1] +local bucketKey = KEYS[2] +local queueKey = KEYS[3] +local metadataKey = KEYS[4] + +local releaseQueue = ARGV[1] +local releaserId = ARGV[2] +local metadata = ARGV[3] +local score = ARGV[4] + +-- Return the token to the bucket +local currentTokens = tonumber(redis.call("GET", bucketKey)) +local remainingTokens = currentTokens + 1 +redis.call("SET", bucketKey, remainingTokens) + +-- Add the item back to the queue +redis.call("ZADD", queueKey, score, releaserId) + +-- Add the metadata back to the item +redis.call("HSET", metadataKey, releaserId, metadata) + +-- Update the 
master queue +local queueLength = redis.call("ZCARD", queueKey) +if queueLength > 0 then + redis.call("ZADD", masterQueuesKey, remainingTokens, releaseQueue) +else + redis.call("ZREM", masterQueuesKey, releaseQueue) +end + +return true + `, + }); + + this.redis.defineCommand("returnTokenOnly", { + numberOfKeys: 4, + lua: ` +local masterQueuesKey = KEYS[1] +local bucketKey = KEYS[2] +local queueKey = KEYS[3] +local metadataKey = KEYS[4] + +local releaseQueue = ARGV[1] +local releaserId = ARGV[2] + +-- Return the token to the bucket +local currentTokens = tonumber(redis.call("GET", bucketKey)) +local remainingTokens = currentTokens + 1 +redis.call("SET", bucketKey, remainingTokens) + +-- Clean up metadata +redis.call("HDEL", metadataKey, releaserId) + +-- Update the master queue based on remaining queue length +local queueLength = redis.call("ZCARD", queueKey) +if queueLength > 0 then + redis.call("ZADD", masterQueuesKey, remainingTokens, releaseQueue) +else + redis.call("ZREM", masterQueuesKey, releaseQueue) +end + +return true + `, + }); + } +} + +declare module "@internal/redis" { + interface RedisCommander { + consumeToken( + masterQueuesKey: string, + bucketKey: string, + queueKey: string, + metadataKey: string, + releaseQueue: string, + releaserId: string, + maxTokens: string, + score: string, + callback?: Callback + ): Result; + + refillTokens( + masterQueuesKey: string, + bucketKey: string, + queueKey: string, + releaseQueue: string, + amount: string, + maxTokens: string, + callback?: Callback + ): Result; + + processMasterQueue( + masterQueuesKey: string, + keyPrefix: string, + batchSize: number, + currentTime: string, + callback?: Callback<[string, string, string][]> + ): Result<[string, string, string][], Context>; + + returnTokenAndRequeue( + masterQueuesKey: string, + bucketKey: string, + queueKey: string, + metadataKey: string, + releaseQueue: string, + releaserId: string, + metadata: string, + score: string, + callback?: Callback + ): Result; + + 
returnTokenOnly( + masterQueuesKey: string, + bucketKey: string, + queueKey: string, + metadataKey: string, + releaseQueue: string, + releaserId: string, + callback?: Callback + ): Result; + } +} + +class ReleaseConcurrencyQueueConsumer { + private logger: Logger; + + constructor( + private readonly queue: ReleaseConcurrencyTokenBucketQueue, + private readonly pollInterval: number, + private readonly signal: AbortSignal, + logger?: Logger + ) { + this.logger = logger ?? new Logger("QueueConsumer"); + } + + async start() { + try { + for await (const _ of setInterval(this.pollInterval, null, { signal: this.signal })) { + try { + const processed = await this.queue.processNextAvailableQueue(); + if (!processed) { + continue; + } + } catch (error) { + this.logger.error("Error processing queue:", { error }); + } + } + } catch (error) { + if (error instanceof Error && error.name !== "AbortError") { + this.logger.error("Consumer loop error:", { error }); + } + } + } +} diff --git a/internal-packages/run-engine/src/engine/retrying.ts b/internal-packages/run-engine/src/engine/retrying.ts index f214738ade..a621552e92 100644 --- a/internal-packages/run-engine/src/engine/retrying.ts +++ b/internal-packages/run-engine/src/engine/retrying.ts @@ -10,7 +10,7 @@ import { } from "@trigger.dev/core/v3"; import { PrismaClientOrTransaction } from "@trigger.dev/database"; import { MAX_TASK_RUN_ATTEMPTS } from "./consts.js"; -import { ServiceValidationError } from "./index.js"; +import { ServiceValidationError } from "./errors.js"; type Params = { runId: string; diff --git a/internal-packages/run-engine/src/engine/statuses.ts b/internal-packages/run-engine/src/engine/statuses.ts index 27ba540be1..5eb923fa3d 100644 --- a/internal-packages/run-engine/src/engine/statuses.ts +++ b/internal-packages/run-engine/src/engine/statuses.ts @@ -1,7 +1,7 @@ import { TaskRunExecutionStatus, TaskRunStatus } from "@trigger.dev/database"; export function isDequeueableExecutionStatus(status: 
TaskRunExecutionStatus): boolean { - const dequeuableExecutionStatuses: TaskRunExecutionStatus[] = ["QUEUED"]; + const dequeuableExecutionStatuses: TaskRunExecutionStatus[] = ["QUEUED", "QUEUED_EXECUTING"]; return dequeuableExecutionStatuses.includes(status); } @@ -26,6 +26,7 @@ export function isCheckpointable(status: TaskRunExecutionStatus): boolean { //executing "EXECUTING", "EXECUTING_WITH_WAITPOINTS", + "QUEUED_EXECUTING", ]; return checkpointableStatuses.includes(status); } @@ -44,3 +45,8 @@ export function isFinalRunStatus(status: TaskRunStatus): boolean { return finalStatuses.includes(status); } + +export function canReleaseConcurrency(status: TaskRunExecutionStatus): boolean { + const releaseableStatuses: TaskRunExecutionStatus[] = ["SUSPENDED", "EXECUTING_WITH_WAITPOINTS"]; + return releaseableStatuses.includes(status); +} diff --git a/internal-packages/run-engine/src/engine/systems/batchSystem.ts b/internal-packages/run-engine/src/engine/systems/batchSystem.ts new file mode 100644 index 0000000000..5f1948a831 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/batchSystem.ts @@ -0,0 +1,83 @@ +import { startSpan } from "@internal/tracing"; +import { isFinalRunStatus } from "../statuses.js"; +import { SystemResources } from "./systems.js"; + +export type BatchSystemOptions = { + resources: SystemResources; +}; + +export class BatchSystem { + private readonly $: SystemResources; + + constructor(private readonly options: BatchSystemOptions) { + this.$ = options.resources; + } + + public async scheduleCompleteBatch({ batchId }: { batchId: string }): Promise { + await this.$.worker.enqueue({ + //this will debounce the call + id: `tryCompleteBatch:${batchId}`, + job: "tryCompleteBatch", + payload: { batchId: batchId }, + //2s in the future + availableAt: new Date(Date.now() + 2_000), + }); + } + + public async performCompleteBatch({ batchId }: { batchId: string }): Promise { + await this.#tryCompleteBatch({ batchId }); + } + + /** + * Checks to 
see if all runs for a BatchTaskRun are completed, if they are then update the status. + * This isn't used operationally, but it's used for the Batches dashboard page. + */ + async #tryCompleteBatch({ batchId }: { batchId: string }) { + return startSpan(this.$.tracer, "#tryCompleteBatch", async (span) => { + const batch = await this.$.prisma.batchTaskRun.findUnique({ + select: { + status: true, + runtimeEnvironmentId: true, + }, + where: { + id: batchId, + }, + }); + + if (!batch) { + this.$.logger.error("#tryCompleteBatch batch doesn't exist", { batchId }); + return; + } + + if (batch.status === "COMPLETED") { + this.$.logger.debug("#tryCompleteBatch: Batch already completed", { batchId }); + return; + } + + const runs = await this.$.prisma.taskRun.findMany({ + select: { + id: true, + status: true, + }, + where: { + batchId, + runtimeEnvironmentId: batch.runtimeEnvironmentId, + }, + }); + + if (runs.every((r) => isFinalRunStatus(r.status))) { + this.$.logger.debug("#tryCompleteBatch: All runs are completed", { batchId }); + await this.$.prisma.batchTaskRun.update({ + where: { + id: batchId, + }, + data: { + status: "COMPLETED", + }, + }); + } else { + this.$.logger.debug("#tryCompleteBatch: Not all runs are completed", { batchId }); + } + }); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts new file mode 100644 index 0000000000..de06fca524 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/checkpointSystem.ts @@ -0,0 +1,295 @@ +import { CheckpointInput, CreateCheckpointResult, ExecutionResult } from "@trigger.dev/core/v3"; +import { CheckpointId } from "@trigger.dev/core/v3/isomorphic"; +import { PrismaClientOrTransaction } from "@trigger.dev/database"; +import { sendNotificationToWorker } from "../eventBus.js"; +import { isCheckpointable, isPendingExecuting } from "../statuses.js"; +import { + getLatestExecutionSnapshot, + 
executionResultFromSnapshot, + ExecutionSnapshotSystem, +} from "./executionSnapshotSystem.js"; +import { SystemResources } from "./systems.js"; +import { ServiceValidationError } from "../errors.js"; +import { EnqueueSystem } from "./enqueueSystem.js"; +import { ReleaseConcurrencySystem } from "./releaseConcurrencySystem.js"; +export type CheckpointSystemOptions = { + resources: SystemResources; + executionSnapshotSystem: ExecutionSnapshotSystem; + enqueueSystem: EnqueueSystem; + releaseConcurrencySystem: ReleaseConcurrencySystem; +}; + +export class CheckpointSystem { + private readonly $: SystemResources; + private readonly executionSnapshotSystem: ExecutionSnapshotSystem; + private readonly enqueueSystem: EnqueueSystem; + private readonly releaseConcurrencySystem: ReleaseConcurrencySystem; + + constructor(private readonly options: CheckpointSystemOptions) { + this.$ = options.resources; + this.executionSnapshotSystem = options.executionSnapshotSystem; + this.enqueueSystem = options.enqueueSystem; + this.releaseConcurrencySystem = options.releaseConcurrencySystem; + } + + /** + * This gets called AFTER the checkpoint has been created + * The CPU/Memory checkpoint at this point exists in our snapshot storage + */ + async createCheckpoint({ + runId, + snapshotId, + checkpoint, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + checkpoint: CheckpointInput; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? 
this.$.prisma; + + return await this.$.runLock.lock([runId], 5_000, async () => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + const isValidSnapshot = + // Case 1: The provided snapshotId matches the current snapshot + snapshot.id === snapshotId || + // Case 2: The provided snapshotId matches the previous snapshot + // AND we're in QUEUED_EXECUTING state (which is valid) + (snapshot.previousSnapshotId === snapshotId && + snapshot.executionStatus === "QUEUED_EXECUTING"); + + if (!isValidSnapshot) { + this.$.logger.error("Tried to createCheckpoint on an invalid snapshot", { + snapshot, + snapshotId, + }); + + this.$.eventBus.emit("incomingCheckpointDiscarded", { + time: new Date(), + run: { + id: runId, + }, + checkpoint: { + discardReason: "Not the latest snapshot", + metadata: checkpoint, + }, + snapshot: { + id: snapshot.id, + executionStatus: snapshot.executionStatus, + }, + }); + + return { + ok: false as const, + error: "Not the latest snapshot", + }; + } + + if (!isCheckpointable(snapshot.executionStatus)) { + this.$.logger.error("Tried to createCheckpoint on a run in an invalid state", { + snapshot, + }); + + this.$.eventBus.emit("incomingCheckpointDiscarded", { + time: new Date(), + run: { + id: runId, + }, + checkpoint: { + discardReason: `Status ${snapshot.executionStatus} is not checkpointable`, + metadata: checkpoint, + }, + snapshot: { + id: snapshot.id, + executionStatus: snapshot.executionStatus, + }, + }); + + return { + ok: false as const, + error: `Status ${snapshot.executionStatus} is not checkpointable`, + }; + } + + // Get the run and update the status + const run = await this.$.prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + status: "WAITING_TO_RESUME", + }, + include: { + runtimeEnvironment: { + include: { + project: true, + organization: true, + }, + }, + }, + }); + + if (!run) { + this.$.logger.error("Run not found for createCheckpoint", { + snapshot, + }); + + throw new ServiceValidationError("Run 
not found", 404); + } + + // Create the checkpoint + const taskRunCheckpoint = await prisma.taskRunCheckpoint.create({ + data: { + ...CheckpointId.generate(), + type: checkpoint.type, + location: checkpoint.location, + imageRef: checkpoint.imageRef, + reason: checkpoint.reason, + runtimeEnvironmentId: run.runtimeEnvironment.id, + projectId: run.runtimeEnvironment.projectId, + }, + }); + + if (snapshot.executionStatus === "QUEUED_EXECUTING") { + // Enqueue the run again + const newSnapshot = await this.enqueueSystem.enqueueRun({ + run, + env: run.runtimeEnvironment, + timestamp: run.createdAt.getTime() - run.priorityMs, + snapshot: { + status: "QUEUED", + description: + "Run was QUEUED, because it was queued and executing and a checkpoint was created", + metadata: snapshot.metadata, + }, + previousSnapshotId: snapshot.id, + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints.map((waitpoint) => ({ + id: waitpoint.id, + index: waitpoint.index, + })), + checkpointId: taskRunCheckpoint.id, + }); + + // Refill the token bucket for the release concurrency queue + await this.releaseConcurrencySystem.checkpointCreatedOnEnvironment(run.runtimeEnvironment); + + return { + ok: true as const, + ...executionResultFromSnapshot(newSnapshot), + checkpoint: taskRunCheckpoint, + } satisfies CreateCheckpointResult; + } else { + //create a new execution snapshot, with the checkpoint + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "SUSPENDED", + description: "Run was suspended after creating a checkpoint.", + metadata: snapshot.metadata, + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + checkpointId: taskRunCheckpoint.id, + workerId, + runnerId, + }); + + // Refill the token bucket for the release concurrency queue 
+ await this.releaseConcurrencySystem.checkpointCreatedOnEnvironment(run.runtimeEnvironment); + + return { + ok: true as const, + ...executionResultFromSnapshot(newSnapshot), + checkpoint: taskRunCheckpoint, + } satisfies CreateCheckpointResult; + } + }); + } + + /** + * This is called when a run has been restored from a checkpoint and is ready to start executing again + */ + async continueRunExecution({ + runId, + snapshotId, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.$.prisma; + + return await this.$.runLock.lock([runId], 5_000, async () => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (snapshot.id !== snapshotId) { + throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); + } + + if (!isPendingExecuting(snapshot.executionStatus)) { + throw new ServiceValidationError("Snapshot is not in a valid state to continue", 400); + } + + // Get the run and update the status + const run = await this.$.prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + status: "EXECUTING", + }, + select: { + id: true, + status: true, + attemptNumber: true, + }, + }); + + if (!run) { + this.$.logger.error("Run not found for createCheckpoint", { + snapshot, + }); + + throw new ServiceValidationError("Run not found", 404); + } + + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "EXECUTING", + description: "Run was continued after being suspended", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + completedWaitpoints: snapshot.completedWaitpoints, + workerId, + runnerId, + }); + + // Let worker know about the new snapshot so it can 
continue the run + await sendNotificationToWorker({ runId, snapshot: newSnapshot, eventBus: this.$.eventBus }); + + return { + ...executionResultFromSnapshot(newSnapshot), + } satisfies ExecutionResult; + }); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts new file mode 100644 index 0000000000..c954a8d7e1 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/delayedRunSystem.ts @@ -0,0 +1,135 @@ +import { startSpan } from "@internal/tracing"; +import { SystemResources } from "./systems.js"; +import { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; +import { getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/isomorphic"; +import { EnqueueSystem } from "./enqueueSystem.js"; +import { ServiceValidationError } from "../errors.js"; + +export type DelayedRunSystemOptions = { + resources: SystemResources; + enqueueSystem: EnqueueSystem; +}; + +export class DelayedRunSystem { + private readonly $: SystemResources; + private readonly enqueueSystem: EnqueueSystem; + + constructor(private readonly options: DelayedRunSystemOptions) { + this.$ = options.resources; + this.enqueueSystem = options.enqueueSystem; + } + + /** + * Reschedules a delayed run where the run hasn't been queued yet + */ + async rescheduleDelayedRun({ + runId, + delayUntil, + tx, + }: { + runId: string; + delayUntil: Date; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? 
this.$.prisma; + return startSpan( + this.$.tracer, + "rescheduleDelayedRun", + async () => { + return await this.$.runLock.lock([runId], 5_000, async () => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + //if the run isn't just created then we can't reschedule it + if (snapshot.executionStatus !== "RUN_CREATED") { + throw new ServiceValidationError("Cannot reschedule a run that is not delayed"); + } + + const updatedRun = await prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + delayUntil: delayUntil, + executionSnapshots: { + create: { + engine: "V2", + executionStatus: "RUN_CREATED", + description: "Delayed run was rescheduled to a future date", + runStatus: "EXPIRED", + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + }, + }, + }, + }); + + await this.$.worker.reschedule(`enqueueDelayedRun:${updatedRun.id}`, delayUntil); + + return updatedRun; + }); + }, + { + attributes: { runId }, + } + ); + } + + async enqueueDelayedRun({ runId }: { runId: string }) { + const run = await this.$.prisma.taskRun.findFirst({ + where: { id: runId }, + include: { + runtimeEnvironment: { + include: { + project: true, + organization: true, + }, + }, + }, + }); + + if (!run) { + throw new Error(`#enqueueDelayedRun: run not found: ${runId}`); + } + + // Now we need to enqueue the run into the RunQueue + await this.enqueueSystem.enqueueRun({ + run, + env: run.runtimeEnvironment, + timestamp: run.createdAt.getTime() - run.priorityMs, + batchId: run.batchId ?? 
undefined, + }); + + await this.$.prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "PENDING", + queuedAt: new Date(), + }, + }); + + if (run.ttl) { + const expireAt = parseNaturalLanguageDuration(run.ttl); + + if (expireAt) { + await this.$.worker.enqueue({ + id: `expireRun:${runId}`, + job: "expireRun", + payload: { runId }, + availableAt: expireAt, + }); + } + } + } + + async scheduleDelayedRunEnqueuing({ runId, delayUntil }: { runId: string; delayUntil: Date }) { + await this.$.worker.enqueue({ + id: `enqueueDelayedRun:${runId}`, + job: "enqueueDelayedRun", + payload: { runId }, + availableAt: delayUntil, + }); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts new file mode 100644 index 0000000000..33bdb56563 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -0,0 +1,620 @@ +import { startSpan } from "@internal/tracing"; +import { assertExhaustive } from "@trigger.dev/core"; +import { DequeuedMessage, MachineResources, RetryOptions } from "@trigger.dev/core/v3"; +import { getMaxDuration, sanitizeQueueName } from "@trigger.dev/core/v3/isomorphic"; +import { PrismaClientOrTransaction } from "@trigger.dev/database"; +import { getRunWithBackgroundWorkerTasks } from "../db/worker.js"; +import { getMachinePreset } from "../machinePresets.js"; +import { isDequeueableExecutionStatus } from "../statuses.js"; +import { RunEngineOptions } from "../types.js"; +import { ExecutionSnapshotSystem, getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { RunAttemptSystem } from "./runAttemptSystem.js"; +import { SystemResources } from "./systems.js"; +import { sendNotificationToWorker } from "../eventBus.js"; + +export type DequeueSystemOptions = { + resources: SystemResources; + machines: RunEngineOptions["machines"]; + executionSnapshotSystem: ExecutionSnapshotSystem; + runAttemptSystem: 
RunAttemptSystem; +}; + +export class DequeueSystem { + private readonly $: SystemResources; + private readonly executionSnapshotSystem: ExecutionSnapshotSystem; + private readonly runAttemptSystem: RunAttemptSystem; + + constructor(private readonly options: DequeueSystemOptions) { + this.$ = options.resources; + this.executionSnapshotSystem = options.executionSnapshotSystem; + this.runAttemptSystem = options.runAttemptSystem; + } + + /** + * Gets a fairly selected run from the specified master queue, returning the information required to run it. + * @param consumerId: The consumer that is pulling, allows multiple consumers to pull from the same queue + * @param masterQueue: The shared queue to pull from, can be an individual environment (for dev) + * @returns + */ + async dequeueFromMasterQueue({ + consumerId, + masterQueue, + maxRunCount, + maxResources, + backgroundWorkerId, + workerId, + runnerId, + tx, + }: { + consumerId: string; + masterQueue: string; + maxRunCount: number; + maxResources?: MachineResources; + backgroundWorkerId?: string; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? 
this.$.prisma; + + return startSpan( + this.$.tracer, + "dequeueFromMasterQueue", + async (span) => { + //gets multiple runs from the queue + const messages = await this.$.runQueue.dequeueMessageFromMasterQueue( + consumerId, + masterQueue, + maxRunCount + ); + if (messages.length === 0) { + return []; + } + + //we can't send more than the max resources + const consumedResources: MachineResources = { + cpu: 0, + memory: 0, + }; + + const dequeuedRuns: DequeuedMessage[] = []; + + for (const message of messages) { + const orgId = message.message.orgId; + const runId = message.messageId; + + span.setAttribute("runId", runId); + + //lock the run so nothing else can modify it + try { + const dequeuedRun = await this.$.runLock.lock([runId], 5000, async (signal) => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (!isDequeueableExecutionStatus(snapshot.executionStatus)) { + //create a failed snapshot + await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run: { + id: snapshot.runId, + status: snapshot.runStatus, + }, + snapshot: { + executionStatus: snapshot.executionStatus, + description: + "Tried to dequeue a run that is not in a valid state to be dequeued.", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + checkpointId: snapshot.checkpointId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints, + error: `Tried to dequeue a run that is not in a valid state to be dequeued.`, + workerId, + runnerId, + }); + + //todo is there a way to recover this, so the run can be retried? + //for example should we update the status to a dequeuable status and nack it? 
+ //then at least it has a chance of succeeding and we have the error log above + await this.runAttemptSystem.systemFailure({ + runId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_DEQUEUED_INVALID_STATE", + message: `Task was in the ${snapshot.executionStatus} state when it was dequeued for execution.`, + }, + tx: prisma, + }); + this.$.logger.error( + `RunEngine.dequeueFromMasterQueue(): Run is not in a valid state to be dequeued: ${runId}\n ${snapshot.id}:${snapshot.executionStatus}` + ); + return null; + } + + if (snapshot.executionStatus === "QUEUED_EXECUTING") { + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { + run: { + id: runId, + status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber, + }, + snapshot: { + executionStatus: "EXECUTING", + description: "Run was continued, whilst still executing.", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + batchId: snapshot.batchId ?? 
undefined, + completedWaitpoints: snapshot.completedWaitpoints.map((waitpoint) => ({ + id: waitpoint.id, + index: waitpoint.index, + })), + } + ); + + await sendNotificationToWorker({ + runId, + snapshot: newSnapshot, + eventBus: this.$.eventBus, + }); + + return null; + } + + const result = await getRunWithBackgroundWorkerTasks( + prisma, + runId, + backgroundWorkerId + ); + + if (!result.success) { + switch (result.code) { + case "NO_RUN": { + //this should not happen, the run is unrecoverable so we'll ack it + this.$.logger.error("RunEngine.dequeueFromMasterQueue(): No run found", { + runId, + latestSnapshot: snapshot.id, + }); + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return null; + } + case "NO_WORKER": + case "TASK_NEVER_REGISTERED": + case "TASK_NOT_IN_LATEST": { + this.$.logger.warn(`RunEngine.dequeueFromMasterQueue(): ${result.code}`, { + runId, + latestSnapshot: snapshot.id, + result, + }); + + //not deployed yet, so we'll wait for the deploy + await this.#waitingForDeploy({ + orgId, + runId, + reason: result.message, + tx: prisma, + }); + return null; + } + case "BACKGROUND_WORKER_MISMATCH": { + this.$.logger.warn( + "RunEngine.dequeueFromMasterQueue(): Background worker mismatch", + { + runId, + latestSnapshot: snapshot.id, + result, + } + ); + + //worker mismatch so put it back in the queue + await this.$.runQueue.nackMessage({ orgId, messageId: runId }); + + return null; + } + default: { + assertExhaustive(result); + } + } + } + + //check for a valid deployment if it's not a development environment + if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") { + if (!result.deployment || !result.deployment.imageReference) { + this.$.logger.warn("RunEngine.dequeueFromMasterQueue(): No deployment found", { + runId, + latestSnapshot: snapshot.id, + result, + }); + //not deployed yet, so we'll wait for the deploy + await this.#waitingForDeploy({ + orgId, + runId, + reason: "No deployment or deployment image reference found for deployed 
run", + tx: prisma, + }); + + return null; + } + } + + const machinePreset = getMachinePreset({ + machines: this.options.machines.machines, + defaultMachine: this.options.machines.defaultMachine, + config: result.task.machineConfig ?? {}, + run: result.run, + }); + + //increment the consumed resources + consumedResources.cpu += machinePreset.cpu; + consumedResources.memory += machinePreset.memory; + + //are we under the limit? + if (maxResources) { + if ( + consumedResources.cpu > maxResources.cpu || + consumedResources.memory > maxResources.memory + ) { + this.$.logger.debug( + "RunEngine.dequeueFromMasterQueue(): Consumed resources over limit, nacking", + { + runId, + consumedResources, + maxResources, + } + ); + + //put it back in the queue where it was + await this.$.runQueue.nackMessage({ + orgId, + messageId: runId, + incrementAttemptCount: false, + retryAt: result.run.createdAt.getTime() - result.run.priorityMs, + }); + return null; + } + } + + // Check max attempts that can optionally be set when triggering a run + let maxAttempts: number | null | undefined = result.run.maxAttempts; + + // If it's not set, we'll grab it from the task's retry config + if (!maxAttempts) { + const retryConfig = result.task.retryConfig; + + this.$.logger.debug( + "RunEngine.dequeueFromMasterQueue(): maxAttempts not set, using task's retry config", + { + runId, + task: result.task.id, + rawRetryConfig: retryConfig, + } + ); + + const parsedConfig = RetryOptions.nullable().safeParse(retryConfig); + + if (!parsedConfig.success) { + this.$.logger.error("RunEngine.dequeueFromMasterQueue(): Invalid retry config", { + runId, + task: result.task.id, + rawRetryConfig: retryConfig, + }); + + await this.runAttemptSystem.systemFailure({ + runId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_DEQUEUED_INVALID_RETRY_CONFIG", + message: `Invalid retry config: ${retryConfig}`, + }, + tx: prisma, + }); + + return null; + } + + if (!parsedConfig.data) { + 
this.$.logger.error("RunEngine.dequeueFromMasterQueue(): No retry config", { + runId, + task: result.task.id, + rawRetryConfig: retryConfig, + }); + + await this.runAttemptSystem.systemFailure({ + runId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_DEQUEUED_NO_RETRY_CONFIG", + message: `No retry config found`, + }, + tx: prisma, + }); + + return null; + } + + maxAttempts = parsedConfig.data.maxAttempts; + } + + const queue = await prisma.taskQueue.findUnique({ + where: { + runtimeEnvironmentId_name: { + runtimeEnvironmentId: result.run.runtimeEnvironmentId, + name: sanitizeQueueName(result.run.queue), + }, + }, + }); + + if (!queue) { + this.$.logger.debug( + "RunEngine.dequeueFromMasterQueue(): queue not found, so nacking message", + { + queueMessage: message, + taskRunQueue: result.run.queue, + runtimeEnvironmentId: result.run.runtimeEnvironmentId, + } + ); + + //will auto-retry + const gotRequeued = await this.$.runQueue.nackMessage({ orgId, messageId: runId }); + if (!gotRequeued) { + await this.runAttemptSystem.systemFailure({ + runId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_DEQUEUED_QUEUE_NOT_FOUND", + message: `Tried to dequeue the run but the queue doesn't exist: ${result.run.queue}`, + }, + tx: prisma, + }); + } + + return null; + } + + //update the run + const lockedTaskRun = await prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + lockedAt: new Date(), + lockedById: result.task.id, + lockedToVersionId: result.worker.id, + lockedQueueId: queue.id, + startedAt: result.run.startedAt ?? new Date(), + baseCostInCents: this.options.machines.baseCostInCents, + machinePreset: machinePreset.name, + taskVersion: result.worker.version, + sdkVersion: result.worker.sdkVersion, + cliVersion: result.worker.cliVersion, + maxDurationInSeconds: getMaxDuration( + result.run.maxDurationInSeconds, + result.task.maxDurationInSeconds + ), + maxAttempts: maxAttempts ?? 
undefined, + }, + include: { + runtimeEnvironment: true, + tags: true, + }, + }); + + if (!lockedTaskRun) { + this.$.logger.error("RunEngine.dequeueFromMasterQueue(): Failed to lock task run", { + taskRun: result.run.id, + taskIdentifier: result.run.taskIdentifier, + deployment: result.deployment?.id, + worker: result.worker.id, + task: result.task.id, + runId, + }); + + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return null; + } + + const currentAttemptNumber = lockedTaskRun.attemptNumber ?? 0; + const nextAttemptNumber = currentAttemptNumber + 1; + + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { + run: { + id: runId, + status: snapshot.runStatus, + attemptNumber: lockedTaskRun.attemptNumber, + }, + snapshot: { + executionStatus: "PENDING_EXECUTING", + description: "Run was dequeued for execution", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + checkpointId: snapshot.checkpointId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints, + workerId, + runnerId, + } + ); + + return { + version: "1" as const, + dequeuedAt: new Date(), + snapshot: { + id: newSnapshot.id, + friendlyId: newSnapshot.friendlyId, + executionStatus: newSnapshot.executionStatus, + description: newSnapshot.description, + }, + image: result.deployment?.imageReference ?? undefined, + checkpoint: newSnapshot.checkpoint ?? 
undefined, + completedWaitpoints: snapshot.completedWaitpoints, + backgroundWorker: { + id: result.worker.id, + friendlyId: result.worker.friendlyId, + version: result.worker.version, + }, + deployment: { + id: result.deployment?.id, + friendlyId: result.deployment?.friendlyId, + }, + run: { + id: lockedTaskRun.id, + friendlyId: lockedTaskRun.friendlyId, + isTest: lockedTaskRun.isTest, + machine: machinePreset, + attemptNumber: nextAttemptNumber, + masterQueue: lockedTaskRun.masterQueue, + traceContext: lockedTaskRun.traceContext as Record, + }, + environment: { + id: lockedTaskRun.runtimeEnvironment.id, + type: lockedTaskRun.runtimeEnvironment.type, + }, + organization: { + id: orgId, + }, + project: { + id: lockedTaskRun.projectId, + }, + } satisfies DequeuedMessage; + }); + + if (dequeuedRun !== null) { + dequeuedRuns.push(dequeuedRun); + } + } catch (error) { + this.$.logger.error( + "RunEngine.dequeueFromMasterQueue(): Thrown error while preparing run to be run", + { + error, + runId, + } + ); + + const run = await prisma.taskRun.findFirst({ + where: { id: runId }, + include: { + runtimeEnvironment: true, + }, + }); + + if (!run) { + //this isn't ideal because we're not creating a snapshot… but we can't do much else + this.$.logger.error( + "RunEngine.dequeueFromMasterQueue(): Thrown error, then run not found. 
Nacking.", + { + runId, + orgId, + } + ); + await this.$.runQueue.nackMessage({ orgId, messageId: runId }); + continue; + } + + //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ) + const gotRequeued = await this.runAttemptSystem.tryNackAndRequeue({ + run, + environment: run.runtimeEnvironment, + orgId, + projectId: run.runtimeEnvironment.projectId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_DEQUEUED_MAX_RETRIES", + message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, + }, + tx: prisma, + }); + //we don't need this, but it makes it clear we're in a loop here + continue; + } + } + + return dequeuedRuns; + }, + { + attributes: { consumerId, masterQueue }, + } + ); + } + + async #waitingForDeploy({ + orgId, + runId, + workerId, + runnerId, + reason, + tx, + }: { + orgId: string; + runId: string; + workerId?: string; + runnerId?: string; + reason?: string; + tx?: PrismaClientOrTransaction; + }) { + const prisma = tx ?? this.$.prisma; + + return startSpan( + this.$.tracer, + "#waitingForDeploy", + async (span) => { + return this.$.runLock.lock([runId], 5_000, async (signal) => { + //mark run as waiting for deploy + const run = await prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "WAITING_FOR_DEPLOY", + }, + select: { + id: true, + status: true, + attemptNumber: true, + runtimeEnvironment: { + select: { + id: true, + type: true, + projectId: true, + project: { select: { id: true, organizationId: true } }, + }, + }, + }, + }); + + await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "RUN_CREATED", + description: + reason ?? 
+ "The run doesn't have a background worker, so we're going to ack it for now.", + }, + environmentId: run.runtimeEnvironment.id, + environmentType: run.runtimeEnvironment.type, + projectId: run.runtimeEnvironment.projectId, + organizationId: run.runtimeEnvironment.project.organizationId, + workerId, + runnerId, + }); + + //we ack because when it's deployed it will be requeued + await this.$.runQueue.acknowledgeMessage(orgId, runId); + }); + }, + { + attributes: { + runId, + }, + } + ); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts new file mode 100644 index 0000000000..0ed309792e --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts @@ -0,0 +1,104 @@ +import { + Prisma, + PrismaClientOrTransaction, + TaskRun, + TaskRunExecutionStatus, +} from "@trigger.dev/database"; +import { MinimalAuthenticatedEnvironment } from "../../shared/index.js"; +import { ExecutionSnapshotSystem } from "./executionSnapshotSystem.js"; +import { SystemResources } from "./systems.js"; + +export type EnqueueSystemOptions = { + resources: SystemResources; + executionSnapshotSystem: ExecutionSnapshotSystem; +}; + +export class EnqueueSystem { + private readonly $: SystemResources; + private readonly executionSnapshotSystem: ExecutionSnapshotSystem; + + constructor(private readonly options: EnqueueSystemOptions) { + this.$ = options.resources; + this.executionSnapshotSystem = options.executionSnapshotSystem; + } + + public async enqueueRun({ + run, + env, + timestamp, + tx, + snapshot, + previousSnapshotId, + batchId, + checkpointId, + completedWaitpoints, + workerId, + runnerId, + }: { + run: TaskRun; + env: MinimalAuthenticatedEnvironment; + timestamp: number; + tx?: PrismaClientOrTransaction; + snapshot?: { + status?: Extract; + description?: string; + metadata?: Prisma.JsonValue; + }; + previousSnapshotId?: string; + batchId?: string; + checkpointId?: 
string; + completedWaitpoints?: { + id: string; + index?: number; + }[]; + workerId?: string; + runnerId?: string; + }) { + const prisma = tx ?? this.$.prisma; + + return await this.$.runLock.lock([run.id], 5000, async () => { + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run: run, + snapshot: { + executionStatus: snapshot?.status ?? "QUEUED", + description: snapshot?.description ?? "Run was QUEUED", + metadata: snapshot?.metadata ?? undefined, + }, + previousSnapshotId, + batchId, + environmentId: env.id, + environmentType: env.type, + projectId: env.project.id, + organizationId: env.organization.id, + checkpointId, + completedWaitpoints, + workerId, + runnerId, + }); + + const masterQueues = [run.masterQueue]; + if (run.secondaryMasterQueue) { + masterQueues.push(run.secondaryMasterQueue); + } + + await this.$.runQueue.enqueueMessage({ + env, + masterQueues, + message: { + runId: run.id, + taskIdentifier: run.taskIdentifier, + orgId: env.organization.id, + projectId: env.project.id, + environmentId: env.id, + environmentType: env.type, + queue: run.queue, + concurrencyKey: run.concurrencyKey ?? 
undefined, + timestamp, + attempt: 0, + }, + }); + + return newSnapshot; + }); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts new file mode 100644 index 0000000000..25320697b0 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/executionSnapshotSystem.ts @@ -0,0 +1,335 @@ +import { CompletedWaitpoint, ExecutionResult } from "@trigger.dev/core/v3"; +import { BatchId, RunId, SnapshotId } from "@trigger.dev/core/v3/isomorphic"; +import { + Prisma, + PrismaClientOrTransaction, + RuntimeEnvironmentType, + TaskRunCheckpoint, + TaskRunExecutionSnapshot, + TaskRunExecutionStatus, + TaskRunStatus, +} from "@trigger.dev/database"; +import { HeartbeatTimeouts } from "../types.js"; +import { SystemResources } from "./systems.js"; + +export type ExecutionSnapshotSystemOptions = { + resources: SystemResources; + heartbeatTimeouts: HeartbeatTimeouts; +}; + +export interface LatestExecutionSnapshot extends TaskRunExecutionSnapshot { + friendlyId: string; + runFriendlyId: string; + checkpoint: TaskRunCheckpoint | null; + completedWaitpoints: CompletedWaitpoint[]; +} + +/* Gets the most recent valid snapshot for a run */ +export async function getLatestExecutionSnapshot( + prisma: PrismaClientOrTransaction, + runId: string +): Promise { + const snapshot = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { runId, isValid: true }, + include: { + completedWaitpoints: true, + checkpoint: true, + }, + orderBy: { createdAt: "desc" }, + }); + + if (!snapshot) { + throw new Error(`No execution snapshot found for TaskRun ${runId}`); + } + + return { + ...snapshot, + friendlyId: SnapshotId.toFriendlyId(snapshot.id), + runFriendlyId: RunId.toFriendlyId(snapshot.runId), + completedWaitpoints: snapshot.completedWaitpoints.flatMap((w) => { + //get all indexes of the waitpoint in the completedWaitpointOrder + //we do this because the same run can be 
in a batch multiple times (i.e. same idempotencyKey) + let indexes: (number | undefined)[] = []; + for (let i = 0; i < snapshot.completedWaitpointOrder.length; i++) { + if (snapshot.completedWaitpointOrder[i] === w.id) { + indexes.push(i); + } + } + + if (indexes.length === 0) { + indexes.push(undefined); + } + + return indexes.map((index) => { + return { + id: w.id, + index: index === -1 ? undefined : index, + friendlyId: w.friendlyId, + type: w.type, + completedAt: w.completedAt ?? new Date(), + idempotencyKey: + w.userProvidedIdempotencyKey && !w.inactiveIdempotencyKey + ? w.idempotencyKey + : undefined, + completedByTaskRun: w.completedByTaskRunId + ? { + id: w.completedByTaskRunId, + friendlyId: RunId.toFriendlyId(w.completedByTaskRunId), + batch: snapshot.batchId + ? { + id: snapshot.batchId, + friendlyId: BatchId.toFriendlyId(snapshot.batchId), + } + : undefined, + } + : undefined, + completedAfter: w.completedAfter ?? undefined, + completedByBatch: w.completedByBatchId + ? { + id: w.completedByBatchId, + friendlyId: BatchId.toFriendlyId(w.completedByBatchId), + } + : undefined, + output: w.output ?? undefined, + outputType: w.outputType, + outputIsError: w.outputIsError, + } satisfies CompletedWaitpoint; + }); + }), + }; +} + +export async function getExecutionSnapshotCompletedWaitpoints( + prisma: PrismaClientOrTransaction, + snapshotId: string +) { + const waitpoints = await prisma.taskRunExecutionSnapshot.findFirst({ + where: { id: snapshotId }, + include: { + completedWaitpoints: true, + }, + }); + + //deduplicate waitpoints + const waitpointIds = new Set(); + return ( + waitpoints?.completedWaitpoints.filter((waitpoint) => { + if (waitpointIds.has(waitpoint.id)) { + return false; + } else { + waitpointIds.add(waitpoint.id); + return true; + } + }) ?? 
[] + ); +} + +export function executionResultFromSnapshot(snapshot: TaskRunExecutionSnapshot): ExecutionResult { + return { + snapshot: { + id: snapshot.id, + friendlyId: SnapshotId.toFriendlyId(snapshot.id), + executionStatus: snapshot.executionStatus, + description: snapshot.description, + }, + run: { + id: snapshot.runId, + friendlyId: RunId.toFriendlyId(snapshot.runId), + status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber, + }, + }; +} + +export class ExecutionSnapshotSystem { + private readonly $: SystemResources; + private readonly heartbeatTimeouts: HeartbeatTimeouts; + + constructor(private readonly options: ExecutionSnapshotSystemOptions) { + this.$ = options.resources; + this.heartbeatTimeouts = options.heartbeatTimeouts; + } + + public async createExecutionSnapshot( + prisma: PrismaClientOrTransaction, + { + run, + snapshot, + previousSnapshotId, + batchId, + environmentId, + environmentType, + projectId, + organizationId, + checkpointId, + workerId, + runnerId, + completedWaitpoints, + error, + }: { + run: { id: string; status: TaskRunStatus; attemptNumber?: number | null }; + snapshot: { + executionStatus: TaskRunExecutionStatus; + description: string; + metadata?: Prisma.JsonValue; + }; + previousSnapshotId?: string; + batchId?: string; + environmentId: string; + environmentType: RuntimeEnvironmentType; + projectId: string; + organizationId: string; + checkpointId?: string; + workerId?: string; + runnerId?: string; + completedWaitpoints?: { + id: string; + index?: number; + }[]; + error?: string; + } + ) { + const newSnapshot = await prisma.taskRunExecutionSnapshot.create({ + data: { + engine: "V2", + executionStatus: snapshot.executionStatus, + description: snapshot.description, + previousSnapshotId, + runId: run.id, + runStatus: run.status, + attemptNumber: run.attemptNumber ?? 
undefined, + batchId, + environmentId, + environmentType, + projectId, + organizationId, + checkpointId, + workerId, + runnerId, + metadata: snapshot.metadata ?? undefined, + completedWaitpoints: { + connect: completedWaitpoints?.map((w) => ({ id: w.id })), + }, + completedWaitpointOrder: completedWaitpoints + ?.filter((c) => c.index !== undefined) + .sort((a, b) => a.index! - b.index!) + .map((w) => w.id), + isValid: error ? false : true, + error, + }, + include: { + checkpoint: true, + }, + }); + + if (!error) { + //set heartbeat (if relevant) + const intervalMs = this.#getHeartbeatIntervalMs(newSnapshot.executionStatus); + if (intervalMs !== null) { + await this.$.worker.enqueue({ + id: `heartbeatSnapshot.${run.id}`, + job: "heartbeatSnapshot", + payload: { snapshotId: newSnapshot.id, runId: run.id }, + availableAt: new Date(Date.now() + intervalMs), + }); + } + } + + this.$.eventBus.emit("executionSnapshotCreated", { + time: newSnapshot.createdAt, + run: { + id: newSnapshot.runId, + }, + snapshot: { + ...newSnapshot, + completedWaitpointIds: completedWaitpoints?.map((w) => w.id) ?? [], + }, + }); + + return { + ...newSnapshot, + friendlyId: SnapshotId.toFriendlyId(newSnapshot.id), + runFriendlyId: RunId.toFriendlyId(newSnapshot.runId), + }; + } + + public async heartbeatRun({ + runId, + snapshotId, + workerId, + runnerId, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? 
this.$.prisma; + + //we don't need to acquire a run lock for any of this, it's not critical if it happens on an older version + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + if (latestSnapshot.id !== snapshotId) { + this.$.logger.log("heartbeatRun: no longer the latest snapshot, stopping the heartbeat.", { + runId, + snapshotId, + latestSnapshot, + workerId, + runnerId, + }); + + await this.$.worker.ack(`heartbeatSnapshot.${runId}`); + return executionResultFromSnapshot(latestSnapshot); + } + + if (latestSnapshot.workerId !== workerId) { + this.$.logger.debug("heartbeatRun: worker ID does not match the latest snapshot", { + runId, + snapshotId, + latestSnapshot, + workerId, + runnerId, + }); + } + + //update the snapshot heartbeat time + await prisma.taskRunExecutionSnapshot.update({ + where: { id: latestSnapshot.id }, + data: { + lastHeartbeatAt: new Date(), + }, + }); + + //extending the heartbeat + const intervalMs = this.#getHeartbeatIntervalMs(latestSnapshot.executionStatus); + if (intervalMs !== null) { + await this.$.worker.reschedule( + `heartbeatSnapshot.${runId}`, + new Date(Date.now() + intervalMs) + ); + } + + return executionResultFromSnapshot(latestSnapshot); + } + + #getHeartbeatIntervalMs(status: TaskRunExecutionStatus): number | null { + switch (status) { + case "PENDING_EXECUTING": { + return this.heartbeatTimeouts.PENDING_EXECUTING; + } + case "PENDING_CANCEL": { + return this.heartbeatTimeouts.PENDING_CANCEL; + } + case "EXECUTING": { + return this.heartbeatTimeouts.EXECUTING; + } + case "EXECUTING_WITH_WAITPOINTS": { + return this.heartbeatTimeouts.EXECUTING_WITH_WAITPOINTS; + } + default: { + return null; + } + } + } +} diff --git a/internal-packages/run-engine/src/engine/systems/releaseConcurrencySystem.ts b/internal-packages/run-engine/src/engine/systems/releaseConcurrencySystem.ts new file mode 100644 index 0000000000..bac9be1412 --- /dev/null +++ 
b/internal-packages/run-engine/src/engine/systems/releaseConcurrencySystem.ts @@ -0,0 +1,221 @@ +import { RuntimeEnvironment, TaskRunExecutionSnapshot } from "@trigger.dev/database"; +import { SystemResources } from "./systems.js"; +import { getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { canReleaseConcurrency } from "../statuses.js"; +import { z } from "zod"; +import { + ReleaseConcurrencyQueueOptions, + ReleaseConcurrencyTokenBucketQueue, +} from "../releaseConcurrencyTokenBucketQueue.js"; + +const ReleaseConcurrencyMetadata = z.object({ + releaseConcurrency: z.boolean().optional(), +}); + +type ReleaseConcurrencyMetadata = z.infer; + +export type ReleaseConcurrencySystemOptions = { + resources: SystemResources; + queueOptions?: ReleaseConcurrencyQueueOptions<{ + orgId: string; + projectId: string; + envId: string; + }>; +}; + +export class ReleaseConcurrencySystem { + private readonly $: SystemResources; + releaseConcurrencyQueue?: ReleaseConcurrencyTokenBucketQueue<{ + orgId: string; + projectId: string; + envId: string; + }>; + + constructor(private readonly options: ReleaseConcurrencySystemOptions) { + this.$ = options.resources; + + if (options.queueOptions) { + this.releaseConcurrencyQueue = new ReleaseConcurrencyTokenBucketQueue(options.queueOptions); + } + } + + public async consumeToken( + descriptor: { orgId: string; projectId: string; envId: string }, + releaserId: string + ) { + if (!this.releaseConcurrencyQueue) { + return; + } + + await this.releaseConcurrencyQueue.consumeToken(descriptor, releaserId); + } + + public async returnToken( + descriptor: { orgId: string; projectId: string; envId: string }, + releaserId: string + ) { + if (!this.releaseConcurrencyQueue) { + return; + } + + await this.releaseConcurrencyQueue.returnToken(descriptor, releaserId); + } + + public async quit() { + if (!this.releaseConcurrencyQueue) { + return; + } + + await this.releaseConcurrencyQueue.quit(); + } + + public async 
checkpointCreatedOnEnvironment(environment: RuntimeEnvironment) { + if (!this.releaseConcurrencyQueue) { + return; + } + + await this.releaseConcurrencyQueue.refillTokens( + { + orgId: environment.organizationId, + projectId: environment.projectId, + envId: environment.id, + }, + 1 + ); + } + + public async releaseConcurrencyForSnapshot(snapshot: TaskRunExecutionSnapshot) { + if (!this.releaseConcurrencyQueue) { + return; + } + + // Go ahead and release concurrency immediately if the run is in a development environment + if (snapshot.environmentType === "DEVELOPMENT") { + return await this.executeReleaseConcurrencyForSnapshot(snapshot.id); + } + + await this.releaseConcurrencyQueue.attemptToRelease( + { + orgId: snapshot.organizationId, + projectId: snapshot.projectId, + envId: snapshot.environmentId, + }, + snapshot.id + ); + } + + public async executeReleaseConcurrencyForSnapshot(snapshotId: string) { + if (!this.releaseConcurrencyQueue) { + return; + } + + this.$.logger.debug("Executing released concurrency", { + snapshotId, + }); + + // Fetch the snapshot + const snapshot = await this.$.prisma.taskRunExecutionSnapshot.findFirst({ + where: { id: snapshotId }, + select: { + id: true, + previousSnapshotId: true, + executionStatus: true, + organizationId: true, + metadata: true, + runId: true, + run: { + select: { + lockedQueueId: true, + }, + }, + }, + }); + + if (!snapshot) { + this.$.logger.error("Snapshot not found", { + snapshotId, + }); + + return; + } + + // - Runlock the run + // - Get latest snapshot + // - If the run is non suspended or going to be, then bail + // - If the run is suspended or going to be, then release the concurrency + await this.$.runLock.lock([snapshot.runId], 5_000, async () => { + const latestSnapshot = await getLatestExecutionSnapshot(this.$.prisma, snapshot.runId); + + const isValidSnapshot = + latestSnapshot.id === snapshot.id || + // Case 2: The provided snapshotId matches the previous snapshot + // AND we're in SUSPENDED state 
(which is valid) + (latestSnapshot.previousSnapshotId === snapshot.id && + latestSnapshot.executionStatus === "SUSPENDED"); + + if (!isValidSnapshot) { + this.$.logger.error("Tried to release concurrency on an invalid snapshot", { + latestSnapshot, + snapshot, + }); + + return; + } + + if (!canReleaseConcurrency(latestSnapshot.executionStatus)) { + this.$.logger.debug("Run is not in a state to release concurrency", { + runId: snapshot.runId, + snapshot: latestSnapshot, + }); + + return; + } + + const metadata = this.#parseMetadata(snapshot.metadata); + + if (typeof metadata.releaseConcurrency === "boolean") { + if (metadata.releaseConcurrency) { + return await this.$.runQueue.releaseAllConcurrency( + snapshot.organizationId, + snapshot.runId + ); + } + + return await this.$.runQueue.releaseEnvConcurrency(snapshot.organizationId, snapshot.runId); + } + + // Get the locked queue + const taskQueue = snapshot.run.lockedQueueId + ? await this.$.prisma.taskQueue.findFirst({ + where: { + id: snapshot.run.lockedQueueId, + }, + }) + : undefined; + + if ( + taskQueue && + (typeof taskQueue.concurrencyLimit === "undefined" || + taskQueue.releaseConcurrencyOnWaitpoint) + ) { + return await this.$.runQueue.releaseAllConcurrency(snapshot.organizationId, snapshot.runId); + } + + return await this.$.runQueue.releaseEnvConcurrency(snapshot.organizationId, snapshot.runId); + }); + } + + #parseMetadata(metadata?: unknown): ReleaseConcurrencyMetadata { + if (!metadata) { + return {}; + } + + const result = ReleaseConcurrencyMetadata.safeParse(metadata); + + if (!result.success) { + return {}; + } + + return result.data; + } +} diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts new file mode 100644 index 0000000000..0ba498d773 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -0,0 +1,1219 @@ +import { startSpan } from "@internal/tracing"; +import 
{ + CompleteRunAttemptResult, + ExecutionResult, + StartRunAttemptResult, + TaskRunError, + TaskRunExecution, + TaskRunExecutionResult, + TaskRunFailedExecutionResult, + TaskRunInternalError, + TaskRunSuccessfulExecutionResult, +} from "@trigger.dev/core/v3/schemas"; +import { parsePacket } from "@trigger.dev/core/v3/utils/ioSerialization"; +import { + $transaction, + PrismaClientOrTransaction, + RuntimeEnvironmentType, + TaskRun, +} from "@trigger.dev/database"; +import { MAX_TASK_RUN_ATTEMPTS } from "../consts.js"; +import { runStatusFromError, ServiceValidationError } from "../errors.js"; +import { sendNotificationToWorker } from "../eventBus.js"; +import { getMachinePreset } from "../machinePresets.js"; +import { retryOutcomeFromCompletion } from "../retrying.js"; +import { isExecuting } from "../statuses.js"; +import { RunEngineOptions } from "../types.js"; +import { BatchSystem } from "./batchSystem.js"; +import { + executionResultFromSnapshot, + ExecutionSnapshotSystem, + getLatestExecutionSnapshot, +} from "./executionSnapshotSystem.js"; +import { SystemResources } from "./systems.js"; +import { WaitpointSystem } from "./waitpointSystem.js"; + +export type RunAttemptSystemOptions = { + resources: SystemResources; + executionSnapshotSystem: ExecutionSnapshotSystem; + batchSystem: BatchSystem; + waitpointSystem: WaitpointSystem; + retryWarmStartThresholdMs?: number; + machines: RunEngineOptions["machines"]; +}; + +export class RunAttemptSystem { + private readonly $: SystemResources; + private readonly executionSnapshotSystem: ExecutionSnapshotSystem; + private readonly batchSystem: BatchSystem; + private readonly waitpointSystem: WaitpointSystem; + + constructor(private readonly options: RunAttemptSystemOptions) { + this.$ = options.resources; + this.executionSnapshotSystem = options.executionSnapshotSystem; + this.batchSystem = options.batchSystem; + this.waitpointSystem = options.waitpointSystem; + } + + public async startRunAttempt({ + runId, + 
snapshotId, + workerId, + runnerId, + isWarmStart, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + isWarmStart?: boolean; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.$.prisma; + + return startSpan( + this.$.tracer, + "startRunAttempt", + async (span) => { + return this.$.runLock.lock([runId], 5000, async () => { + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (latestSnapshot.id !== snapshotId) { + //if there is a big delay between the snapshot and the attempt, the snapshot might have changed + //we just want to log because elsewhere it should have been put back into a state where it can be attempted + this.$.logger.warn( + "RunEngine.createRunAttempt(): snapshot has changed since the attempt was created, ignoring." + ); + throw new ServiceValidationError("Snapshot changed", 409); + } + + const environment = await this.#getAuthenticatedEnvironmentFromRun(runId, prisma); + if (!environment) { + throw new ServiceValidationError("Environment not found", 404); + } + + const taskRun = await prisma.taskRun.findFirst({ + where: { + id: runId, + }, + include: { + tags: true, + lockedBy: { + include: { + worker: { + select: { + id: true, + version: true, + sdkVersion: true, + cliVersion: true, + supportsLazyAttempts: true, + }, + }, + }, + }, + batchItems: { + include: { + batchTaskRun: true, + }, + }, + }, + }); + + this.$.logger.debug("Creating a task run attempt", { taskRun }); + + if (!taskRun) { + throw new ServiceValidationError("Task run not found", 404); + } + + span.setAttribute("projectId", taskRun.projectId); + span.setAttribute("environmentId", taskRun.runtimeEnvironmentId); + span.setAttribute("taskRunId", taskRun.id); + span.setAttribute("taskRunFriendlyId", taskRun.friendlyId); + + if (taskRun.status === "CANCELED") { + throw new ServiceValidationError("Task run is cancelled", 400); + } + + if (!taskRun.lockedBy) { + throw new 
ServiceValidationError("Task run is not locked", 400); + } + + const queue = await prisma.taskQueue.findUnique({ + where: { + runtimeEnvironmentId_name: { + runtimeEnvironmentId: environment.id, + name: taskRun.queue, + }, + }, + }); + + if (!queue) { + throw new ServiceValidationError("Queue not found", 404); + } + + //increment the attempt number (start at 1) + const nextAttemptNumber = (taskRun.attemptNumber ?? 0) + 1; + + if (nextAttemptNumber > MAX_TASK_RUN_ATTEMPTS) { + await this.attemptFailed({ + runId: taskRun.id, + snapshotId, + completion: { + ok: false, + id: taskRun.id, + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_CRASHED", + message: "Max attempts reached.", + }, + }, + tx: prisma, + }); + throw new ServiceValidationError("Max attempts reached", 400); + } + + this.$.eventBus.emit("runAttemptStarted", { + time: new Date(), + run: { + id: taskRun.id, + attemptNumber: nextAttemptNumber, + baseCostInCents: taskRun.baseCostInCents, + }, + organization: { + id: environment.organization.id, + }, + }); + + const result = await $transaction( + prisma, + async (tx) => { + const run = await tx.taskRun.update({ + where: { + id: taskRun.id, + }, + data: { + status: "EXECUTING", + attemptNumber: nextAttemptNumber, + executedAt: taskRun.attemptNumber === null ? new Date() : undefined, + }, + include: { + tags: true, + lockedBy: { + include: { worker: true }, + }, + }, + }); + + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(tx, { + run, + snapshot: { + executionStatus: "EXECUTING", + description: `Attempt created, starting execution${ + isWarmStart ? 
" (warm start)" : "" + }`, + }, + previousSnapshotId: latestSnapshot.id, + environmentId: latestSnapshot.environmentId, + environmentType: latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + workerId, + runnerId, + }); + + if (taskRun.ttl) { + //don't expire the run, it's going to execute + await this.$.worker.ack(`expireRun:${taskRun.id}`); + } + + return { run, snapshot: newSnapshot }; + }, + (error) => { + this.$.logger.error("RunEngine.createRunAttempt(): prisma.$transaction error", { + code: error.code, + meta: error.meta, + stack: error.stack, + message: error.message, + name: error.name, + }); + throw new ServiceValidationError( + "Failed to update task run and execution snapshot", + 500 + ); + } + ); + + if (!result) { + this.$.logger.error("RunEngine.createRunAttempt(): failed to create task run attempt", { + runId: taskRun.id, + nextAttemptNumber, + }); + throw new ServiceValidationError("Failed to create task run attempt", 500); + } + + const { run, snapshot } = result; + + const machinePreset = getMachinePreset({ + machines: this.options.machines.machines, + defaultMachine: this.options.machines.defaultMachine, + config: taskRun.lockedBy.machineConfig ?? {}, + run: taskRun, + }); + + const metadata = await parsePacket({ + data: taskRun.metadata ?? 
undefined, + dataType: taskRun.metadataType, + }); + + const execution: TaskRunExecution = { + task: { + id: run.lockedBy!.slug, + filePath: run.lockedBy!.filePath, + exportName: run.lockedBy!.exportName, + }, + attempt: { + number: nextAttemptNumber, + startedAt: latestSnapshot.updatedAt, + /** @deprecated */ + id: "deprecated", + /** @deprecated */ + backgroundWorkerId: "deprecated", + /** @deprecated */ + backgroundWorkerTaskId: "deprecated", + /** @deprecated */ + status: "deprecated", + }, + run: { + id: run.friendlyId, + payload: run.payload, + payloadType: run.payloadType, + createdAt: run.createdAt, + tags: run.tags.map((tag) => tag.name), + isTest: run.isTest, + idempotencyKey: run.idempotencyKey ?? undefined, + startedAt: run.startedAt ?? run.createdAt, + maxAttempts: run.maxAttempts ?? undefined, + version: run.lockedBy!.worker.version, + metadata, + maxDuration: run.maxDurationInSeconds ?? undefined, + /** @deprecated */ + context: undefined, + /** @deprecated */ + durationMs: run.usageDurationMs, + /** @deprecated */ + costInCents: run.costInCents, + /** @deprecated */ + baseCostInCents: run.baseCostInCents, + traceContext: run.traceContext as Record, + priority: run.priorityMs === 0 ? undefined : run.priorityMs / 1_000, + }, + queue: { + id: queue.friendlyId, + name: queue.name, + }, + environment: { + id: environment.id, + slug: environment.slug, + type: environment.type, + }, + organization: { + id: environment.organization.id, + slug: environment.organization.slug, + name: environment.organization.title, + }, + project: { + id: environment.project.id, + ref: environment.project.externalRef, + slug: environment.project.slug, + name: environment.project.name, + }, + batch: + taskRun.batchItems[0] && taskRun.batchItems[0].batchTaskRun + ? 
{ id: taskRun.batchItems[0].batchTaskRun.friendlyId } + : undefined, + machine: machinePreset, + }; + + return { run, snapshot, execution }; + }); + }, + { + attributes: { runId, snapshotId }, + } + ); + } + + public async completeRunAttempt({ + runId, + snapshotId, + completion, + workerId, + runnerId, + }: { + runId: string; + snapshotId: string; + completion: TaskRunExecutionResult; + workerId?: string; + runnerId?: string; + }): Promise { + if (completion.metadata) { + this.$.eventBus.emit("runMetadataUpdated", { + time: new Date(), + run: { + id: runId, + metadata: completion.metadata, + }, + }); + } + + switch (completion.ok) { + case true: { + return this.attemptSucceeded({ + runId, + snapshotId, + completion, + tx: this.$.prisma, + workerId, + runnerId, + }); + } + case false: { + return this.attemptFailed({ + runId, + snapshotId, + completion, + tx: this.$.prisma, + workerId, + runnerId, + }); + } + } + } + + public async attemptSucceeded({ + runId, + snapshotId, + completion, + tx, + workerId, + runnerId, + }: { + runId: string; + snapshotId: string; + completion: TaskRunSuccessfulExecutionResult; + tx: PrismaClientOrTransaction; + workerId?: string; + runnerId?: string; + }): Promise { + const prisma = tx ?? 
this.$.prisma; + + return startSpan( + this.$.tracer, + "#completeRunAttemptSuccess", + async (span) => { + return this.$.runLock.lock([runId], 5_000, async (signal) => { + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (latestSnapshot.id !== snapshotId) { + throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); + } + + span.setAttribute("completionStatus", completion.ok); + + const completedAt = new Date(); + + const run = await prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "COMPLETED_SUCCESSFULLY", + completedAt, + output: completion.output, + outputType: completion.outputType, + executionSnapshots: { + create: { + executionStatus: "FINISHED", + description: "Task completed successfully", + runStatus: "COMPLETED_SUCCESSFULLY", + attemptNumber: latestSnapshot.attemptNumber, + environmentId: latestSnapshot.environmentId, + environmentType: latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + workerId, + runnerId, + }, + }, + }, + select: { + id: true, + friendlyId: true, + status: true, + attemptNumber: true, + spanId: true, + associatedWaitpoint: { + select: { + id: true, + }, + }, + project: { + select: { + organizationId: true, + }, + }, + batchId: true, + createdAt: true, + completedAt: true, + taskEventStore: true, + parentTaskRunId: true, + }, + }); + const newSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + await this.$.runQueue.acknowledgeMessage(run.project.organizationId, runId); + + // We need to manually emit this as we created the final snapshot as part of the task run update + this.$.eventBus.emit("executionSnapshotCreated", { + time: newSnapshot.createdAt, + run: { + id: newSnapshot.runId, + }, + snapshot: { + ...newSnapshot, + completedWaitpointIds: newSnapshot.completedWaitpoints.map((wp) => wp.id), + }, + }); + + if (!run.associatedWaitpoint) { + throw new 
ServiceValidationError("No associated waitpoint found", 400); + } + + await this.waitpointSystem.completeWaitpoint({ + id: run.associatedWaitpoint.id, + output: completion.output + ? { value: completion.output, type: completion.outputType, isError: false } + : undefined, + }); + + this.$.eventBus.emit("runSucceeded", { + time: completedAt, + run: { + id: runId, + spanId: run.spanId, + output: completion.output, + outputType: completion.outputType, + createdAt: run.createdAt, + completedAt: run.completedAt, + taskEventStore: run.taskEventStore, + }, + }); + + await this.#finalizeRun(run); + + return { + attemptStatus: "RUN_FINISHED", + snapshot: newSnapshot, + run, + }; + }); + }, + { + attributes: { runId, snapshotId }, + } + ); + } + + public async attemptFailed({ + runId, + snapshotId, + workerId, + runnerId, + completion, + forceRequeue, + tx, + }: { + runId: string; + snapshotId: string; + workerId?: string; + runnerId?: string; + completion: TaskRunFailedExecutionResult; + forceRequeue?: boolean; + tx: PrismaClientOrTransaction; + }): Promise { + const prisma = this.$.prisma; + + return startSpan( + this.$.tracer, + "completeRunAttemptFailure", + async (span) => { + return this.$.runLock.lock([runId], 5_000, async (signal) => { + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (latestSnapshot.id !== snapshotId) { + throw new ServiceValidationError("Snapshot ID doesn't match the latest snapshot", 400); + } + + span.setAttribute("completionStatus", completion.ok); + + //remove waitpoints blocking the run + const deletedCount = await this.waitpointSystem.clearBlockingWaitpoints({ runId, tx }); + if (deletedCount > 0) { + this.$.logger.debug("Cleared blocking waitpoints", { runId, deletedCount }); + } + + const failedAt = new Date(); + + const retryResult = await retryOutcomeFromCompletion(prisma, { + runId, + error: completion.error, + retryUsingQueue: forceRequeue ?? 
false, + retrySettings: completion.retry, + attemptNumber: latestSnapshot.attemptNumber, + }); + + // Force requeue means it was crashed so the attempt span needs to be closed + if (forceRequeue) { + const minimalRun = await prisma.taskRun.findFirst({ + where: { + id: runId, + }, + select: { + status: true, + spanId: true, + maxAttempts: true, + runtimeEnvironment: { + select: { + organizationId: true, + }, + }, + taskEventStore: true, + createdAt: true, + completedAt: true, + }, + }); + + if (!minimalRun) { + throw new ServiceValidationError("Run not found", 404); + } + + this.$.eventBus.emit("runAttemptFailed", { + time: failedAt, + run: { + id: runId, + status: minimalRun.status, + spanId: minimalRun.spanId, + error: completion.error, + attemptNumber: latestSnapshot.attemptNumber ?? 0, + createdAt: minimalRun.createdAt, + completedAt: minimalRun.completedAt, + taskEventStore: minimalRun.taskEventStore, + }, + }); + } + + switch (retryResult.outcome) { + case "cancel_run": { + const result = await this.cancelRun({ + runId, + completedAt: failedAt, + reason: retryResult.reason, + finalizeRun: true, + tx: prisma, + }); + return { + attemptStatus: + result.snapshot.executionStatus === "PENDING_CANCEL" + ? "RUN_PENDING_CANCEL" + : "RUN_FINISHED", + ...result, + }; + } + case "fail_run": { + return await this.#permanentlyFailRun({ + runId, + snapshotId, + failedAt, + error: retryResult.sanitizedError, + workerId, + runnerId, + }); + } + case "retry": { + const retryAt = new Date(retryResult.settings.timestamp); + + const run = await prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + status: "RETRYING_AFTER_FAILURE", + machinePreset: retryResult.machine, + }, + include: { + runtimeEnvironment: { + include: { + project: true, + organization: true, + orgMember: true, + }, + }, + }, + }); + + const nextAttemptNumber = + latestSnapshot.attemptNumber === null ? 
1 : latestSnapshot.attemptNumber + 1; + + if (retryResult.wasOOMError) { + this.$.eventBus.emit("runAttemptFailed", { + time: failedAt, + run: { + id: runId, + status: run.status, + spanId: run.spanId, + error: completion.error, + attemptNumber: latestSnapshot.attemptNumber ?? 0, + createdAt: run.createdAt, + completedAt: run.completedAt, + taskEventStore: run.taskEventStore, + }, + }); + } + + this.$.eventBus.emit("runRetryScheduled", { + time: failedAt, + run: { + id: run.id, + friendlyId: run.friendlyId, + attemptNumber: nextAttemptNumber, + queue: run.queue, + taskIdentifier: run.taskIdentifier, + traceContext: run.traceContext as Record, + baseCostInCents: run.baseCostInCents, + spanId: run.spanId, + }, + organization: { + id: run.runtimeEnvironment.organizationId, + }, + environment: run.runtimeEnvironment, + retryAt, + }); + + //if it's a long delay and we support checkpointing, put it back in the queue + if ( + forceRequeue || + retryResult.method === "queue" || + (this.options.retryWarmStartThresholdMs !== undefined && + retryResult.settings.delay >= this.options.retryWarmStartThresholdMs) + ) { + //we nack the message, requeuing it for later + const nackResult = await this.tryNackAndRequeue({ + run, + environment: run.runtimeEnvironment, + orgId: run.runtimeEnvironment.organizationId, + projectId: run.runtimeEnvironment.project.id, + timestamp: retryAt.getTime(), + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_DEQUEUED_MAX_RETRIES", + message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, + }, + tx: prisma, + }); + + if (!nackResult.wasRequeued) { + return { + attemptStatus: "RUN_FINISHED", + ...nackResult, + }; + } else { + return { attemptStatus: "RETRY_QUEUED", ...nackResult }; + } + } + + //it will continue running because the retry delay is short + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { + run, + snapshot: { + executionStatus: 
"PENDING_EXECUTING", + description: "Attempt failed with a short delay, starting a new attempt", + }, + previousSnapshotId: latestSnapshot.id, + environmentId: latestSnapshot.environmentId, + environmentType: latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + workerId, + runnerId, + } + ); + + //the worker can fetch the latest snapshot and should create a new attempt + await sendNotificationToWorker({ + runId, + snapshot: newSnapshot, + eventBus: this.$.eventBus, + }); + + return { + attemptStatus: "RETRY_IMMEDIATELY", + ...executionResultFromSnapshot(newSnapshot), + }; + } + } + }); + }, + { + attributes: { runId, snapshotId }, + } + ); + } + + public async systemFailure({ + runId, + error, + tx, + }: { + runId: string; + error: TaskRunInternalError; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.$.prisma; + + return startSpan( + this.$.tracer, + "systemFailure", + async (span) => { + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + //already finished + if (latestSnapshot.executionStatus === "FINISHED") { + //todo check run is in the correct state + return { + attemptStatus: "RUN_FINISHED", + snapshot: latestSnapshot, + run: { + id: runId, + friendlyId: latestSnapshot.runFriendlyId, + status: latestSnapshot.runStatus, + attemptNumber: latestSnapshot.attemptNumber, + }, + }; + } + + const result = await this.attemptFailed({ + runId, + snapshotId: latestSnapshot.id, + completion: { + ok: false, + id: runId, + error, + }, + tx: prisma, + }); + + return result; + }, + { + attributes: { + runId, + }, + } + ); + } + + public async tryNackAndRequeue({ + run, + environment, + orgId, + projectId, + timestamp, + error, + workerId, + runnerId, + tx, + }: { + run: TaskRun; + environment: { + id: string; + type: RuntimeEnvironmentType; + }; + orgId: string; + projectId: string; + timestamp?: number; + error: TaskRunInternalError; + workerId?: string; 
+ runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise<{ wasRequeued: boolean } & ExecutionResult> { + const prisma = tx ?? this.$.prisma; + + return await this.$.runLock.lock([run.id], 5000, async (signal) => { + //we nack the message, this allows another work to pick up the run + const gotRequeued = await this.$.runQueue.nackMessage({ + orgId, + messageId: run.id, + retryAt: timestamp, + }); + + if (!gotRequeued) { + const result = await this.systemFailure({ + runId: run.id, + error, + tx: prisma, + }); + return { wasRequeued: false, ...result }; + } + + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run: run, + snapshot: { + executionStatus: "QUEUED", + description: "Requeued the run after a failure", + }, + environmentId: environment.id, + environmentType: environment.type, + projectId: projectId, + organizationId: orgId, + workerId, + runnerId, + }); + + return { + wasRequeued: true, + snapshot: { + id: newSnapshot.id, + friendlyId: newSnapshot.friendlyId, + executionStatus: newSnapshot.executionStatus, + description: newSnapshot.description, + }, + run: { + id: newSnapshot.runId, + friendlyId: newSnapshot.runFriendlyId, + status: newSnapshot.runStatus, + attemptNumber: newSnapshot.attemptNumber, + }, + }; + }); + } + + /** + Call this to cancel a run. + If the run is in-progress it will change it's state to PENDING_CANCEL and notify the worker. + If the run is not in-progress it will finish it. + You can pass `finalizeRun` in if you know it's no longer running, e.g. the worker has messaged to say it's done. + */ + async cancelRun({ + runId, + workerId, + runnerId, + completedAt, + reason, + finalizeRun, + tx, + }: { + runId: string; + workerId?: string; + runnerId?: string; + completedAt?: Date; + reason?: string; + finalizeRun?: boolean; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.$.prisma; + reason = reason ?? 
"Cancelled by user"; + + return startSpan(this.$.tracer, "cancelRun", async (span) => { + return this.$.runLock.lock([runId], 5_000, async (signal) => { + const latestSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + //already finished, do nothing + if (latestSnapshot.executionStatus === "FINISHED") { + return executionResultFromSnapshot(latestSnapshot); + } + + //is pending cancellation and we're not finalizing, alert the worker again + if (latestSnapshot.executionStatus === "PENDING_CANCEL" && !finalizeRun) { + await sendNotificationToWorker({ + runId, + snapshot: latestSnapshot, + eventBus: this.$.eventBus, + }); + return executionResultFromSnapshot(latestSnapshot); + } + + //set the run to cancelled immediately + const error: TaskRunError = { + type: "STRING_ERROR", + raw: reason, + }; + + const run = await prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "CANCELED", + completedAt: finalizeRun ? completedAt ?? new Date() : completedAt, + error, + }, + select: { + id: true, + friendlyId: true, + status: true, + attemptNumber: true, + spanId: true, + batchId: true, + createdAt: true, + completedAt: true, + taskEventStore: true, + parentTaskRunId: true, + runtimeEnvironment: { + select: { + organizationId: true, + }, + }, + associatedWaitpoint: { + select: { + id: true, + }, + }, + childRuns: { + select: { + id: true, + }, + }, + }, + }); + + //remove it from the queue and release concurrency + await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); + + //if executing, we need to message the worker to cancel the run and put it into `PENDING_CANCEL` status + if (isExecuting(latestSnapshot.executionStatus)) { + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "PENDING_CANCEL", + description: "Run was cancelled", + }, + previousSnapshotId: latestSnapshot.id, + environmentId: latestSnapshot.environmentId, + environmentType: 
latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + workerId, + runnerId, + }); + + //the worker needs to be notified so it can kill the run and complete the attempt + await sendNotificationToWorker({ + runId, + snapshot: newSnapshot, + eventBus: this.$.eventBus, + }); + return executionResultFromSnapshot(newSnapshot); + } + + //not executing, so we will actually finish the run + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "FINISHED", + description: "Run was cancelled, not finished", + }, + previousSnapshotId: latestSnapshot.id, + environmentId: latestSnapshot.environmentId, + environmentType: latestSnapshot.environmentType, + projectId: latestSnapshot.projectId, + organizationId: latestSnapshot.organizationId, + workerId, + runnerId, + }); + + if (!run.associatedWaitpoint) { + throw new ServiceValidationError("No associated waitpoint found", 400); + } + + //complete the waitpoint so the parent run can continue + await this.waitpointSystem.completeWaitpoint({ + id: run.associatedWaitpoint.id, + output: { value: JSON.stringify(error), isError: true }, + }); + + this.$.eventBus.emit("runCancelled", { + time: new Date(), + run: { + id: run.id, + friendlyId: run.friendlyId, + spanId: run.spanId, + taskEventStore: run.taskEventStore, + createdAt: run.createdAt, + completedAt: run.completedAt, + error, + }, + }); + + //schedule the cancellation of all the child runs + //it will call this function for each child, + //which will recursively cancel all children if they need to be + if (run.childRuns.length > 0) { + for (const childRun of run.childRuns) { + await this.$.worker.enqueue({ + id: `cancelRun:${childRun.id}`, + job: "cancelRun", + payload: { runId: childRun.id, completedAt: run.completedAt ?? 
new Date(), reason }, + }); + } + } + + return executionResultFromSnapshot(newSnapshot); + }); + }); + } + + async #permanentlyFailRun({ + runId, + snapshotId, + failedAt, + error, + workerId, + runnerId, + }: { + runId: string; + snapshotId?: string; + failedAt: Date; + error: TaskRunError; + workerId?: string; + runnerId?: string; + }): Promise { + const prisma = this.$.prisma; + + return startSpan(this.$.tracer, "permanentlyFailRun", async (span) => { + const status = runStatusFromError(error); + + //run permanently failed + const run = await prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + status, + completedAt: failedAt, + error, + }, + select: { + id: true, + friendlyId: true, + status: true, + attemptNumber: true, + spanId: true, + batchId: true, + parentTaskRunId: true, + associatedWaitpoint: { + select: { + id: true, + }, + }, + runtimeEnvironment: { + select: { + id: true, + type: true, + organizationId: true, + project: { + select: { + id: true, + organizationId: true, + }, + }, + }, + }, + taskEventStore: true, + createdAt: true, + completedAt: true, + }, + }); + + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "FINISHED", + description: "Run failed", + }, + previousSnapshotId: snapshotId, + environmentId: run.runtimeEnvironment.id, + environmentType: run.runtimeEnvironment.type, + projectId: run.runtimeEnvironment.project.id, + organizationId: run.runtimeEnvironment.project.organizationId, + workerId, + runnerId, + }); + + if (!run.associatedWaitpoint) { + throw new ServiceValidationError("No associated waitpoint found", 400); + } + + await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); + + await this.waitpointSystem.completeWaitpoint({ + id: run.associatedWaitpoint.id, + output: { value: JSON.stringify(error), isError: true }, + }); + + this.$.eventBus.emit("runFailed", { + time: failedAt, + run: { + id: runId, + status: 
run.status, + spanId: run.spanId, + error, + taskEventStore: run.taskEventStore, + createdAt: run.createdAt, + completedAt: run.completedAt, + }, + }); + + await this.#finalizeRun(run); + + return { + attemptStatus: "RUN_FINISHED", + snapshot: newSnapshot, + run, + }; + }); + } + + /* + * Whether the run succeeds, fails, is cancelled… we need to run these operations + */ + async #finalizeRun({ id, batchId }: { id: string; batchId: string | null }) { + if (batchId) { + await this.batchSystem.scheduleCompleteBatch({ batchId }); + } + + //cancel the heartbeats + await this.$.worker.ack(`heartbeatSnapshot.${id}`); + } + + async #getAuthenticatedEnvironmentFromRun(runId: string, tx?: PrismaClientOrTransaction) { + const prisma = tx ?? this.$.prisma; + const taskRun = await prisma.taskRun.findUnique({ + where: { + id: runId, + }, + include: { + runtimeEnvironment: { + include: { + organization: true, + project: true, + }, + }, + }, + }); + + if (!taskRun) { + return; + } + + return taskRun?.runtimeEnvironment; + } +} diff --git a/internal-packages/run-engine/src/engine/systems/systems.ts b/internal-packages/run-engine/src/engine/systems/systems.ts new file mode 100644 index 0000000000..85ccb014ee --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/systems.ts @@ -0,0 +1,17 @@ +import { Tracer } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { PrismaClient } from "@trigger.dev/database"; +import { RunQueue } from "../../run-queue/index.js"; +import { EventBus } from "../eventBus.js"; +import { RunLocker } from "../locking.js"; +import { EngineWorker } from "../types.js"; + +export type SystemResources = { + prisma: PrismaClient; + worker: EngineWorker; + eventBus: EventBus; + logger: Logger; + tracer: Tracer; + runLock: RunLocker; + runQueue: RunQueue; +}; diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts new file mode 100644 index 
0000000000..12910f4634 --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts @@ -0,0 +1,132 @@ +import { startSpan } from "@internal/tracing"; +import { SystemResources } from "./systems.js"; +import { PrismaClientOrTransaction, TaskRun } from "@trigger.dev/database"; +import { getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { parseNaturalLanguageDuration } from "@trigger.dev/core/v3/isomorphic"; +import { ServiceValidationError } from "../errors.js"; +import { isExecuting } from "../statuses.js"; +import { TaskRunError } from "@trigger.dev/core/v3/schemas"; +import { WaitpointSystem } from "./waitpointSystem.js"; + +export type TtlSystemOptions = { + resources: SystemResources; + waitpointSystem: WaitpointSystem; +}; + +export class TtlSystem { + private readonly $: SystemResources; + private readonly waitpointSystem: WaitpointSystem; + + constructor(private readonly options: TtlSystemOptions) { + this.$ = options.resources; + this.waitpointSystem = options.waitpointSystem; + } + + async expireRun({ runId, tx }: { runId: string; tx?: PrismaClientOrTransaction }) { + const prisma = tx ?? 
this.$.prisma; + await this.$.runLock.lock([runId], 5_000, async () => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + //if we're executing then we won't expire the run + if (isExecuting(snapshot.executionStatus)) { + return; + } + + //only expire "PENDING" runs + const run = await prisma.taskRun.findUnique({ where: { id: runId } }); + + if (!run) { + this.$.logger.debug("Could not find enqueued run to expire", { + runId, + }); + return; + } + + if (run.status !== "PENDING") { + this.$.logger.debug("Run cannot be expired because it's not in PENDING status", { + run, + }); + return; + } + + if (run.lockedAt) { + this.$.logger.debug("Run cannot be expired because it's locked, so will run", { + run, + }); + return; + } + + const error: TaskRunError = { + type: "STRING_ERROR", + raw: `Run expired because the TTL (${run.ttl}) was reached`, + }; + + const updatedRun = await prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "EXPIRED", + completedAt: new Date(), + expiredAt: new Date(), + error, + executionSnapshots: { + create: { + engine: "V2", + executionStatus: "FINISHED", + description: "Run was expired because the TTL was reached", + runStatus: "EXPIRED", + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + }, + }, + }, + select: { + id: true, + spanId: true, + ttl: true, + associatedWaitpoint: { + select: { + id: true, + }, + }, + runtimeEnvironment: { + select: { + organizationId: true, + }, + }, + createdAt: true, + completedAt: true, + taskEventStore: true, + parentTaskRunId: true, + }, + }); + + await this.$.runQueue.acknowledgeMessage(updatedRun.runtimeEnvironment.organizationId, runId); + + if (!updatedRun.associatedWaitpoint) { + throw new ServiceValidationError("No associated waitpoint found", 400); + } + + await this.waitpointSystem.completeWaitpoint({ + id: updatedRun.associatedWaitpoint.id, + output: { 
value: JSON.stringify(error), isError: true }, + }); + + this.$.eventBus.emit("runExpired", { run: updatedRun, time: new Date() }); + }); + } + + async scheduleExpireRun({ runId, ttl }: { runId: string; ttl: string }) { + const expireAt = parseNaturalLanguageDuration(ttl); + + if (expireAt) { + await this.$.worker.enqueue({ + id: `expireRun:${runId}`, + job: "expireRun", + payload: { runId }, + availableAt: expireAt, + }); + } + } +} diff --git a/internal-packages/run-engine/src/engine/systems/waitingForWorkerSystem.ts b/internal-packages/run-engine/src/engine/systems/waitingForWorkerSystem.ts new file mode 100644 index 0000000000..517e1c303d --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/waitingForWorkerSystem.ts @@ -0,0 +1,102 @@ +import { EnqueueSystem } from "./enqueueSystem.js"; +import { SystemResources } from "./systems.js"; + +export type WaitingForWorkerSystemOptions = { + resources: SystemResources; + enqueueSystem: EnqueueSystem; + queueRunsWaitingForWorkerBatchSize?: number; +}; + +export class WaitingForWorkerSystem { + private readonly $: SystemResources; + private readonly enqueueSystem: EnqueueSystem; + + constructor(private readonly options: WaitingForWorkerSystemOptions) { + this.$ = options.resources; + this.enqueueSystem = options.enqueueSystem; + } + + async enqueueRunsWaitingForWorker({ backgroundWorkerId }: { backgroundWorkerId: string }) { + //It could be a lot of runs, so we will process them in a batch + //if there are still more to process we will enqueue this function again + const maxCount = this.options.queueRunsWaitingForWorkerBatchSize ?? 
200; + + const backgroundWorker = await this.$.prisma.backgroundWorker.findFirst({ + where: { + id: backgroundWorkerId, + }, + include: { + runtimeEnvironment: { + include: { + project: true, + organization: true, + }, + }, + tasks: true, + }, + }); + + if (!backgroundWorker) { + this.$.logger.error("#queueRunsWaitingForWorker: background worker not found", { + id: backgroundWorkerId, + }); + return; + } + + const runsWaitingForDeploy = await this.$.prisma.taskRun.findMany({ + where: { + runtimeEnvironmentId: backgroundWorker.runtimeEnvironmentId, + projectId: backgroundWorker.projectId, + status: "WAITING_FOR_DEPLOY", + taskIdentifier: { + in: backgroundWorker.tasks.map((task) => task.slug), + }, + }, + orderBy: { + createdAt: "asc", + }, + take: maxCount + 1, + }); + + //none to process + if (!runsWaitingForDeploy.length) return; + + for (const run of runsWaitingForDeploy) { + await this.$.prisma.$transaction(async (tx) => { + const updatedRun = await tx.taskRun.update({ + where: { + id: run.id, + }, + data: { + status: "PENDING", + }, + }); + await this.enqueueSystem.enqueueRun({ + run: updatedRun, + env: backgroundWorker.runtimeEnvironment, + //add to the queue using the original run created time + //this should ensure they're in the correct order in the queue + timestamp: updatedRun.createdAt.getTime() - updatedRun.priorityMs, + tx, + }); + }); + } + + //enqueue more if needed + if (runsWaitingForDeploy.length > maxCount) { + await this.scheduleEnqueueRunsWaitingForWorker({ backgroundWorkerId }); + } + } + + async scheduleEnqueueRunsWaitingForWorker({ + backgroundWorkerId, + }: { + backgroundWorkerId: string; + }): Promise { + //we want this to happen in the background + await this.$.worker.enqueue({ + job: "queueRunsWaitingForWorker", + payload: { backgroundWorkerId }, + }); + } +} diff --git a/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts new file mode 100644 index 
0000000000..ee5d79895d --- /dev/null +++ b/internal-packages/run-engine/src/engine/systems/waitpointSystem.ts @@ -0,0 +1,610 @@ +import { timeoutError } from "@trigger.dev/core/v3"; +import { WaitpointId } from "@trigger.dev/core/v3/isomorphic"; +import { + $transaction, + Prisma, + PrismaClientOrTransaction, + TaskRunExecutionSnapshot, + TaskRunExecutionStatus, + Waitpoint, +} from "@trigger.dev/database"; +import { nanoid } from "nanoid"; +import { sendNotificationToWorker } from "../eventBus.js"; +import { isExecuting } from "../statuses.js"; +import { EnqueueSystem } from "./enqueueSystem.js"; +import { ExecutionSnapshotSystem, getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { SystemResources } from "./systems.js"; +import { ReleaseConcurrencySystem } from "./releaseConcurrencySystem.js"; + +export type WaitpointSystemOptions = { + resources: SystemResources; + executionSnapshotSystem: ExecutionSnapshotSystem; + enqueueSystem: EnqueueSystem; + releaseConcurrencySystem: ReleaseConcurrencySystem; +}; + +export class WaitpointSystem { + private readonly $: SystemResources; + private readonly executionSnapshotSystem: ExecutionSnapshotSystem; + private readonly releaseConcurrencySystem: ReleaseConcurrencySystem; + private readonly enqueueSystem: EnqueueSystem; + + constructor(private readonly options: WaitpointSystemOptions) { + this.$ = options.resources; + this.executionSnapshotSystem = options.executionSnapshotSystem; + this.enqueueSystem = options.enqueueSystem; + this.releaseConcurrencySystem = options.releaseConcurrencySystem; + } + + public async clearBlockingWaitpoints({ + runId, + tx, + }: { + runId: string; + tx?: PrismaClientOrTransaction; + }) { + const prisma = tx ?? 
this.$.prisma; + const deleted = await prisma.taskRunWaitpoint.deleteMany({ + where: { + taskRunId: runId, + }, + }); + + return deleted.count; + } + + /** This completes a waitpoint and updates all entries so the run isn't blocked, + * if they're no longer blocked. This doesn't suffer from race conditions. */ + async completeWaitpoint({ + id, + output, + }: { + id: string; + output?: { + value: string; + type?: string; + isError: boolean; + }; + }): Promise { + const result = await $transaction( + this.$.prisma, + async (tx) => { + // 1. Find the TaskRuns blocked by this waitpoint + const affectedTaskRuns = await tx.taskRunWaitpoint.findMany({ + where: { waitpointId: id }, + select: { taskRunId: true, spanIdToComplete: true, createdAt: true }, + }); + + if (affectedTaskRuns.length === 0) { + this.$.logger.warn(`completeWaitpoint: No TaskRunWaitpoints found for waitpoint`, { + waitpointId: id, + }); + } + + // 2. Update the waitpoint to completed (only if it's pending) + let waitpoint: Waitpoint | null = null; + try { + waitpoint = await tx.waitpoint.update({ + where: { id, status: "PENDING" }, + data: { + status: "COMPLETED", + completedAt: new Date(), + output: output?.value, + outputType: output?.type, + outputIsError: output?.isError, + }, + }); + } catch (error) { + if (error instanceof Prisma.PrismaClientKnownRequestError && error.code === "P2025") { + waitpoint = await tx.waitpoint.findFirst({ + where: { id }, + }); + } else { + this.$.logger.log("completeWaitpoint: error updating waitpoint:", { error }); + throw error; + } + } + + return { waitpoint, affectedTaskRuns }; + }, + (error) => { + this.$.logger.error(`completeWaitpoint: Error completing waitpoint ${id}, retrying`, { + error, + }); + throw error; + } + ); + + if (!result) { + throw new Error(`Waitpoint couldn't be updated`); + } + + if (!result.waitpoint) { + throw new Error(`Waitpoint ${id} not found`); + } + + //schedule trying to continue the runs + for (const run of result.affectedTaskRuns) { 
+ await this.$.worker.enqueue({ + //this will debounce the call + id: `continueRunIfUnblocked:${run.taskRunId}`, + job: "continueRunIfUnblocked", + payload: { runId: run.taskRunId }, + //50ms in the future + availableAt: new Date(Date.now() + 50), + }); + + // emit an event to complete associated cached runs + if (run.spanIdToComplete) { + this.$.eventBus.emit("cachedRunCompleted", { + time: new Date(), + span: { + id: run.spanIdToComplete, + createdAt: run.createdAt, + }, + blockedRunId: run.taskRunId, + hasError: output?.isError ?? false, + }); + } + } + + return result.waitpoint; + } + + /** + * This creates a DATETIME waitpoint, that will be completed automatically when the specified date is reached. + * If you pass an `idempotencyKey`, the waitpoint will be created only if it doesn't already exist. + */ + async createDateTimeWaitpoint({ + projectId, + environmentId, + completedAfter, + idempotencyKey, + idempotencyKeyExpiresAt, + tx, + }: { + projectId: string; + environmentId: string; + completedAfter: Date; + idempotencyKey?: string; + idempotencyKeyExpiresAt?: Date; + tx?: PrismaClientOrTransaction; + }) { + const prisma = tx ?? this.$.prisma; + + const existingWaitpoint = idempotencyKey + ? 
await prisma.waitpoint.findUnique({ + where: { + environmentId_idempotencyKey: { + environmentId, + idempotencyKey, + }, + }, + }) + : undefined; + + if (existingWaitpoint) { + if ( + existingWaitpoint.idempotencyKeyExpiresAt && + new Date() > existingWaitpoint.idempotencyKeyExpiresAt + ) { + //the idempotency key has expired + //remove the waitpoint idempotencyKey + await prisma.waitpoint.update({ + where: { + id: existingWaitpoint.id, + }, + data: { + idempotencyKey: nanoid(24), + inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, + }, + }); + + //let it fall through to create a new waitpoint + } else { + return { waitpoint: existingWaitpoint, isCached: true }; + } + } + + const waitpoint = await prisma.waitpoint.upsert({ + where: { + environmentId_idempotencyKey: { + environmentId, + idempotencyKey: idempotencyKey ?? nanoid(24), + }, + }, + create: { + ...WaitpointId.generate(), + type: "DATETIME", + idempotencyKey: idempotencyKey ?? nanoid(24), + idempotencyKeyExpiresAt, + userProvidedIdempotencyKey: !!idempotencyKey, + environmentId, + projectId, + completedAfter, + }, + update: {}, + }); + + await this.$.worker.enqueue({ + id: `finishWaitpoint.${waitpoint.id}`, + job: "finishWaitpoint", + payload: { waitpointId: waitpoint.id }, + availableAt: completedAfter, + }); + + return { waitpoint, isCached: false }; + } + + /** This creates a MANUAL waitpoint, that can be explicitly completed (or failed). + * If you pass an `idempotencyKey` and it already exists, it will return the existing waitpoint. + */ + async createManualWaitpoint({ + environmentId, + projectId, + idempotencyKey, + idempotencyKeyExpiresAt, + timeout, + }: { + environmentId: string; + projectId: string; + idempotencyKey?: string; + idempotencyKeyExpiresAt?: Date; + timeout?: Date; + }): Promise<{ waitpoint: Waitpoint; isCached: boolean }> { + const existingWaitpoint = idempotencyKey + ? 
await this.$.prisma.waitpoint.findUnique({ + where: { + environmentId_idempotencyKey: { + environmentId, + idempotencyKey, + }, + }, + }) + : undefined; + + if (existingWaitpoint) { + if ( + existingWaitpoint.idempotencyKeyExpiresAt && + new Date() > existingWaitpoint.idempotencyKeyExpiresAt + ) { + //the idempotency key has expired + //remove the waitpoint idempotencyKey + await this.$.prisma.waitpoint.update({ + where: { + id: existingWaitpoint.id, + }, + data: { + idempotencyKey: nanoid(24), + inactiveIdempotencyKey: existingWaitpoint.idempotencyKey, + }, + }); + + //let it fall through to create a new waitpoint + } else { + return { waitpoint: existingWaitpoint, isCached: true }; + } + } + + const waitpoint = await this.$.prisma.waitpoint.upsert({ + where: { + environmentId_idempotencyKey: { + environmentId, + idempotencyKey: idempotencyKey ?? nanoid(24), + }, + }, + create: { + ...WaitpointId.generate(), + type: "MANUAL", + idempotencyKey: idempotencyKey ?? nanoid(24), + idempotencyKeyExpiresAt, + userProvidedIdempotencyKey: !!idempotencyKey, + environmentId, + projectId, + completedAfter: timeout, + }, + update: {}, + }); + + //schedule the timeout + if (timeout) { + await this.$.worker.enqueue({ + id: `finishWaitpoint.${waitpoint.id}`, + job: "finishWaitpoint", + payload: { + waitpointId: waitpoint.id, + error: JSON.stringify(timeoutError(timeout)), + }, + availableAt: timeout, + }); + } + + return { waitpoint, isCached: false }; + } + + /** + * Prevents a run from continuing until the waitpoint is completed. 
+ */ + async blockRunWithWaitpoint({ + runId, + waitpoints, + projectId, + releaseConcurrency, + timeout, + spanIdToComplete, + batch, + workerId, + runnerId, + tx, + }: { + runId: string; + waitpoints: string | string[]; + projectId: string; + releaseConcurrency?: boolean; + timeout?: Date; + spanIdToComplete?: string; + batch?: { id: string; index?: number }; + workerId?: string; + runnerId?: string; + tx?: PrismaClientOrTransaction; + }): Promise { + const prisma = tx ?? this.$.prisma; + + let $waitpoints = typeof waitpoints === "string" ? [waitpoints] : waitpoints; + + return await this.$.runLock.lock([runId], 5000, async () => { + let snapshot: TaskRunExecutionSnapshot = await getLatestExecutionSnapshot(prisma, runId); + + //block the run with the waitpoints, returning how many waitpoints are pending + const insert = await prisma.$queryRaw<{ pending_count: BigInt }[]>` + WITH inserted AS ( + INSERT INTO "TaskRunWaitpoint" ("id", "taskRunId", "waitpointId", "projectId", "createdAt", "updatedAt", "spanIdToComplete", "batchId", "batchIndex") + SELECT + gen_random_uuid(), + ${runId}, + w.id, + ${projectId}, + NOW(), + NOW(), + ${spanIdToComplete ?? null}, + ${batch?.id ?? null}, + ${batch?.index ?? null} + FROM "Waitpoint" w + WHERE w.id IN (${Prisma.join($waitpoints)}) + ON CONFLICT DO NOTHING + RETURNING "waitpointId" + ) + SELECT COUNT(*) as pending_count + FROM inserted i + JOIN "Waitpoint" w ON w.id = i."waitpointId" + WHERE w.status = 'PENDING';`; + + const isRunBlocked = Number(insert.at(0)?.pending_count ?? 
0) > 0; + + let newStatus: TaskRunExecutionStatus = "SUSPENDED"; + if ( + snapshot.executionStatus === "EXECUTING" || + snapshot.executionStatus === "EXECUTING_WITH_WAITPOINTS" + ) { + newStatus = "EXECUTING_WITH_WAITPOINTS"; + } + + //if the state has changed, create a new snapshot + if (newStatus !== snapshot.executionStatus) { + snapshot = await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run: { + id: snapshot.runId, + status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber, + }, + snapshot: { + executionStatus: newStatus, + description: "Run was blocked by a waitpoint.", + metadata: { + releaseConcurrency, + }, + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + batchId: batch?.id ?? snapshot.batchId ?? undefined, + workerId, + runnerId, + }); + + // Let the worker know immediately, so it can suspend the run + await sendNotificationToWorker({ runId, snapshot, eventBus: this.$.eventBus }); + } + + if (timeout) { + for (const waitpoint of $waitpoints) { + await this.$.worker.enqueue({ + id: `finishWaitpoint.${waitpoint}`, + job: "finishWaitpoint", + payload: { + waitpointId: waitpoint, + error: JSON.stringify(timeoutError(timeout)), + }, + availableAt: timeout, + }); + } + } + + //no pending waitpoint, schedule unblocking the run + //debounce if we're rapidly adding waitpoints + if (isRunBlocked) { + //release concurrency + await this.releaseConcurrencySystem.releaseConcurrencyForSnapshot(snapshot); + } else { + await this.$.worker.enqueue({ + //this will debounce the call + id: `continueRunIfUnblocked:${runId}`, + job: "continueRunIfUnblocked", + payload: { runId: runId }, + //in the near future + availableAt: new Date(Date.now() + 50), + }); + } + + return snapshot; + }); + } + + public async continueRunIfUnblocked({ runId }: { runId: string }) { + // 1. 
Get the any blocking waitpoints + const blockingWaitpoints = await this.$.prisma.taskRunWaitpoint.findMany({ + where: { taskRunId: runId }, + select: { + batchId: true, + batchIndex: true, + waitpoint: { + select: { id: true, status: true }, + }, + }, + }); + + // 2. There are blockers still, so do nothing + if (blockingWaitpoints.some((w) => w.waitpoint.status !== "COMPLETED")) { + return; + } + + // 3. Get the run with environment + const run = await this.$.prisma.taskRun.findFirst({ + where: { + id: runId, + }, + include: { + runtimeEnvironment: { + select: { + id: true, + type: true, + maximumConcurrencyLimit: true, + project: { select: { id: true } }, + organization: { select: { id: true } }, + }, + }, + }, + }); + + if (!run) { + throw new Error(`#continueRunIfUnblocked: run not found: ${runId}`); + } + + //4. Continue the run whether it's executing or not + await this.$.runLock.lock([runId], 5000, async () => { + const snapshot = await getLatestExecutionSnapshot(this.$.prisma, runId); + + //run is still executing, send a message to the worker + if (isExecuting(snapshot.executionStatus)) { + const result = await this.$.runQueue.reacquireConcurrency( + run.runtimeEnvironment.organization.id, + runId + ); + + if (result) { + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + this.$.prisma, + { + run: { + id: runId, + status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber, + }, + snapshot: { + executionStatus: "EXECUTING", + description: "Run was continued, whilst still executing.", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: blockingWaitpoints.map((b) => ({ + id: b.waitpoint.id, + index: b.batchIndex ?? 
undefined, + })), + } + ); + + await sendNotificationToWorker({ + runId, + snapshot: newSnapshot, + eventBus: this.$.eventBus, + }); + } else { + // Because we cannot reacquire the concurrency, we need to enqueue the run again + // and because the run is still executing, we need to set the status to QUEUED_EXECUTING + await this.enqueueSystem.enqueueRun({ + run, + env: run.runtimeEnvironment, + timestamp: run.createdAt.getTime() - run.priorityMs, + snapshot: { + status: "QUEUED_EXECUTING", + description: "Run can continue, but is waiting for concurrency", + }, + previousSnapshotId: snapshot.id, + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: blockingWaitpoints.map((b) => ({ + id: b.waitpoint.id, + index: b.batchIndex ?? undefined, + })), + }); + } + } else { + if (snapshot.executionStatus !== "RUN_CREATED" && !snapshot.checkpointId) { + // TODO: We're screwed, should probably fail the run immediately + throw new Error(`#continueRunIfUnblocked: run has no checkpoint: ${run.id}`); + } + + //put it back in the queue, with the original timestamp (w/ priority) + //this prioritizes dequeuing waiting runs over new runs + await this.enqueueSystem.enqueueRun({ + run, + env: run.runtimeEnvironment, + timestamp: run.createdAt.getTime() - run.priorityMs, + snapshot: { + description: "Run was QUEUED, because all waitpoints are completed", + }, + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: blockingWaitpoints.map((b) => ({ + id: b.waitpoint.id, + index: b.batchIndex ?? undefined, + })), + checkpointId: snapshot.checkpointId ?? undefined, + }); + } + }); + + //5. 
Remove the blocking waitpoints + await this.$.prisma.taskRunWaitpoint.deleteMany({ + where: { + taskRunId: runId, + }, + }); + } + + public async createRunAssociatedWaitpoint( + tx: PrismaClientOrTransaction, + { + projectId, + environmentId, + completedByTaskRunId, + }: { projectId: string; environmentId: string; completedByTaskRunId: string } + ) { + return tx.waitpoint.create({ + data: { + ...WaitpointId.generate(), + type: "RUN", + status: "PENDING", + idempotencyKey: nanoid(24), + userProvidedIdempotencyKey: false, + projectId, + environmentId, + completedByTaskRunId, + }, + }); + } +} diff --git a/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts b/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts index e46f2ae2fb..6ff3f4d7e8 100644 --- a/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts +++ b/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts @@ -10,164 +10,160 @@ import { expect } from "vitest"; import { RunEngine } from "../index.js"; describe("RunEngine attempt failures", () => { - containerTest( - "Retry user error and succeed", - { timeout: 15_000 }, - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, + containerTest("Retry user error and succeed", async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: 
"small-1x", machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, }, - baseCostInCents: 0.0001, }, - tracer: trace.getTracer("test", "0.0.0"), + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + //create background worker + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + //trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + //dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, }); - try { - const taskIdentifier = "test-task"; - - //create background worker - const backgroundWorker = await setupBackgroundWorker( - prisma, - authenticatedEnvironment, - taskIdentifier - ); - - //trigger the run - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queueName: "task/test-task", - isTest: false, - tags: [], - }, - prisma - ); - - //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); - - //create an attempt - const attemptResult = await engine.startRunAttempt({ - 
runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); - - //fail the attempt - const error = { - type: "BUILT_IN_ERROR" as const, - name: "UserError", - message: "This is a user error", - stackTrace: "Error: This is a user error\n at :1:1", - }; - const result = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult.snapshot.id, - completion: { - ok: false, - id: dequeued[0].run.id, - error, - retry: { - timestamp: Date.now(), - delay: 0, - }, - }, - }); - expect(result.attemptStatus).toBe("RETRY_IMMEDIATELY"); - expect(result.snapshot.executionStatus).toBe("PENDING_EXECUTING"); - expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //state should be pending - const executionData3 = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData3); - expect(executionData3.snapshot.executionStatus).toBe("PENDING_EXECUTING"); - //only when the new attempt is created, should the attempt be increased - expect(executionData3.run.attemptNumber).toBe(1); - expect(executionData3.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //create a second attempt - const attemptResult2 = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: executionData3.snapshot.id, - }); - expect(attemptResult2.run.attemptNumber).toBe(2); - - //now complete it successfully - const result2 = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult2.snapshot.id, - completion: { - ok: true, - id: dequeued[0].run.id, - output: `{"foo":"bar"}`, - outputType: "application/json", - }, - }); - expect(result2.snapshot.executionStatus).toBe("FINISHED"); - expect(result2.run.attemptNumber).toBe(2); - expect(result2.run.status).toBe("COMPLETED_SUCCESSFULLY"); - - //waitpoint should have been completed, with the output - const runWaitpointAfter = await prisma.waitpoint.findMany({ - where: { - completedByTaskRunId: run.id, + //create an attempt + const attemptResult = await 
engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + //fail the attempt + const error = { + type: "BUILT_IN_ERROR" as const, + name: "UserError", + message: "This is a user error", + stackTrace: "Error: This is a user error\n at :1:1", + }; + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: false, + id: dequeued[0].run.id, + error, + retry: { + timestamp: Date.now(), + delay: 0, }, - }); - expect(runWaitpointAfter.length).toBe(1); - expect(runWaitpointAfter[0].type).toBe("RUN"); - expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`); - expect(runWaitpointAfter[0].outputIsError).toBe(false); - - //state should be completed - const executionData4 = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData4); - expect(executionData4.snapshot.executionStatus).toBe("FINISHED"); - expect(executionData4.run.attemptNumber).toBe(2); - expect(executionData4.run.status).toBe("COMPLETED_SUCCESSFULLY"); - } finally { - engine.quit(); - } + }, + }); + expect(result.attemptStatus).toBe("RETRY_IMMEDIATELY"); + expect(result.snapshot.executionStatus).toBe("PENDING_EXECUTING"); + expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //state should be pending + const executionData3 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData3); + expect(executionData3.snapshot.executionStatus).toBe("PENDING_EXECUTING"); + //only when the new attempt is created, should the attempt be increased + expect(executionData3.run.attemptNumber).toBe(1); + expect(executionData3.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //create a second attempt + const attemptResult2 = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: executionData3.snapshot.id, + }); + expect(attemptResult2.run.attemptNumber).toBe(2); + + //now complete it successfully + const result2 = 
await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult2.snapshot.id, + completion: { + ok: true, + id: dequeued[0].run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + expect(result2.snapshot.executionStatus).toBe("FINISHED"); + expect(result2.run.attemptNumber).toBe(2); + expect(result2.run.status).toBe("COMPLETED_SUCCESSFULLY"); + + //waitpoint should have been completed, with the output + const runWaitpointAfter = await prisma.waitpoint.findMany({ + where: { + completedByTaskRunId: run.id, + }, + }); + expect(runWaitpointAfter.length).toBe(1); + expect(runWaitpointAfter[0].type).toBe("RUN"); + expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`); + expect(runWaitpointAfter[0].outputIsError).toBe(false); + + //state should be completed + const executionData4 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData4); + expect(executionData4.snapshot.executionStatus).toBe("FINISHED"); + expect(executionData4.run.attemptNumber).toBe(2); + expect(executionData4.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + engine.quit(); } - ); + }); - containerTest("Fail (no more retries)", { timeout: 15_000 }, async ({ prisma, redisOptions }) => { + containerTest("Fail (no more retries)", async ({ prisma, redisOptions }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); @@ -278,120 +274,116 @@ describe("RunEngine attempt failures", () => { } }); - containerTest( - "Fail (not a retriable error)", - { timeout: 15_000 }, - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + containerTest("Fail (not a retriable error)", async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new 
RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, }, - baseCostInCents: 0.0001, }, - tracer: trace.getTracer("test", "0.0.0"), + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + //create background worker + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { + maxAttempts: 1, }); - try { - const taskIdentifier = "test-task"; - - //create background worker - await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { - maxAttempts: 1, - }); - - //trigger the run - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queueName: "task/test-task", - isTest: false, - tags: [], - }, - prisma - ); - - //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); - - //create an attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); - - //fail the attempt with 
an unretriable error - const error = { - type: "INTERNAL_ERROR" as const, - code: "DISK_SPACE_EXCEEDED" as const, - }; - const result = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult.snapshot.id, - completion: { - ok: false, - id: dequeued[0].run.id, - error, - retry: { - timestamp: Date.now(), - delay: 0, - }, + //trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + //dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + //create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + //fail the attempt with an unretriable error + const error = { + type: "INTERNAL_ERROR" as const, + code: "DISK_SPACE_EXCEEDED" as const, + }; + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: false, + id: dequeued[0].run.id, + error, + retry: { + timestamp: Date.now(), + delay: 0, }, - }); - expect(result.attemptStatus).toBe("RUN_FINISHED"); - expect(result.snapshot.executionStatus).toBe("FINISHED"); - expect(result.run.status).toBe("CRASHED"); - - //state should be pending - const executionData3 = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData3); - expect(executionData3.snapshot.executionStatus).toBe("FINISHED"); - //only when the new attempt is created, should the attempt be increased - expect(executionData3.run.attemptNumber).toBe(1); - 
expect(executionData3.run.status).toBe("CRASHED"); - } finally { - engine.quit(); - } + }, + }); + expect(result.attemptStatus).toBe("RUN_FINISHED"); + expect(result.snapshot.executionStatus).toBe("FINISHED"); + expect(result.run.status).toBe("CRASHED"); + + //state should be pending + const executionData3 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData3); + expect(executionData3.snapshot.executionStatus).toBe("FINISHED"); + //only when the new attempt is created, should the attempt be increased + expect(executionData3.run.attemptNumber).toBe(1); + expect(executionData3.run.status).toBe("CRASHED"); + } finally { + engine.quit(); } - ); + }); - containerTest("OOM fail", { timeout: 15_000 }, async ({ prisma, redisOptions }) => { + containerTest("OOM fail", async ({ prisma, redisOptions }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); @@ -498,332 +490,324 @@ describe("RunEngine attempt failures", () => { } }); - containerTest( - "OOM retry on larger machine", - { timeout: 15_000 }, - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + containerTest("OOM retry on larger machine", async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", machines: { - defaultMachine: "small-1x", - 
machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - "small-2x": { - name: "small-2x" as const, - cpu: 1, - memory: 1, - centsPerMs: 0.0002, - }, + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + "small-2x": { + name: "small-2x" as const, + cpu: 1, + memory: 1, + centsPerMs: 0.0002, }, - baseCostInCents: 0.0001, }, - tracer: trace.getTracer("test", "0.0.0"), + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + //create background worker + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { + outOfMemory: { + machine: "small-2x", + }, }); - try { - const taskIdentifier = "test-task"; + //trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); - //create background worker - await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { - outOfMemory: { - machine: "small-2x", - }, - }); - - //trigger the run - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queueName: "task/test-task", - isTest: false, - tags: [], - }, - prisma - ); - - //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); - - //create an attempt - const attemptResult = await 
engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); - - //fail the attempt with an OOM error - const error = { - type: "INTERNAL_ERROR" as const, - code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const, - message: "Process exited with code -1 after signal SIGKILL.", - stackTrace: "JavaScript heap out of memory", - }; - - const result = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult.snapshot.id, - completion: { - ok: false, - id: dequeued[0].run.id, - error, - }, - }); - - // The run should be retried with a larger machine - expect(result.attemptStatus).toBe("RETRY_QUEUED"); - expect(result.snapshot.executionStatus).toBe("QUEUED"); - expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //state should be pending - const executionData = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData); - expect(executionData.snapshot.executionStatus).toBe("QUEUED"); - expect(executionData.run.attemptNumber).toBe(1); - expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //create a second attempt - const attemptResult2 = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: executionData.snapshot.id, - }); - expect(attemptResult2.run.attemptNumber).toBe(2); - - //now complete it successfully - const result2 = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult2.snapshot.id, - completion: { - ok: true, - id: dequeued[0].run.id, - output: `{"foo":"bar"}`, - outputType: "application/json", - }, - }); - expect(result2.snapshot.executionStatus).toBe("FINISHED"); - expect(result2.run.attemptNumber).toBe(2); - expect(result2.run.status).toBe("COMPLETED_SUCCESSFULLY"); - - //waitpoint should have been completed, with the output - const runWaitpointAfter = await prisma.waitpoint.findMany({ - where: { - completedByTaskRunId: run.id, - }, - }); - 
expect(runWaitpointAfter.length).toBe(1); - expect(runWaitpointAfter[0].type).toBe("RUN"); - expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`); - expect(runWaitpointAfter[0].outputIsError).toBe(false); - - //state should be completed - const executionData4 = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData4); - expect(executionData4.snapshot.executionStatus).toBe("FINISHED"); - expect(executionData4.run.attemptNumber).toBe(2); - expect(executionData4.run.status).toBe("COMPLETED_SUCCESSFULLY"); - } finally { - engine.quit(); - } - } - ); + //dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); - containerTest( - "OOM fails after retrying on larger machine", - { timeout: 15_000 }, - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + //create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, + //fail the attempt with an OOM error + const error = { + type: "INTERNAL_ERROR" as const, + code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const, + message: "Process exited with code -1 after signal SIGKILL.", + stackTrace: "JavaScript heap out of memory", + }; + + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: false, + id: dequeued[0].run.id, + error, }, - queue: { - redis: redisOptions, + }); + + // The run should be retried with a larger machine + expect(result.attemptStatus).toBe("RETRY_QUEUED"); + expect(result.snapshot.executionStatus).toBe("QUEUED"); + 
expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //state should be pending + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("QUEUED"); + expect(executionData.run.attemptNumber).toBe(1); + expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //create a second attempt + const attemptResult2 = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: executionData.snapshot.id, + }); + expect(attemptResult2.run.attemptNumber).toBe(2); + + //now complete it successfully + const result2 = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult2.snapshot.id, + completion: { + ok: true, + id: dequeued[0].run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", }, - runLock: { - redis: redisOptions, + }); + expect(result2.snapshot.executionStatus).toBe("FINISHED"); + expect(result2.run.attemptNumber).toBe(2); + expect(result2.run.status).toBe("COMPLETED_SUCCESSFULLY"); + + //waitpoint should have been completed, with the output + const runWaitpointAfter = await prisma.waitpoint.findMany({ + where: { + completedByTaskRunId: run.id, }, + }); + expect(runWaitpointAfter.length).toBe(1); + expect(runWaitpointAfter[0].type).toBe("RUN"); + expect(runWaitpointAfter[0].output).toBe(`{"foo":"bar"}`); + expect(runWaitpointAfter[0].outputIsError).toBe(false); + + //state should be completed + const executionData4 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData4); + expect(executionData4.snapshot.executionStatus).toBe("FINISHED"); + expect(executionData4.run.attemptNumber).toBe(2); + expect(executionData4.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + engine.quit(); + } + }); + + containerTest("OOM fails after retrying on larger machine", async ({ prisma, redisOptions }) => { + //create environment + const 
authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - "small-2x": { - name: "small-2x" as const, - cpu: 1, - memory: 1, - centsPerMs: 0.0002, - }, + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + "small-2x": { + name: "small-2x" as const, + cpu: 1, + memory: 1, + centsPerMs: 0.0002, }, - baseCostInCents: 0.0001, }, - tracer: trace.getTracer("test", "0.0.0"), + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + //create background worker + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { + maxTimeoutInMs: 10, + maxAttempts: 10, + outOfMemory: { + machine: "small-2x", + }, }); - try { - const taskIdentifier = "test-task"; + //trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); - //create background worker - await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier, undefined, { - maxTimeoutInMs: 10, - maxAttempts: 10, - outOfMemory: { - machine: "small-2x", - }, - }); - - //trigger the run - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_1234", - environment: 
authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queueName: "task/test-task", - isTest: false, - tags: [], - }, - prisma - ); - - //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); - - //create first attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); - - //fail the first attempt with an OOM error - const error = { - type: "INTERNAL_ERROR" as const, - code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const, - message: "Process exited with code -1 after signal SIGKILL.", - stackTrace: "JavaScript heap out of memory", - }; - - const result = await engine.completeRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: attemptResult.snapshot.id, - completion: { - ok: false, - id: dequeued[0].run.id, - error, - }, - }); - - // The run should be retried with a larger machine - expect(result.attemptStatus).toBe("RETRY_QUEUED"); - expect(result.snapshot.executionStatus).toBe("QUEUED"); - expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //state should be queued - const executionData = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(executionData); - expect(executionData.snapshot.executionStatus).toBe("QUEUED"); - expect(executionData.run.attemptNumber).toBe(1); - expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE"); - - //wait for 1s - await setTimeout(1_000); - - //dequeue again - const dequeued2 = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); - expect(dequeued2.length).toBe(1); - - //create second attempt - const attemptResult2 = await engine.startRunAttempt({ - runId: dequeued2[0].run.id, - snapshotId: 
dequeued2[0].snapshot.id, - }); - expect(attemptResult2.run.attemptNumber).toBe(2); - - //fail the second attempt with the same OOM error - const result2 = await engine.completeRunAttempt({ - runId: dequeued2[0].run.id, - snapshotId: attemptResult2.snapshot.id, - completion: { - ok: false, - id: dequeued2[0].run.id, - error, - retry: { - timestamp: Date.now(), - delay: 0, - }, + //dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + //create first attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + //fail the first attempt with an OOM error + const error = { + type: "INTERNAL_ERROR" as const, + code: "TASK_PROCESS_EXITED_WITH_NON_ZERO_CODE" as const, + message: "Process exited with code -1 after signal SIGKILL.", + stackTrace: "JavaScript heap out of memory", + }; + + const result = await engine.completeRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: attemptResult.snapshot.id, + completion: { + ok: false, + id: dequeued[0].run.id, + error, + }, + }); + + // The run should be retried with a larger machine + expect(result.attemptStatus).toBe("RETRY_QUEUED"); + expect(result.snapshot.executionStatus).toBe("QUEUED"); + expect(result.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //state should be queued + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("QUEUED"); + expect(executionData.run.attemptNumber).toBe(1); + expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE"); + + //wait for 1s + await setTimeout(5_000); + + //dequeue again + const dequeued2 = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + expect(dequeued2.length).toBe(1); + + //create second attempt + 
const attemptResult2 = await engine.startRunAttempt({ + runId: dequeued2[0].run.id, + snapshotId: dequeued2[0].snapshot.id, + }); + expect(attemptResult2.run.attemptNumber).toBe(2); + + //fail the second attempt with the same OOM error + const result2 = await engine.completeRunAttempt({ + runId: dequeued2[0].run.id, + snapshotId: attemptResult2.snapshot.id, + completion: { + ok: false, + id: dequeued2[0].run.id, + error, + retry: { + timestamp: Date.now(), + delay: 0, }, - }); - - // The run should fail after the second OOM - expect(result2.attemptStatus).toBe("RUN_FINISHED"); - expect(result2.snapshot.executionStatus).toBe("FINISHED"); - expect(result2.run.status).toBe("CRASHED"); - - //final state should be crashed - const finalExecutionData = await engine.getRunExecutionData({ runId: run.id }); - assertNonNullable(finalExecutionData); - expect(finalExecutionData.snapshot.executionStatus).toBe("FINISHED"); - expect(finalExecutionData.run.attemptNumber).toBe(2); - expect(finalExecutionData.run.status).toBe("CRASHED"); - } finally { - engine.quit(); - } + }, + }); + + // The run should fail after the second OOM + expect(result2.attemptStatus).toBe("RUN_FINISHED"); + expect(result2.snapshot.executionStatus).toBe("FINISHED"); + expect(result2.run.status).toBe("CRASHED"); + + //final state should be crashed + const finalExecutionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(finalExecutionData); + expect(finalExecutionData.snapshot.executionStatus).toBe("FINISHED"); + expect(finalExecutionData.run.attemptNumber).toBe(2); + expect(finalExecutionData.run.status).toBe("CRASHED"); + } finally { + engine.quit(); } - ); + }); }); diff --git a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts index a5a1a8b3b4..50e6dea856 100644 --- a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts +++ 
b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts @@ -8,12 +8,977 @@ import { import { trace } from "@internal/tracing"; import { expect } from "vitest"; import { RunEngine } from "../index.js"; -import { setTimeout } from "timers/promises"; +import { setTimeout } from "node:timers/promises"; import { EventBusEventArgs } from "../eventBus.js"; vi.setConfig({ testTimeout: 60_000 }); describe("RunEngine checkpoints", () => { - //todo checkpoint tests - test("empty test", async () => {}); + containerTest("Create checkpoint and continue execution", async ({ prisma, redisOptions }) => { + // Create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + // Create background worker + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + // Trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + // Dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + // Create an attempt + const attemptResult = await 
engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + //create a manual waitpoint + const waitpointResult = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpointResult.waitpoint.status).toBe("PENDING"); + + //block the run + const blockedResult = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpointResult.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + const blockedExecutionData = await engine.getRunExecutionData({ runId: run.id }); + expect(blockedExecutionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Create a checkpoint + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blockedResult.id, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + + expect(checkpointResult.ok).toBe(true); + + const snapshot = checkpointResult.ok ? checkpointResult.snapshot : null; + + assertNonNullable(snapshot); + + const checkpointRun = checkpointResult.ok ? 
checkpointResult.run : null; + assertNonNullable(checkpointRun); + + // Verify checkpoint creation + expect(snapshot.executionStatus).toBe("SUSPENDED"); + expect(checkpointRun.status).toBe("WAITING_TO_RESUME"); + + // Get execution data to verify state + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("SUSPENDED"); + expect(executionData.checkpoint).toBeDefined(); + expect(executionData.checkpoint?.type).toBe("DOCKER"); + expect(executionData.checkpoint?.reason).toBe("TEST_CHECKPOINT"); + + //complete the waitpoint + await engine.completeWaitpoint({ + id: waitpointResult.waitpoint.id, + }); + + await setTimeout(500); + + // Dequeue the run again + const dequeuedAgain = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + expect(dequeuedAgain.length).toBe(1); + + // Continue execution from checkpoint + const continueResult = await engine.continueRunExecution({ + runId: run.id, + snapshotId: dequeuedAgain[0].snapshot.id, + }); + + // Verify continuation + expect(continueResult.snapshot.executionStatus).toBe("EXECUTING"); + expect(continueResult.run.status).toBe("EXECUTING"); + + // Complete the run + const result = await engine.completeRunAttempt({ + runId: run.id, + snapshotId: continueResult.snapshot.id, + completion: { + ok: true, + id: run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + // Verify final state + expect(result.snapshot.executionStatus).toBe("FINISHED"); + expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + await engine.quit(); + } + }); + + containerTest("Failed checkpoint creation", async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + 
tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + // Create background worker + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + // Trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + // Try to create checkpoint with invalid snapshot ID + const result = await engine.createCheckpoint({ + runId: run.id, + snapshotId: "invalid-snapshot-id", + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + + const error = !result.ok ? 
result.error : null; + + expect(error).toBe("Not the latest snapshot"); + + // Verify run is still in initial state + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.checkpoint).toBeUndefined(); + } finally { + await engine.quit(); + } + }); + + containerTest("Multiple checkpoints in single run", async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + // Trigger run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + // First checkpoint sequence + const dequeued1 = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const attemptResult1 = await engine.startRunAttempt({ + runId: dequeued1[0].run.id, + snapshotId: dequeued1[0].snapshot.id, + }); + + // Create waitpoint and block run + const waitpoint1 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: 
authenticatedEnvironment.projectId, + }); + + const blocked1 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint1.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + // Create first checkpoint + const checkpoint1 = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blocked1.id, + checkpoint: { + type: "DOCKER", + reason: "CHECKPOINT_1", + location: "location-1", + imageRef: "image-1", + }, + }); + + expect(checkpoint1.ok).toBe(true); + const snapshot1 = checkpoint1.ok ? checkpoint1.snapshot : null; + assertNonNullable(snapshot1); + + // Complete first waitpoint + await engine.completeWaitpoint({ + id: waitpoint1.waitpoint.id, + }); + + await setTimeout(500); + + // Dequeue again after waitpoint completion + const dequeued2 = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + // Continue execution from first checkpoint + const continueResult1 = await engine.continueRunExecution({ + runId: run.id, + snapshotId: dequeued2[0].snapshot.id, + }); + + // Second checkpoint sequence + // Create another waitpoint and block run + const waitpoint2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + const blocked2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpoint2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + // Create second checkpoint + const checkpoint2 = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blocked2.id, + checkpoint: { + type: "DOCKER", + reason: "CHECKPOINT_2", + location: "location-2", + imageRef: "image-2", + }, + }); + + expect(checkpoint2.ok).toBe(true); + const snapshot2 = checkpoint2.ok ? 
checkpoint2.snapshot : null; + assertNonNullable(snapshot2); + + // Complete second waitpoint + await engine.completeWaitpoint({ + id: waitpoint2.waitpoint.id, + }); + + await setTimeout(500); + + // Dequeue again after second waitpoint completion + const dequeued3 = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + expect(dequeued3.length).toBe(1); + + // Verify latest checkpoint + expect(dequeued3[0].checkpoint?.reason).toBe("CHECKPOINT_2"); + expect(dequeued3[0].checkpoint?.location).toBe("location-2"); + + // Continue execution from second checkpoint + const continueResult2 = await engine.continueRunExecution({ + runId: run.id, + snapshotId: dequeued3[0].snapshot.id, + }); + + // Complete the run + const result = await engine.completeRunAttempt({ + runId: run.id, + snapshotId: continueResult2.snapshot.id, + completion: { + ok: true, + id: run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + expect(result.snapshot.executionStatus).toBe("FINISHED"); + expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + await engine.quit(); + } + }); + + containerTest( + "Checkpoint after waitpoint completion with concurrency reacquisition", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + // Create background worker + const backgroundWorker = await 
setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + // Trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + // Dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + // Create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + // Create and block with waitpoint + const waitpointResult = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpointResult.waitpoint.status).toBe("PENDING"); + + const blockedResult = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: waitpointResult.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: true, // Important: Release concurrency when blocking + }); + + // Verify run is blocked + const blockedExecutionData = await engine.getRunExecutionData({ runId: run.id }); + expect(blockedExecutionData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Complete the waitpoint before checkpoint + await engine.completeWaitpoint({ + id: waitpointResult.waitpoint.id, + }); + + await setTimeout(500); // Wait for continueRunIfUnblocked to process + + // Create checkpoint after waitpoint completion + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: blockedResult.id, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + 
location: "test-location", + imageRef: "test-image-ref", + }, + }); + + expect(checkpointResult.ok).toBe(false); + const error = !checkpointResult.ok ? checkpointResult.error : null; + expect(error).toBe("Not the latest snapshot"); + + // Verify checkpoint state + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("EXECUTING"); + + // Complete the run + const result = await engine.completeRunAttempt({ + runId: run.id, + snapshotId: executionData.snapshot.id, + completion: { + ok: true, + id: run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + // Verify final state + expect(result.snapshot.executionStatus).toBe("FINISHED"); + expect(result.run.status).toBe("COMPLETED_SUCCESSFULLY"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "Cannot create checkpoint in non-checkpointable state", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0005, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + // Create background worker + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + // Trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + 
traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + }, + prisma + ); + + // Dequeue the run + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + // Create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + // First create a valid checkpoint to get into SUSPENDED state + const checkpoint1 = await engine.createCheckpoint({ + runId: run.id, + snapshotId: attemptResult.snapshot.id, + checkpoint: { + type: "DOCKER", + reason: "FIRST_CHECKPOINT", + location: "test-location-1", + imageRef: "test-image-ref-1", + }, + }); + + expect(checkpoint1.ok).toBe(true); + const snapshot1 = checkpoint1.ok ? checkpoint1.snapshot : null; + assertNonNullable(snapshot1); + + // Verify we're in SUSPENDED state + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("SUSPENDED"); + + let event: EventBusEventArgs<"incomingCheckpointDiscarded">[0] | undefined = undefined; + engine.eventBus.on("incomingCheckpointDiscarded", (result) => { + event = result; + }); + + // Try to create another checkpoint while in SUSPENDED state + const checkpoint2 = await engine.createCheckpoint({ + runId: run.id, + snapshotId: snapshot1.id, + checkpoint: { + type: "DOCKER", + reason: "SECOND_CHECKPOINT", + location: "test-location-2", + imageRef: "test-image-ref-2", + }, + }); + + assertNonNullable(event); + + const notificationEvent = event as EventBusEventArgs<"incomingCheckpointDiscarded">[0]; + expect(notificationEvent.run.id).toBe(run.id); + + expect(notificationEvent.run.id).toBe(run.id); + expect(notificationEvent.checkpoint.discardReason).toBe( + "Status SUSPENDED is not checkpointable" + ); + + // Verify the checkpoint 
creation was rejected + expect(checkpoint2.ok).toBe(false); + const error = !checkpoint2.ok ? checkpoint2.error : null; + expect(error).toBe("Status SUSPENDED is not checkpointable"); + + // Verify the run state hasn't changed + const finalExecutionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(finalExecutionData); + expect(finalExecutionData.snapshot.executionStatus).toBe("SUSPENDED"); + expect(finalExecutionData.checkpoint?.reason).toBe("FIRST_CHECKPOINT"); + } finally { + await engine.quit(); + } + } + ); + + containerTest( + "when a checkpoint is created while the run is in QUEUED_EXECUTING state, the run is QUEUED", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + const queueName = "task/test-task-limited"; + + // Create background worker + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + // Create first run with queue concurrency limit of 1 + const firstRun = await engine.trigger( + { + number: 1, + friendlyId: "run_first", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345-first", + spanId: "s12345-first", + masterQueue: "main", + queueName, + isTest: false, + tags: [], + queue: { concurrencyLimit: 1 }, + }, + prisma + ); + + // Dequeue and start the first run + const dequeuedFirst = await 
engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: firstRun.masterQueue, + maxRunCount: 10, + }); + + const firstAttempt = await engine.startRunAttempt({ + runId: dequeuedFirst[0].run.id, + snapshotId: dequeuedFirst[0].snapshot.id, + }); + expect(firstAttempt.snapshot.executionStatus).toBe("EXECUTING"); + + // Create a manual waitpoint for the first run + const waitpoint = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpoint.waitpoint.status).toBe("PENDING"); + + // Block the first run with releaseConcurrency set to true + const blockedResult = await engine.blockRunWithWaitpoint({ + runId: firstRun.id, + waitpoints: waitpoint.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: true, + }); + + // Verify first run is blocked + const firstRunData = await engine.getRunExecutionData({ runId: firstRun.id }); + expect(firstRunData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Create and start second run on the same queue + const secondRun = await engine.trigger( + { + number: 2, + friendlyId: "run_second", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345-second", + spanId: "s12345-second", + masterQueue: "main", + queueName, + isTest: false, + tags: [], + queue: { concurrencyLimit: 1 }, + }, + prisma + ); + + // Dequeue and start the second run + const dequeuedSecond = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: secondRun.masterQueue, + maxRunCount: 10, + }); + + const secondAttempt = await engine.startRunAttempt({ + runId: dequeuedSecond[0].run.id, + snapshotId: dequeuedSecond[0].snapshot.id, + }); + expect(secondAttempt.snapshot.executionStatus).toBe("EXECUTING"); + + // Now 
complete the waitpoint for the first run + await engine.completeWaitpoint({ + id: waitpoint.waitpoint.id, + }); + + // Wait for the continueRunIfUnblocked to process + await setTimeout(500); + + // Verify the first run is now in QUEUED_EXECUTING state + const executionDataAfter = await engine.getRunExecutionData({ runId: firstRun.id }); + expect(executionDataAfter?.snapshot.executionStatus).toBe("QUEUED_EXECUTING"); + expect(executionDataAfter?.snapshot.description).toBe( + "Run can continue, but is waiting for concurrency" + ); + + // Verify the waitpoint is no longer blocking the first run + const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({ + where: { + taskRunId: firstRun.id, + }, + include: { + waitpoint: true, + }, + }); + expect(runWaitpoint).toBeNull(); + + // Verify the waitpoint itself is completed + const completedWaitpoint = await prisma.waitpoint.findUnique({ + where: { + id: waitpoint.waitpoint.id, + }, + }); + assertNonNullable(completedWaitpoint); + expect(completedWaitpoint.status).toBe("COMPLETED"); + + // Create checkpoint after waitpoint completion + const checkpointResult = await engine.createCheckpoint({ + runId: firstRun.id, + snapshotId: firstRunData?.snapshot.id!, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + + expect(checkpointResult.ok).toBe(true); + const checkpoint = checkpointResult.ok ? 
checkpointResult.snapshot : null; + assertNonNullable(checkpoint); + expect(checkpoint.executionStatus).toBe("QUEUED"); + + // Complete the second run so the first run can be dequeued + const result = await engine.completeRunAttempt({ + runId: dequeuedSecond[0].run.id, + snapshotId: secondAttempt.snapshot.id, + completion: { + ok: true, + id: dequeuedSecond[0].run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + await setTimeout(500); + + // Verify the first run is back in the queue + const queuedRun = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: firstRun.masterQueue, + maxRunCount: 10, + }); + + expect(queuedRun.length).toBe(1); + expect(queuedRun[0].run.id).toBe(firstRun.id); + expect(queuedRun[0].snapshot.executionStatus).toBe("PENDING_EXECUTING"); + + // Now we can continue the run + const continueResult = await engine.continueRunExecution({ + runId: firstRun.id, + snapshotId: queuedRun[0].snapshot.id, + }); + + expect(continueResult.snapshot.executionStatus).toBe("EXECUTING"); + } finally { + await engine.quit(); + } + } + ); }); diff --git a/internal-packages/run-engine/src/engine/tests/delays.test.ts b/internal-packages/run-engine/src/engine/tests/delays.test.ts index 0d87d27e47..655b01941d 100644 --- a/internal-packages/run-engine/src/engine/tests/delays.test.ts +++ b/internal-packages/run-engine/src/engine/tests/delays.test.ts @@ -8,6 +8,7 @@ import { trace } from "@internal/tracing"; import { expect } from "vitest"; import { RunEngine } from "../index.js"; import { setTimeout } from "timers/promises"; +import { TaskRunErrorCodes } from "@trigger.dev/core/v3"; vi.setConfig({ testTimeout: 60_000 }); @@ -154,7 +155,7 @@ describe("RunEngine delays", () => { queueName: "task/test-task", isTest: false, tags: [], - delayUntil: new Date(Date.now() + 200), + delayUntil: new Date(Date.now() + 400), }, prisma ); @@ -165,7 +166,10 @@ describe("RunEngine delays", () => { 
expect(executionData.snapshot.executionStatus).toBe("RUN_CREATED"); const rescheduleTo = new Date(Date.now() + 1_500); - const updatedRun = await engine.rescheduleRun({ runId: run.id, delayUntil: rescheduleTo }); + const updatedRun = await engine.rescheduleDelayedRun({ + runId: run.id, + delayUntil: rescheduleTo, + }); expect(updatedRun.delayUntil?.toISOString()).toBe(rescheduleTo.toISOString()); //wait so the initial delay passes @@ -187,4 +191,108 @@ describe("RunEngine delays", () => { engine.quit(); } }); + + containerTest("Delayed run with a ttl", async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + + //create background worker + const backgroundWorker = await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier + ); + + //trigger the run + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: "task/test-task", + isTest: false, + tags: [], + delayUntil: new Date(Date.now() + 1000), + ttl: "2s", + }, + prisma + ); + + //should be created but not queued yet + const executionData = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData); + 
expect(executionData.snapshot.executionStatus).toBe("RUN_CREATED"); + expect(run.status).toBe("DELAYED"); + + //wait for 1 seconds + await setTimeout(2_500); + + //should now be queued + const executionData2 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData2); + expect(executionData2.snapshot.executionStatus).toBe("QUEUED"); + + const run2 = await prisma.taskRun.findFirstOrThrow({ + where: { id: run.id }, + }); + + expect(run2.status).toBe("PENDING"); + + //wait for 3 seconds + await setTimeout(3_000); + + //should now be expired + const executionData3 = await engine.getRunExecutionData({ runId: run.id }); + assertNonNullable(executionData3); + expect(executionData3.snapshot.executionStatus).toBe("FINISHED"); + + const run3 = await prisma.taskRun.findFirstOrThrow({ + where: { id: run.id }, + }); + + expect(run3.status).toBe("EXPIRED"); + } finally { + engine.quit(); + } + }); }); diff --git a/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts index 518189e9ab..89df4e0726 100644 --- a/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts +++ b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts @@ -95,7 +95,7 @@ describe("RunEngine heartbeats", () => { assertNonNullable(executionData); expect(executionData.snapshot.executionStatus).toBe("PENDING_EXECUTING"); - await setTimeout(pendingExecutingTimeout * 2); + await setTimeout(pendingExecutingTimeout * 4); //expect it to be pending with 3 consecutiveFailures const executionData2 = await engine.getRunExecutionData({ runId: run.id }); diff --git a/internal-packages/run-engine/src/engine/locking.test.ts b/internal-packages/run-engine/src/engine/tests/locking.test.ts similarity index 97% rename from internal-packages/run-engine/src/engine/locking.test.ts rename to internal-packages/run-engine/src/engine/tests/locking.test.ts index 5fc8b9832b..17831c2c38 100644 --- 
a/internal-packages/run-engine/src/engine/locking.test.ts +++ b/internal-packages/run-engine/src/engine/tests/locking.test.ts @@ -1,7 +1,7 @@ import { createRedisClient } from "@internal/redis"; import { redisTest } from "@internal/testcontainers"; import { expect } from "vitest"; -import { RunLocker } from "./locking.js"; +import { RunLocker } from "../locking.js"; describe("RunLocker", () => { redisTest("Test acquiring a lock works", { timeout: 15_000 }, async ({ redisOptions }) => { diff --git a/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts b/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts new file mode 100644 index 0000000000..27d8e07172 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts @@ -0,0 +1,1094 @@ +import { + assertNonNullable, + containerTest, + setupAuthenticatedEnvironment, + setupBackgroundWorker, +} from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { RunEngine } from "../index.js"; +import { setTimeout } from "node:timers/promises"; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunEngine Releasing Concurrency", () => { + containerTest("defaults to releasing env concurrency only", async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 1, + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + 
}); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + 
+ expect(queueConcurrencyAfter).toBe(1); + + // Now confirm the environment has a concurrency of 0 + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(0); + + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); + + await setTimeout(500); + + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint).toBe(1); + + // Test that we've reacquired the environment concurrency + const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterWaitpoint).toBe(1); + + // Now we are going to block with another waitpoint, this time specifiying we want to release the concurrency in the waitpoint + const result2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: true, + }); + + expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Test that we've released the queue concurrency + const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint2).toBe(0); + + // Test that we've released the environment concurrency + const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + 
expect(envConcurrencyAfterWaitpoint2).toBe(0); + + // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency + await engine.completeWaitpoint({ + id: result2.waitpoint.id, + }); + + await setTimeout(500); + + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint3 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint3).toBe(1); + }); + + containerTest( + "releases all concurrency when configured on queue", + async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 1, + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier, + undefined, + undefined, + { + releaseConcurrencyOnWaitpoint: true, + } + ); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: 
"test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue concurrency has been released (the queue is configured to release on waitpoint) + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfter).toBe(0); + + // Now confirm the environment has a concurrency of 0 + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(0); + + // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); + + await setTimeout(500); + + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint = await
engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint).toBe(1); + + // Test that we've reacquired the environment concurrency + const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterWaitpoint).toBe(1); + + // Now we are going to block with another waitpoint, this time specifying we don't want to release the concurrency in the waitpoint + const result2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: false, + }); + + expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Test that we've not released the queue concurrency + const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint2).toBe(1); + + // Test that we've still released the environment concurrency since we always release env concurrency + const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterWaitpoint2).toBe(0); + } + ); + + containerTest( + "releases all concurrency for unlimited queues", + async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { +
redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 1, + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker( + prisma, + authenticatedEnvironment, + taskIdentifier, + undefined, + undefined, + { + releaseConcurrencyOnWaitpoint: true, + concurrencyLimit: null, + } + ); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + // Block the run, not specifying any release concurrency option + 
const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue concurrency has been released (unlimited queues always release) + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfter).toBe(0); + + // Now confirm the environment has a concurrency of 0 + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(0); + + // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); + + await setTimeout(500); + + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint).toBe(1); + + // Test that we've reacquired the environment concurrency + const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterWaitpoint).toBe(1); + + // Now we are going to block with another waitpoint, this time specifying we don't want to release the concurrency in the waitpoint + const result2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId:
authenticatedEnvironment.organizationId, + releaseConcurrency: false, + }); + + expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Test that we've not released the queue concurrency + const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterWaitpoint2).toBe(1); + + // Test that we've still released the environment concurrency since we always release env concurrency + const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterWaitpoint2).toBe(0); + } + ); + + containerTest( + "delays env concurrency release when token unavailable", + async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + 
masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + await engine.releaseConcurrencySystem.consumeToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfter).toBe(1); + + // Now confirm the environment is the same as before + const envConcurrencyAfter = await 
engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(1); + + // Now we return the token so the concurrency can be released + await engine.releaseConcurrencySystem.returnToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // Wait until the token is released + await setTimeout(1_000); + + // Now the environment should have a concurrency of 0 + const envConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterReturn).toBe(0); + + // and the queue should have a concurrency of 1 + const queueConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterReturn).toBe(1); + } + ); + + containerTest( + "delays env concurrency release after checkpoint", + async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 
1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + await engine.releaseConcurrencySystem.consumeToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + 
authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfter).toBe(1); + + // Now confirm the environment is the same as before + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(1); + + const checkpointResult = await engine.createCheckpoint({ + runId: run.id, + snapshotId: executingWithWaitpointSnapshot.id, + checkpoint: { + type: "DOCKER", + reason: "TEST_CHECKPOINT", + location: "test-location", + imageRef: "test-image-ref", + }, + }); + + expect(checkpointResult.ok).toBe(true); + + const snapshot = checkpointResult.ok ? checkpointResult.snapshot : null; + assertNonNullable(snapshot); + + console.log("Snapshot", snapshot); + + expect(snapshot.executionStatus).toBe("SUSPENDED"); + + // Now we return the token so the concurrency can be released + await engine.releaseConcurrencySystem.returnToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // Wait until the token is released + await setTimeout(1_000); + + // Now the environment should have a concurrency of 0 + const envConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterReturn).toBe(0); + + // and the queue should have a concurrency of 1 + const queueConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterReturn).toBe(1); + } + ); + + containerTest( + "maintains concurrency after waitpoint completion", + async ({ prisma, redisOptions }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + 
tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + masterQueue: "main", + queueName: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); + + const dequeued = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: run.masterQueue, + maxRunCount: 10, + }); + + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrency).toBe(1); + + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrency).toBe(1); + + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); + + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + + await engine.releaseConcurrencySystem.consumeToken( + { + orgId: 
authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); + + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfter).toBe(1); + + // Now confirm the environment is the same as before + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfter).toBe(1); + + // Complete the waitpoint + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); + + await setTimeout(1_000); + + // Verify the first run is now in EXECUTING state + const executionDataAfter = await engine.getRunExecutionData({ runId: run.id }); + expect(executionDataAfter?.snapshot.executionStatus).toBe("EXECUTING"); + + // Now we return the token so the concurrency can be released + await engine.releaseConcurrencySystem.returnToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); + + // give the release concurrency system time to run + await setTimeout(1_000); + + // Now the environment should have a concurrency of 1 + const envConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); + + expect(envConcurrencyAfterReturn).toBe(1); + + // and the queue should have a 
concurrency of 1 + const queueConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterReturn).toBe(1); + } + ); +}); diff --git a/internal-packages/run-engine/src/engine/tests/releaseConcurrencyTokenBucketQueue.test.ts b/internal-packages/run-engine/src/engine/tests/releaseConcurrencyTokenBucketQueue.test.ts new file mode 100644 index 0000000000..0c416457d9 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/releaseConcurrencyTokenBucketQueue.test.ts @@ -0,0 +1,621 @@ +import { redisTest, StartedRedisContainer } from "@internal/testcontainers"; +import { ReleaseConcurrencyTokenBucketQueue } from "../releaseConcurrencyTokenBucketQueue.js"; +import { setTimeout } from "node:timers/promises"; + +type TestQueueDescriptor = { + name: string; +}; + +function createReleaseConcurrencyQueue( + redisContainer: StartedRedisContainer, + maxTokens: number = 2 +) { + const executedRuns: { releaseQueue: string; runId: string }[] = []; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + executedRuns.push({ releaseQueue: releaseQueue.name, runId }); + }, + maxTokens: async (_) => maxTokens, + keys: { + fromDescriptor: (descriptor) => descriptor.name, + toDescriptor: (name) => ({ name }), + }, + pollInterval: 100, + }); + + return { + queue, + executedRuns, + }; +} + +describe("ReleaseConcurrencyQueue", () => { + redisTest("Should manage token bucket and queue correctly", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 2); + + try { + // First two attempts should execute immediately (we have 2 tokens) + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); 
+ + // Verify first two runs were executed + expect(executedRuns).toHaveLength(2); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run1" }); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run2" }); + + // Third attempt should be queued (no tokens left) + await queue.attemptToRelease({ name: "test-queue" }, "run3"); + expect(executedRuns).toHaveLength(2); // Still 2, run3 is queued + + // Refill one token, should execute run3 + await queue.refillTokens({ name: "test-queue" }, 1); + + // Now we need to wait for the queue to be processed + await setTimeout(1000); + + expect(executedRuns).toHaveLength(3); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run3" }); + } finally { + await queue.quit(); + } + }); + + redisTest("Should handle multiple refills correctly", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 3); + + try { + // Queue up 5 runs (more than maxTokens) + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); + await queue.attemptToRelease({ name: "test-queue" }, "run3"); + await queue.attemptToRelease({ name: "test-queue" }, "run4"); + await queue.attemptToRelease({ name: "test-queue" }, "run5"); + + // First 3 should be executed immediately (maxTokens = 3) + expect(executedRuns).toHaveLength(3); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run1" }); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run2" }); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run3" }); + + // Refill 2 tokens + await queue.refillTokens({ name: "test-queue" }, 2); + + await setTimeout(1000); + + // Should execute the remaining 2 runs + expect(executedRuns).toHaveLength(5); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run4" }); + 
expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run5" }); + } finally { + await queue.quit(); + } + }); + + redisTest("Should handle multiple queues independently", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 1); + + try { + // Add runs to different queues + await queue.attemptToRelease({ name: "queue1" }, "run1"); + await queue.attemptToRelease({ name: "queue1" }, "run2"); + await queue.attemptToRelease({ name: "queue2" }, "run3"); + await queue.attemptToRelease({ name: "queue2" }, "run4"); + + // Only first run from each queue should be executed + expect(executedRuns).toHaveLength(2); + expect(executedRuns).toContainEqual({ releaseQueue: "queue1", runId: "run1" }); + expect(executedRuns).toContainEqual({ releaseQueue: "queue2", runId: "run3" }); + + // Refill tokens for queue1 + await queue.refillTokens({ name: "queue1" }, 1); + + await setTimeout(1000); + + // Should only execute the queued run from queue1 + expect(executedRuns).toHaveLength(3); + expect(executedRuns).toContainEqual({ releaseQueue: "queue1", runId: "run2" }); + + // Refill tokens for queue2 + await queue.refillTokens({ name: "queue2" }, 1); + + await setTimeout(1000); + + // Should execute the queued run from queue2 + expect(executedRuns).toHaveLength(4); + expect(executedRuns).toContainEqual({ releaseQueue: "queue2", runId: "run4" }); + } finally { + await queue.quit(); + } + }); + + redisTest("Should not allow refilling more than maxTokens", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 1); + + try { + // Add two runs + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); + + // First run should be executed immediately + expect(executedRuns).toHaveLength(1); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run1" }); + + // Refill with 
more tokens than needed + await queue.refillTokens({ name: "test-queue" }, 5); + + await setTimeout(1000); + + // Should only execute the one remaining run + expect(executedRuns).toHaveLength(2); + expect(executedRuns).toContainEqual({ releaseQueue: "test-queue", runId: "run2" }); + + // Add another run - should NOT execute immediately because we don't have excess tokens + await queue.attemptToRelease({ name: "test-queue" }, "run3"); + expect(executedRuns).toHaveLength(2); + } finally { + await queue.quit(); + } + }); + + redisTest("Should maintain FIFO order when releasing", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 1); + + try { + // Queue up multiple runs + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); + await queue.attemptToRelease({ name: "test-queue" }, "run3"); + await queue.attemptToRelease({ name: "test-queue" }, "run4"); + + // First run should be executed immediately + expect(executedRuns).toHaveLength(1); + expect(executedRuns[0]).toEqual({ releaseQueue: "test-queue", runId: "run1" }); + + // Refill tokens one at a time and verify order + await queue.refillTokens({ name: "test-queue" }, 1); + + await setTimeout(1000); + + expect(executedRuns).toHaveLength(2); + expect(executedRuns[1]).toEqual({ releaseQueue: "test-queue", runId: "run2" }); + + await queue.refillTokens({ name: "test-queue" }, 1); + + await setTimeout(1000); + + expect(executedRuns).toHaveLength(3); + expect(executedRuns[2]).toEqual({ releaseQueue: "test-queue", runId: "run3" }); + + await queue.refillTokens({ name: "test-queue" }, 1); + + await setTimeout(1000); + + expect(executedRuns).toHaveLength(4); + expect(executedRuns[3]).toEqual({ releaseQueue: "test-queue", runId: "run4" }); + } finally { + await queue.quit(); + } + }); + + redisTest( + "Should handle executor failures by returning the token and adding the item into the queue", + 
async ({ redisContainer }) => { + let shouldFail = true; + + const executedRuns: { releaseQueue: string; runId: string }[] = []; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + if (shouldFail) { + throw new Error("Executor failed"); + } + executedRuns.push({ releaseQueue, runId }); + }, + maxTokens: async (_) => 2, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + batchSize: 2, + retry: { + maxRetries: 2, + backoff: { + minDelay: 100, + maxDelay: 1000, + factor: 1, + }, + }, + pollInterval: 50, + }); + + try { + // Attempt to release with failing executor + await queue.attemptToRelease("test-queue", "run1"); + // Does not execute because the executor throws an error + expect(executedRuns).toHaveLength(0); + + // Token should have been returned to the bucket so this should try to execute immediately and fail again + await queue.attemptToRelease("test-queue", "run2"); + expect(executedRuns).toHaveLength(0); + + // Allow executor to succeed + shouldFail = false; + + await setTimeout(1000); + + // Should now execute successfully + expect(executedRuns).toHaveLength(2); + expect(executedRuns[0]).toEqual({ releaseQueue: "test-queue", runId: "run1" }); + expect(executedRuns[1]).toEqual({ releaseQueue: "test-queue", runId: "run2" }); + } finally { + await queue.quit(); + } + } + ); + + redisTest("Should handle invalid token amounts", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 1); + + try { + // Try to refill with negative tokens + await expect(queue.refillTokens({ name: "test-queue" }, -1)).rejects.toThrow(); + + // Try to refill with zero tokens + await queue.refillTokens({ name: "test-queue" }, 0); + + await setTimeout(1000); + + expect(executedRuns).toHaveLength(0); + + // Verify 
normal operation still works + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + expect(executedRuns).toHaveLength(1); + } finally { + await queue.quit(); + } + }); + + redisTest("Should handle concurrent operations correctly", async ({ redisContainer }) => { + const executedRuns: { releaseQueue: string; runId: string }[] = []; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + // Add small delay to simulate work + await setTimeout(10); + executedRuns.push({ releaseQueue, runId }); + }, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + maxTokens: async (_) => 2, + batchSize: 5, + pollInterval: 50, + }); + + try { + // Attempt multiple concurrent releases + await Promise.all([ + queue.attemptToRelease("test-queue", "run1"), + queue.attemptToRelease("test-queue", "run2"), + queue.attemptToRelease("test-queue", "run3"), + queue.attemptToRelease("test-queue", "run4"), + ]); + + // Should only execute maxTokens (2) runs + expect(executedRuns).toHaveLength(2); + + // Attempt concurrent refills + await queue.refillTokens("test-queue", 2); + + await setTimeout(1000); + + // Should execute remaining runs + expect(executedRuns).toHaveLength(4); + + // Verify all runs were executed exactly once + const runCounts = executedRuns.reduce( + (acc, { runId }) => { + acc[runId] = (acc[runId] || 0) + 1; + return acc; + }, + {} as Record + ); + + Object.values(runCounts).forEach((count) => { + expect(count).toBe(1); + }); + } finally { + await queue.quit(); + } + }); + + redisTest("Should clean up Redis resources on quit", async ({ redisContainer }) => { + const { queue } = createReleaseConcurrencyQueue(redisContainer, 1); + + // Add some data + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: 
"test-queue" }, "run2"); + + // Quit the queue + await queue.quit(); + + // Verify we can't perform operations after quit + await expect(queue.attemptToRelease({ name: "test-queue" }, "run3")).rejects.toThrow(); + await expect(queue.refillTokens({ name: "test-queue" }, 1)).rejects.toThrow(); + }); + + redisTest("Should stop retrying after max retries is reached", async ({ redisContainer }) => { + let failCount = 0; + const executedRuns: { releaseQueue: string; runId: string; attempt: number }[] = []; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + failCount++; + executedRuns.push({ releaseQueue, runId, attempt: failCount }); + throw new Error("Executor failed"); + }, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + maxTokens: async (_) => 1, + retry: { + maxRetries: 2, // Set max retries to 2 (will attempt 3 times total: initial + 2 retries) + backoff: { + minDelay: 100, + maxDelay: 1000, + factor: 1, + }, + }, + pollInterval: 50, // Reduce poll interval for faster test + }); + + try { + // Attempt to release - this will fail and retry + await queue.attemptToRelease("test-queue", "run1"); + + // Wait for retries to occur + await setTimeout(2000); + + // Should have attempted exactly 3 times (initial + 2 retries) + expect(executedRuns).toHaveLength(3); + expect(executedRuns[0]).toEqual({ releaseQueue: "test-queue", runId: "run1", attempt: 1 }); + expect(executedRuns[1]).toEqual({ releaseQueue: "test-queue", runId: "run1", attempt: 2 }); + expect(executedRuns[2]).toEqual({ releaseQueue: "test-queue", runId: "run1", attempt: 3 }); + + // Verify that no more retries occur + await setTimeout(1000); + expect(executedRuns).toHaveLength(3); // Should still be 3 + + // Attempt a new release to verify the token was returned + let secondRunAttempted = 
false; + const queue2 = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + secondRunAttempted = true; + }, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + maxTokens: async (_) => 1, + retry: { + maxRetries: 2, + backoff: { + minDelay: 100, + maxDelay: 1000, + factor: 1, + }, + }, + pollInterval: 50, + }); + + await queue2.attemptToRelease("test-queue", "run2"); + expect(secondRunAttempted).toBe(true); // Should execute immediately because token was returned + + await queue2.quit(); + } finally { + await queue.quit(); + } + }); + + redisTest("Should handle max retries in batch processing", async ({ redisContainer }) => { + const executedRuns: { releaseQueue: string; runId: string; attempt: number }[] = []; + const runAttempts: Record = {}; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + runAttempts[runId] = (runAttempts[runId] || 0) + 1; + executedRuns.push({ releaseQueue, runId, attempt: runAttempts[runId] }); + throw new Error("Executor failed"); + }, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + maxTokens: async (_) => 3, + retry: { + maxRetries: 2, + backoff: { + minDelay: 100, + maxDelay: 1000, + factor: 1, + }, + }, + batchSize: 3, + pollInterval: 100, + }); + + try { + // Queue up multiple runs + await Promise.all([ + queue.attemptToRelease("test-queue", "run1"), + queue.attemptToRelease("test-queue", "run2"), + queue.attemptToRelease("test-queue", "run3"), + ]); + + // Wait for all retries to complete + await setTimeout(2000); + + // Each run should have been attempted exactly 3 times + 
expect(Object.values(runAttempts)).toHaveLength(3); // 3 runs + Object.values(runAttempts).forEach((attempts) => { + expect(attempts).toBe(3); // Each run attempted 3 times + }); + + // Verify execution order maintained retry attempts for each run + const run1Attempts = executedRuns.filter((r) => r.runId === "run1"); + const run2Attempts = executedRuns.filter((r) => r.runId === "run2"); + const run3Attempts = executedRuns.filter((r) => r.runId === "run3"); + + expect(run1Attempts).toHaveLength(3); + expect(run2Attempts).toHaveLength(3); + expect(run3Attempts).toHaveLength(3); + + // Verify attempts are numbered correctly for each run + [run1Attempts, run2Attempts, run3Attempts].forEach((attempts) => { + expect(attempts.map((a) => a.attempt)).toEqual([1, 2, 3]); + }); + + // Verify no more retries occur + await setTimeout(1000); + expect(executedRuns).toHaveLength(9); // 3 runs * 3 attempts each + } finally { + await queue.quit(); + } + }); + + redisTest("Should implement exponential backoff between retries", async ({ redisContainer }) => { + const executionTimes: number[] = []; + let startTime: number; + + const minDelay = 100; + const factor = 2; + + const queue = new ReleaseConcurrencyTokenBucketQueue({ + redis: { + keyPrefix: "release-queue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + executor: async (releaseQueue, runId) => { + const now = Date.now(); + executionTimes.push(now); + console.log(`Execution at ${now - startTime}ms from start`); + throw new Error("Executor failed"); + }, + keys: { + fromDescriptor: (descriptor) => descriptor, + toDescriptor: (name) => name, + }, + maxTokens: async (_) => 1, + retry: { + maxRetries: 2, + backoff: { + minDelay, + maxDelay: 1000, + factor, + }, + }, + pollInterval: 50, + }); + + try { + startTime = Date.now(); + await queue.attemptToRelease("test-queue", "run1"); + + // Wait for all retries to complete + await setTimeout(1000); + + // Should have 3 execution times (initial + 2 
retries) + expect(executionTimes).toHaveLength(3); + + const intervals = executionTimes.slice(1).map((time, i) => time - executionTimes[i]); + console.log("Intervals between retries:", intervals); + + // First retry should be after ~200ms (minDelay + processing overhead) + const expectedFirstDelay = minDelay * 2; // Account for observed overhead + expect(intervals[0]).toBeGreaterThanOrEqual(expectedFirstDelay * 0.8); + expect(intervals[0]).toBeLessThanOrEqual(expectedFirstDelay * 1.5); + + // Second retry should be after ~400ms (first delay * factor) + const expectedSecondDelay = expectedFirstDelay * factor; + expect(intervals[1]).toBeGreaterThanOrEqual(expectedSecondDelay * 0.8); + expect(intervals[1]).toBeLessThanOrEqual(expectedSecondDelay * 1.5); + + // Log expected vs actual delays + console.log("Expected delays:", { first: expectedFirstDelay, second: expectedSecondDelay }); + } finally { + await queue.quit(); + } + }); + + redisTest("Should not execute or queue when maxTokens is 0", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 0); + + try { + // Attempt to release with maxTokens of 0 + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); + + // Wait some time to ensure no processing occurs + await setTimeout(1000); + + // Should not have executed any runs + expect(executedRuns).toHaveLength(0); + } finally { + await queue.quit(); + } + }); + + // Makes sure that the maxTokens is an integer (round down) + // And if it throws, returns 0 + redisTest("Should handle maxTokens errors", async ({ redisContainer }) => { + const { queue, executedRuns } = createReleaseConcurrencyQueue(redisContainer, 0.5); + + try { + // Attempt to release with maxTokens of 0 + await queue.attemptToRelease({ name: "test-queue" }, "run1"); + await queue.attemptToRelease({ name: "test-queue" }, "run2"); + + // Wait some time to ensure no processing 
occurs + await setTimeout(1000); + + // Should not have executed any runs + expect(executedRuns).toHaveLength(0); + } finally { + await queue.quit(); + } + }); +}); diff --git a/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts index 401d03edcc..7b24a6c27c 100644 --- a/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts +++ b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts @@ -372,7 +372,6 @@ describe("RunEngine triggerAndWait", () => { const blockedResult = await engine.blockRunWithWaitpoint({ runId: parentRun2.id, waitpoints: childRunWithWaitpoint.associatedWaitpoint!.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.project.id, organizationId: authenticatedEnvironment.organizationId, tx: prisma, diff --git a/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts index db0eb3ec73..519212aedf 100644 --- a/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts +++ b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts @@ -7,7 +7,7 @@ import { import { trace } from "@internal/tracing"; import { expect } from "vitest"; import { RunEngine } from "../index.js"; -import { setTimeout } from "timers/promises"; +import { setTimeout } from "node:timers/promises"; import { EventBusEventArgs } from "../eventBus.js"; import { isWaitpointOutputTimeout } from "@trigger.dev/core/v3"; @@ -102,12 +102,9 @@ describe("RunEngine Waitpoints", () => { const result = await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: [waitpoint.id], - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.project.id, organizationId: authenticatedEnvironment.organization.id, - releaseConcurrency: { - releaseQueue: true, - }, + releaseConcurrency: true, }); 
expect(result.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); expect(result.runStatus).toBe("EXECUTING"); @@ -218,7 +215,6 @@ describe("RunEngine Waitpoints", () => { const result = await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: [waitpoint.id], - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.project.id, organizationId: authenticatedEnvironment.organization.id, }); @@ -360,7 +356,6 @@ describe("RunEngine Waitpoints", () => { await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }); @@ -500,7 +495,6 @@ describe("RunEngine Waitpoints", () => { await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }); @@ -627,7 +621,6 @@ describe("RunEngine Waitpoints", () => { engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }) @@ -770,7 +763,6 @@ describe("RunEngine Waitpoints", () => { await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }); @@ -921,7 +913,6 @@ describe("RunEngine Waitpoints", () => { await engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }); @@ -1082,7 +1073,6 @@ describe("RunEngine Waitpoints", () => { await 
engine.blockRunWithWaitpoint({ runId: run.id, waitpoints: result.waitpoint.id, - environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, organizationId: authenticatedEnvironment.organizationId, }); @@ -1143,4 +1133,213 @@ describe("RunEngine Waitpoints", () => { engine.quit(); } }); + + containerTest( + "continueRunIfUnblocked enqueues run when cannot reacquire concurrency", + async ({ prisma, redisOptions }) => { + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + redis: redisOptions, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + try { + const taskIdentifier = "test-task"; + const queueName = "task/test-task-limited"; + + // Create background worker + await setupBackgroundWorker(prisma, authenticatedEnvironment, taskIdentifier); + + // Create first run with queue concurrency limit of 1 + const firstRun = await engine.trigger( + { + number: 1, + friendlyId: "run_first", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345-first", + spanId: "s12345-first", + masterQueue: "main", + queueName, + isTest: false, + tags: [], + queue: { concurrencyLimit: 1 }, + }, + prisma + ); + + // Dequeue and start the first run + const dequeuedFirst = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: firstRun.masterQueue, + maxRunCount: 10, + }); + + const firstAttempt = await engine.startRunAttempt({ + runId: dequeuedFirst[0].run.id, + snapshotId: 
dequeuedFirst[0].snapshot.id, + }); + expect(firstAttempt.snapshot.executionStatus).toBe("EXECUTING"); + + // Create a manual waitpoint for the first run + const waitpoint = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); + expect(waitpoint.waitpoint.status).toBe("PENDING"); + + // Block the first run with releaseConcurrency set to true + const blockedResult = await engine.blockRunWithWaitpoint({ + runId: firstRun.id, + waitpoints: waitpoint.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: true, + }); + + // Verify first run is blocked + const firstRunData = await engine.getRunExecutionData({ runId: firstRun.id }); + expect(firstRunData?.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + + // Create and start second run on the same queue + const secondRun = await engine.trigger( + { + number: 2, + friendlyId: "run_second", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345-second", + spanId: "s12345-second", + masterQueue: "main", + queueName, + isTest: false, + tags: [], + queue: { concurrencyLimit: 1 }, + }, + prisma + ); + + // Dequeue and start the second run + const dequeuedSecond = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: secondRun.masterQueue, + maxRunCount: 10, + }); + + const secondAttempt = await engine.startRunAttempt({ + runId: dequeuedSecond[0].run.id, + snapshotId: dequeuedSecond[0].snapshot.id, + }); + expect(secondAttempt.snapshot.executionStatus).toBe("EXECUTING"); + + // Now complete the waitpoint for the first run + await engine.completeWaitpoint({ + id: waitpoint.waitpoint.id, + }); + + // Wait for the continueRunIfUnblocked to process + await setTimeout(500); + + // Verify the first run is now in 
QUEUED_EXECUTING state + const executionDataAfter = await engine.getRunExecutionData({ runId: firstRun.id }); + expect(executionDataAfter?.snapshot.executionStatus).toBe("QUEUED_EXECUTING"); + expect(executionDataAfter?.snapshot.description).toBe( + "Run can continue, but is waiting for concurrency" + ); + + // Verify the waitpoint is no longer blocking the first run + const runWaitpoint = await prisma.taskRunWaitpoint.findFirst({ + where: { + taskRunId: firstRun.id, + }, + include: { + waitpoint: true, + }, + }); + expect(runWaitpoint).toBeNull(); + + // Verify the waitpoint itself is completed + const completedWaitpoint = await prisma.waitpoint.findUnique({ + where: { + id: waitpoint.waitpoint.id, + }, + }); + assertNonNullable(completedWaitpoint); + expect(completedWaitpoint.status).toBe("COMPLETED"); + + // Complete the second run so the first run can be dequeued + const result = await engine.completeRunAttempt({ + runId: dequeuedSecond[0].run.id, + snapshotId: secondAttempt.snapshot.id, + completion: { + ok: true, + id: dequeuedSecond[0].run.id, + output: `{"foo":"bar"}`, + outputType: "application/json", + }, + }); + + await setTimeout(500); + + let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined; + engine.eventBus.on("workerNotification", (result) => { + event = result; + }); + + // Verify the first run is back in the queue + const queuedRun = await engine.dequeueFromMasterQueue({ + consumerId: "test_12345", + masterQueue: firstRun.masterQueue, + maxRunCount: 10, + }); + + expect(queuedRun.length).toBe(0); + + // Get the latest execution snapshot and make sure it's EXECUTING + const executionData = await engine.getRunExecutionData({ runId: firstRun.id }); + assertNonNullable(executionData); + expect(executionData.snapshot.executionStatus).toBe("EXECUTING"); + + assertNonNullable(event); + const notificationEvent = event as EventBusEventArgs<"workerNotification">[0]; + expect(notificationEvent.run.id).toBe(firstRun.id); + 
expect(notificationEvent.snapshot.executionStatus).toBe("EXECUTING"); + } finally { + await engine.quit(); + } + } + ); }); diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index aea71d605b..59274daf89 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -1,18 +1,19 @@ -import { type WorkerConcurrencyOptions } from "@internal/redis-worker"; +import { type RedisOptions } from "@internal/redis"; +import { Worker, type WorkerConcurrencyOptions } from "@internal/redis-worker"; import { Tracer } from "@internal/tracing"; import { MachinePreset, MachinePresetName, QueueOptions, RetryOptions } from "@trigger.dev/core/v3"; import { PrismaClient } from "@trigger.dev/database"; -import { type RedisOptions } from "@internal/redis"; -import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js"; +import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; +import { workerCatalog } from "./workerCatalog.js"; export type RunEngineOptions = { prisma: PrismaClient; - worker: WorkerConcurrencyOptions & { + worker: { redis: RedisOptions; pollIntervalMs?: number; immediatePollIntervalMs?: number; - }; + } & WorkerConcurrencyOptions; machines: { defaultMachine: MachinePresetName; machines: Record; @@ -35,6 +36,20 @@ export type RunEngineOptions = { heartbeatTimeoutsMs?: Partial; queueRunsWaitingForWorkerBatchSize?: number; tracer: Tracer; + releaseConcurrency?: { + disabled?: boolean; + maxTokensRatio?: number; + redis?: Partial; + maxRetries?: number; + consumersCount?: number; + pollInterval?: number; + batchSize?: number; + backoff?: { + minDelay?: number; // Defaults to 1000 + maxDelay?: number; // Defaults to 60000 + factor?: number; // Defaults to 2 + }; + }; }; export type HeartbeatTimeouts = { @@ -91,4 +106,7 @@ export type TriggerParams 
= { machine?: MachinePresetName; workerId?: string; runnerId?: string; + releaseConcurrency?: boolean; }; + +export type EngineWorker = Worker; diff --git a/internal-packages/run-engine/src/engine/workerCatalog.ts b/internal-packages/run-engine/src/engine/workerCatalog.ts new file mode 100644 index 0000000000..e4d945d654 --- /dev/null +++ b/internal-packages/run-engine/src/engine/workerCatalog.ts @@ -0,0 +1,56 @@ +import { z } from "zod"; + +export const workerCatalog = { + finishWaitpoint: { + schema: z.object({ + waitpointId: z.string(), + error: z.string().optional(), + }), + visibilityTimeoutMs: 5000, + }, + heartbeatSnapshot: { + schema: z.object({ + runId: z.string(), + snapshotId: z.string(), + }), + visibilityTimeoutMs: 5000, + }, + expireRun: { + schema: z.object({ + runId: z.string(), + }), + visibilityTimeoutMs: 5000, + }, + cancelRun: { + schema: z.object({ + runId: z.string(), + completedAt: z.coerce.date(), + reason: z.string().optional(), + }), + visibilityTimeoutMs: 5000, + }, + queueRunsWaitingForWorker: { + schema: z.object({ + backgroundWorkerId: z.string(), + }), + visibilityTimeoutMs: 5000, + }, + tryCompleteBatch: { + schema: z.object({ + batchId: z.string(), + }), + visibilityTimeoutMs: 10_000, + }, + continueRunIfUnblocked: { + schema: z.object({ + runId: z.string(), + }), + visibilityTimeoutMs: 10_000, + }, + enqueueDelayedRun: { + schema: z.object({ + runId: z.string(), + }), + visibilityTimeoutMs: 10_000, + }, +}; diff --git a/internal-packages/run-engine/src/index.ts b/internal-packages/run-engine/src/index.ts index 89bd08196d..8d77c66a04 100644 --- a/internal-packages/run-engine/src/index.ts +++ b/internal-packages/run-engine/src/index.ts @@ -1,2 +1,3 @@ -export { RunEngine, RunDuplicateIdempotencyKeyError } from "./engine/index.js"; +export { RunEngine } from "./engine/index.js"; +export { RunDuplicateIdempotencyKeyError } from "./engine/errors.js"; export type { EventBusEventArgs } from "./engine/eventBus.js"; diff --git 
a/internal-packages/run-engine/src/run-queue/errors.ts b/internal-packages/run-engine/src/run-queue/errors.ts new file mode 100644 index 0000000000..eecebdab54 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/errors.ts @@ -0,0 +1,5 @@ +export class MessageNotFoundError extends Error { + constructor(messageId: string) { + super(`Message not found: ${messageId}`); + } +} diff --git a/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.ts b/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.ts index eb65c41513..e46177ec0d 100644 --- a/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.ts +++ b/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.ts @@ -52,7 +52,6 @@ export type FairQueueSelectionStrategyOptions = { type FairQueueConcurrency = { current: number; limit: number; - reserve: number; }; type FairQueue = { id: string; age: number; org: string; env: string; project: string }; @@ -403,7 +402,7 @@ export class FairQueueSelectionStrategy implements RunQueueSelectionStrategy { ); const envsAtFullConcurrency = envs.filter( - (env) => env.concurrency.current >= env.concurrency.limit + env.concurrency.reserve + (env) => env.concurrency.current >= env.concurrency.limit ); const envIdsAtFullConcurrency = new Set(envsAtFullConcurrency.map((env) => env.id)); @@ -500,17 +499,15 @@ export class FairQueueSelectionStrategy implements RunQueueSelectionStrategy { span.setAttribute("org_id", env.orgId); span.setAttribute("project_id", env.projectId); - const [currentValue, limitValue, reserveValue] = await Promise.all([ + const [currentValue, limitValue] = await Promise.all([ this.#getEnvCurrentConcurrency(env), this.#getEnvConcurrencyLimit(env), - this.#getEnvReserveConcurrency(env), ]); span.setAttribute("current_value", currentValue); span.setAttribute("limit_value", limitValue); - span.setAttribute("reserve_value", reserveValue); - return { current: currentValue, limit: limitValue, 
reserve: reserveValue }; + return { current: currentValue, limit: limitValue }; }); } @@ -587,22 +584,6 @@ export class FairQueueSelectionStrategy implements RunQueueSelectionStrategy { }); } - async #getEnvReserveConcurrency(env: EnvDescriptor) { - return await startSpan(this.options.tracer, "getEnvReserveConcurrency", async (span) => { - span.setAttribute("env_id", env.envId); - span.setAttribute("org_id", env.orgId); - span.setAttribute("project_id", env.projectId); - - const key = this.options.keys.envReserveConcurrencyKey(env); - - const result = await this._redis.scard(key); - - span.setAttribute("current_value", result); - - return result; - }); - } - #envDescriptorFromFairQueue(queue: FairQueue): EnvDescriptor { return { envId: queue.env, diff --git a/internal-packages/run-engine/src/run-queue/index.test.ts b/internal-packages/run-engine/src/run-queue/index.test.ts index 7fcae56eeb..dbbb574bfc 100644 --- a/internal-packages/run-engine/src/run-queue/index.test.ts +++ b/internal-packages/run-engine/src/run-queue/index.test.ts @@ -216,13 +216,6 @@ describe("RunQueue", () => { expect(queueConcurrency).toBe(0); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency).toBe(0); - const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvDev); - expect(projectConcurrency).toBe(0); - const taskConcurrency = await queue.currentConcurrencyOfTask( - authenticatedEnvDev, - messageDev.taskIdentifier - ); - expect(taskConcurrency).toBe(0); const dequeued = await queue.dequeueMessageFromMasterQueue( "test_12345", @@ -243,13 +236,6 @@ describe("RunQueue", () => { expect(queueConcurrency2).toBe(1); const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency2).toBe(1); - const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvDev); - expect(projectConcurrency2).toBe(1); - const taskConcurrency2 = await 
queue.currentConcurrencyOfTask( - authenticatedEnvDev, - messageDev.taskIdentifier - ); - expect(taskConcurrency2).toBe(1); //queue lengths const result3 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); @@ -337,13 +323,6 @@ describe("RunQueue", () => { expect(queueConcurrency).toBe(0); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency).toBe(0); - const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency).toBe(0); - const taskConcurrency = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency).toBe(0); //dequeue const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); @@ -361,13 +340,6 @@ describe("RunQueue", () => { expect(queueConcurrency2).toBe(1); const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency2).toBe(1); - const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency2).toBe(1); - const taskConcurrency2 = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency2).toBe(1); //queue length const length2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); @@ -517,13 +489,6 @@ describe("RunQueue", () => { expect(queueConcurrency).toBe(0); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency).toBe(0); - const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency).toBe(0); - const taskConcurrency = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency).toBe(0); //queue lengths const queueLength3 = await queue.lengthOfQueue(authenticatedEnvProd, 
messageProd.queue); @@ -586,13 +551,6 @@ describe("RunQueue", () => { expect(queueConcurrency).toBe(0); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency).toBe(0); - const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency).toBe(0); - const taskConcurrency = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency).toBe(0); //queue lengths const queueLength3 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); @@ -651,13 +609,6 @@ describe("RunQueue", () => { expect(queueConcurrency).toBe(1); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency).toBe(1); - const projectConcurrency = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency).toBe(1); - const taskConcurrency = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency).toBe(1); await queue.nackMessage({ orgId: messages[0].message.orgId, @@ -675,13 +626,6 @@ describe("RunQueue", () => { expect(queueConcurrency2).toBe(0); const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency2).toBe(0); - const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency2).toBe(0); - const taskConcurrency2 = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency2).toBe(0); //queue lengths const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); @@ -704,129 +648,97 @@ describe("RunQueue", () => { } }); - redisTest( - "Releasing concurrency", - { timeout: 5_000 }, - async ({ redisContainer, redisOptions }) => { - const queue = new RunQueue({ - ...testOptions, - 
queueSelectionStrategy: new FairQueueSelectionStrategy({ - redis: { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }, - keys: testOptions.keys, - }), + redisTest("Releasing concurrency", async ({ redisContainer, redisOptions }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ redis: { keyPrefix: "runqueue:test:", host: redisContainer.getHost(), port: redisContainer.getPort(), }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + const redis = createRedisClient({ ...redisOptions, keyPrefix: "runqueue:test:" }); + + try { + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", }); - const redis = createRedisClient({ ...redisOptions, keyPrefix: "runqueue:test:" }); + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); + expect(messages.length).toBe(1); - try { - await queue.enqueueMessage({ - env: authenticatedEnvProd, - message: messageProd, - masterQueues: "main", - }); + //check the message is gone + const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); + const exists = await redis.exists(key); + expect(exists).toBe(1); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); - //check the message is gone - const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); - const exists = await redis.exists(key); - expect(exists).toBe(1); + //release the concurrency + await queue.releaseAllConcurrency( + authenticatedEnvProd.organization.id, + 
messages[0].messageId + ); - //concurrencies - expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( - 1 - ); - expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); - expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1); - expect( - await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier) - ).toBe(1); - - //release the concurrency (not the queue) - await queue.releaseConcurrency( - authenticatedEnvProd.organization.id, - messages[0].messageId, - false - ); + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 0 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); - //concurrencies - expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( - 1 - ); - expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); - expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(0); - expect( - await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier) - ).toBe(0); - - //reacquire the concurrency - await queue.reacquireConcurrency( - authenticatedEnvProd.organization.id, - messages[0].messageId - ); + //reacquire the concurrency + await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, messages[0].messageId); - //check concurrencies are back to what they were before - expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( - 1 - ); - expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); - expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1); - expect( - await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier) - ).toBe(1); - - //release the concurrency (with the queue this time) - await queue.releaseConcurrency( - 
authenticatedEnvProd.organization.id, - messages[0].messageId, - true - ); + //check concurrencies are back to what they were before + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); - //concurrencies - expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( - 0 - ); - expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); - expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(0); - expect( - await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier) - ).toBe(0); - - //reacquire the concurrency - await queue.reacquireConcurrency( - authenticatedEnvProd.organization.id, - messages[0].messageId - ); + //release the concurrency (with the queue this time) + await queue.releaseAllConcurrency( + authenticatedEnvProd.organization.id, + messages[0].messageId + ); - //check concurrencies are back to what they were before - expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( - 1 - ); - expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); - expect(await queue.currentConcurrencyOfProject(authenticatedEnvProd)).toBe(1); - expect( - await queue.currentConcurrencyOfTask(authenticatedEnvProd, messageProd.taskIdentifier) - ).toBe(1); - } finally { - try { - await queue.quit(); - await redis.quit(); - } catch (e) {} - } + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 0 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); + + //reacquire the concurrency + await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, messages[0].messageId); + + //check concurrencies are back to what they were before + expect(await 
queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + try { + await queue.quit(); + await redis.quit(); + } catch (e) {} } - ); + }); - redisTest("Dead Letter Queue", { timeout: 8_000 }, async ({ redisContainer, redisOptions }) => { + redisTest("Dead Letter Queue", async ({ redisContainer, redisOptions }) => { const queue = new RunQueue({ ...testOptions, retryOptions: { @@ -882,45 +794,32 @@ describe("RunQueue", () => { expect(queueConcurrency2).toBe(0); const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency2).toBe(0); - const projectConcurrency2 = await queue.currentConcurrencyOfProject(authenticatedEnvProd); - expect(projectConcurrency2).toBe(0); - const taskConcurrency2 = await queue.currentConcurrencyOfTask( - authenticatedEnvProd, - messageProd.taskIdentifier - ); - expect(taskConcurrency2).toBe(0); //check the message is still there - const exists2 = await redis.exists(key); - expect(exists2).toBe(1); + const message = await queue.readMessage(messages[0].message.orgId, messages[0].messageId); + expect(message).toBeDefined(); - //check it's in the dlq - const dlqKey = "dlq"; - const dlqExists = await redis.exists(dlqKey); - expect(dlqExists).toBe(1); - const dlqMembers = await redis.zrange(dlqKey, 0, -1); - expect(dlqMembers).toContain(messageProd.runId); + const deadLetterQueueLengthBefore = await queue.lengthOfDeadLetterQueue(authenticatedEnvProd); + expect(deadLetterQueueLengthBefore).toBe(1); - //redrive - const redisClient = createRedisClient({ - host: redisContainer.getHost(), - port: redisContainer.getPort(), - password: redisContainer.getPassword(), - }); - - // Publish redrive message - await redisClient.publish( - "rq:redrive", - JSON.stringify({ runId: messageProd.runId, orgId: messageProd.orgId }) + const existsInDlq = await queue.messageInDeadLetterQueue( + 
authenticatedEnvProd, + messageProd.runId ); + expect(existsInDlq).toBe(true); + + //redrive + await queue.redriveMessage(authenticatedEnvProd, messageProd.runId); // Wait for the item to be redrived and processed await setTimeout(5_000); - await redisClient.quit(); //shouldn't be in the dlq now - const dlqMembersAfter = await redis.zrange(dlqKey, 0, -1); - expect(dlqMembersAfter).not.toContain(messageProd.runId); + const existsInDlqAfter = await queue.messageInDeadLetterQueue( + authenticatedEnvProd, + messageProd.runId + ); + expect(existsInDlqAfter).toBe(false); //dequeue const messages3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts index dcedf121ec..a5aacd957f 100644 --- a/internal-packages/run-engine/src/run-queue/index.ts +++ b/internal-packages/run-engine/src/run-queue/index.ts @@ -29,6 +29,7 @@ import { type RedisOptions, type Result, } from "@internal/redis"; +import { MessageNotFoundError } from "./errors.js"; const SemanticAttributes = { QUEUE: "runqueue.queue", @@ -156,6 +157,28 @@ export class RunQueue { return this.redis.zcard(this.keys.envQueueKey(env)); } + public async lengthOfDeadLetterQueue(env: MinimalAuthenticatedEnvironment) { + return this.redis.zcard(this.keys.deadLetterQueueKey(env)); + } + + public async messageInDeadLetterQueue(env: MinimalAuthenticatedEnvironment, messageId: string) { + const result = await this.redis.zscore(this.keys.deadLetterQueueKey(env), messageId); + return !!result; + } + + public async redriveMessage(env: MinimalAuthenticatedEnvironment, messageId: string) { + // Publish redrive message + await this.redis.publish( + "rq:redrive", + JSON.stringify({ + runId: messageId, + orgId: env.organization.id, + envId: env.id, + projectId: env.project.id, + }) + ); + } + public async oldestMessageInQueue( env: MinimalAuthenticatedEnvironment, queue: string, @@ -258,15 +281,43 @@ 
export class RunQueue { return this.redis.scard(this.keys.envCurrentConcurrencyKey(env)); } - public async currentConcurrencyOfProject(env: MinimalAuthenticatedEnvironment) { - return this.redis.scard(this.keys.projectCurrentConcurrencyKey(env)); + public async messageExists(orgId: string, messageId: string) { + return this.redis.exists(this.keys.messageKey(orgId, messageId)); } - public async currentConcurrencyOfTask( - env: MinimalAuthenticatedEnvironment, - taskIdentifier: string - ) { - return this.redis.scard(this.keys.taskIdentifierCurrentConcurrencyKey(env, taskIdentifier)); + public async readMessage(orgId: string, messageId: string) { + return this.#trace( + "readMessage", + async (span) => { + const rawMessage = await this.redis.get(this.keys.messageKey(orgId, messageId)); + + if (!rawMessage) { + return; + } + + const message = OutputPayload.safeParse(JSON.parse(rawMessage)); + + if (!message.success) { + this.logger.error(`[${this.name}] Failed to parse message`, { + messageId, + error: message.error, + service: this.name, + }); + + return; + } + + return message.data; + }, + { + attributes: { + [SEMATTRS_MESSAGING_OPERATION]: "receive", + [SEMATTRS_MESSAGE_ID]: messageId, + [SEMATTRS_MESSAGING_SYSTEM]: "marqs", + [SemanticAttributes.RUN_ID]: messageId, + }, + } + ); } public async enqueueMessage({ @@ -304,7 +355,7 @@ export class RunQueue { attempt: 0, }; - await this.#callEnqueueMessage(messagePayload, parentQueues); + return await this.#callEnqueueMessage(messagePayload, parentQueues); }, { kind: SpanKind.PRODUCER, @@ -373,15 +424,6 @@ export class RunQueue { // Attempt to dequeue from this queue const message = await this.#callDequeueMessage({ messageQueue: queue, - concurrencyLimitKey: this.keys.concurrencyLimitKeyFromQueue(queue), - currentConcurrencyKey: this.keys.currentConcurrencyKeyFromQueue(queue), - envConcurrencyLimitKey: this.keys.envConcurrencyLimitKeyFromQueue(queue), - envCurrentConcurrencyKey: 
this.keys.envCurrentConcurrencyKeyFromQueue(queue), - projectCurrentConcurrencyKey: this.keys.projectCurrentConcurrencyKeyFromQueue(queue), - messageKeyPrefix: this.keys.messageKeyPrefixFromQueue(queue), - envQueueKey: this.keys.envQueueKeyFromQueue(queue), - taskCurrentConcurrentKeyPrefix: - this.keys.taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queue), }); if (message) { @@ -428,13 +470,10 @@ export class RunQueue { return this.#trace( "acknowledgeMessage", async (span) => { - const message = await this.#readMessage(orgId, messageId); + const message = await this.readMessage(orgId, messageId); if (!message) { - this.logger.log(`[${this.name}].acknowledgeMessage() message not found`, { - messageId, - service: this.name, - }); + // Message not found, it may have already been acknowledged return; } @@ -446,18 +485,7 @@ export class RunQueue { }); await this.#callAcknowledgeMessage({ - messageId, - messageQueue: message.queue, - masterQueues: message.masterQueues, - messageKey: this.keys.messageKey(orgId, messageId), - concurrencyKey: this.keys.currentConcurrencyKeyFromQueue(message.queue), - envConcurrencyKey: this.keys.envCurrentConcurrencyKeyFromQueue(message.queue), - taskConcurrencyKey: this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue( - message.queue, - message.taskIdentifier - ), - envQueueKey: this.keys.envQueueKeyFromQueue(message.queue), - projectConcurrencyKey: this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue), + message, }); }, { @@ -491,7 +519,7 @@ export class RunQueue { async (span) => { const maxAttempts = this.retryOptions.maxAttempts ?? 
defaultRetrySettings.maxAttempts; - const message = await this.#readMessage(orgId, messageId); + const message = await this.readMessage(orgId, messageId); if (!message) { this.logger.log(`[${this.name}].nackMessage() message not found`, { orgId, @@ -510,75 +538,16 @@ export class RunQueue { [SemanticAttributes.MASTER_QUEUES]: message.masterQueues.join(","), }); - const messageKey = this.keys.messageKey(orgId, messageId); - const messageQueue = message.queue; - const concurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); - const envConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); - const taskConcurrencyKey = this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue( - message.queue, - message.taskIdentifier - ); - const projectConcurrencyKey = this.keys.projectCurrentConcurrencyKeyFromQueue( - message.queue - ); - const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); - if (incrementAttemptCount) { message.attempt = message.attempt + 1; if (message.attempt >= maxAttempts) { - await this.redis.moveToDeadLetterQueue( - messageKey, - messageQueue, - concurrencyKey, - envConcurrencyKey, - projectConcurrencyKey, - envQueueKey, - taskConcurrencyKey, - "dlq", - messageId, - messageQueue, - JSON.stringify(message.masterQueues), - this.options.redis.keyPrefix ?? "" - ); + await this.#callMoveToDeadLetterQueue({ message }); return false; } } - const nextRetryDelay = calculateNextRetryDelay(this.retryOptions, message.attempt); - const messageScore = retryAt ?? (nextRetryDelay ? 
Date.now() + nextRetryDelay : Date.now()); - - this.logger.debug("Calling nackMessage", { - messageKey, - messageQueue, - masterQueues: message.masterQueues, - concurrencyKey, - envConcurrencyKey, - projectConcurrencyKey, - envQueueKey, - taskConcurrencyKey, - messageId, - messageScore, - attempt: message.attempt, - service: this.name, - }); + await this.#callNackMessage({ message }); - await this.redis.nackMessage( - //keys - messageKey, - messageQueue, - concurrencyKey, - envConcurrencyKey, - projectConcurrencyKey, - envQueueKey, - taskConcurrencyKey, - //args - messageId, - messageQueue, - JSON.stringify(message), - String(messageScore), - JSON.stringify(message.masterQueues), - this.options.redis.keyPrefix ?? "" - ); return true; }, { @@ -592,18 +561,17 @@ export class RunQueue { ); } - public async releaseConcurrency( - orgId: string, - messageId: string, - releaseForRun: boolean = false - ) { + /** + * Release all concurrency for a message, including environment and queue concurrency + */ + public async releaseAllConcurrency(orgId: string, messageId: string) { return this.#trace( - "releaseConcurrency", + "releaseAllConcurrency", async (span) => { - const message = await this.#readMessage(orgId, messageId); + const message = await this.readMessage(orgId, messageId); if (!message) { - this.logger.log(`[${this.name}].acknowledgeMessage() message not found`, { + this.logger.log(`[${this.name}].releaseAllConcurrency() message not found`, { messageId, service: this.name, }); @@ -618,23 +586,15 @@ export class RunQueue { }); return this.redis.releaseConcurrency( - this.keys.messageKey(orgId, messageId), - message.queue, - releaseForRun ? 
this.keys.currentConcurrencyKeyFromQueue(message.queue) : "", + this.keys.currentConcurrencyKeyFromQueue(message.queue), this.keys.envCurrentConcurrencyKeyFromQueue(message.queue), - this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue), - this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue( - message.queue, - message.taskIdentifier - ), - messageId, - JSON.stringify(message.masterQueues) + messageId ); }, { kind: SpanKind.CONSUMER, attributes: { - [SEMATTRS_MESSAGING_OPERATION]: "releaseConcurrency", + [SEMATTRS_MESSAGING_OPERATION]: "releaseAllConcurrency", [SEMATTRS_MESSAGE_ID]: messageId, [SEMATTRS_MESSAGING_SYSTEM]: "runqueue", }, @@ -642,14 +602,14 @@ export class RunQueue { ); } - public async reacquireConcurrency(orgId: string, messageId: string) { + public async releaseEnvConcurrency(orgId: string, messageId: string) { return this.#trace( - "reacquireConcurrency", + "releaseEnvConcurrency", async (span) => { - const message = await this.#readMessage(orgId, messageId); + const message = await this.readMessage(orgId, messageId); if (!message) { - this.logger.log(`[${this.name}].acknowledgeMessage() message not found`, { + this.logger.log(`[${this.name}].releaseEnvConcurrency() message not found`, { messageId, service: this.name, }); @@ -663,19 +623,54 @@ export class RunQueue { [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey, }); - return this.redis.reacquireConcurrency( - this.keys.messageKey(orgId, messageId), - message.queue, - this.keys.currentConcurrencyKeyFromQueue(message.queue), + return this.redis.releaseEnvConcurrency( this.keys.envCurrentConcurrencyKeyFromQueue(message.queue), - this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue), - this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue( - message.queue, - message.taskIdentifier - ), + messageId + ); + }, + { + kind: SpanKind.CONSUMER, + attributes: { + [SEMATTRS_MESSAGING_OPERATION]: "releaseEnvConcurrency", + [SEMATTRS_MESSAGE_ID]: messageId, + 
[SEMATTRS_MESSAGING_SYSTEM]: "runqueue", + }, + } + ); + } + + public async reacquireConcurrency(orgId: string, messageId: string) { + return this.#trace( + "reacquireConcurrency", + async (span) => { + const message = await this.readMessage(orgId, messageId); + + if (!message) { + throw new MessageNotFoundError(messageId); + } + + span.setAttributes({ + [SemanticAttributes.QUEUE]: message.queue, + [SemanticAttributes.ORG_ID]: message.orgId, + [SemanticAttributes.RUN_ID]: messageId, + [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey, + }); + + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); + const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); + const queueConcurrencyLimitKey = this.keys.concurrencyLimitKeyFromQueue(message.queue); + const envConcurrencyLimitKey = this.keys.envConcurrencyLimitKeyFromQueue(message.queue); + + const result = await this.redis.reacquireConcurrency( + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + queueConcurrencyLimitKey, + envConcurrencyLimitKey, messageId, - JSON.stringify(message.masterQueues) + String(this.options.defaultEnvConcurrency) ); + + return !!result; }, { kind: SpanKind.CONSUMER, @@ -696,16 +691,21 @@ export class RunQueue { private async handleRedriveMessage(channel: string, message: string) { try { - const { runId, orgId } = JSON.parse(message) as any; - if (typeof orgId !== "string" || typeof runId !== "string") { + const { runId, envId, projectId, orgId } = JSON.parse(message) as any; + if ( + typeof orgId !== "string" || + typeof runId !== "string" || + typeof envId !== "string" || + typeof projectId !== "string" + ) { this.logger.error( - "handleRedriveMessage: invalid message format: runId and orgId must be strings", + "handleRedriveMessage: invalid message format: runId, envId, projectId and orgId must be strings", { message, channel } ); return; } - const data = await this.#readMessage(orgId, runId); + 
const data = await this.readMessage(orgId, runId); if (!data) { this.logger.error(`handleRedriveMessage: couldn't read message`, { orgId, runId, channel }); @@ -733,7 +733,10 @@ export class RunQueue { }); //remove from the dlq - const result = await this.redis.zrem("dlq", runId); + const result = await this.redis.zrem( + this.keys.deadLetterQueueKey({ envId, orgId, projectId }), + runId + ); if (result === 0) { this.logger.error(`handleRedriveMessage: couldn't remove message from dlq`, { @@ -794,107 +797,80 @@ export class RunQueue { this.subscriber.on("message", this.handleRedriveMessage.bind(this)); } - async #readMessage(orgId: string, messageId: string) { - return this.#trace( - "readMessage", - async (span) => { - const rawMessage = await this.redis.get(this.keys.messageKey(orgId, messageId)); - - if (!rawMessage) { - return; - } - - const message = OutputPayload.safeParse(JSON.parse(rawMessage)); - - if (!message.success) { - this.logger.error(`[${this.name}] Failed to parse message`, { - messageId, - error: message.error, - service: this.name, - }); - - return; - } - - return message.data; - }, - { - attributes: { - [SEMATTRS_MESSAGING_OPERATION]: "receive", - [SEMATTRS_MESSAGE_ID]: messageId, - [SEMATTRS_MESSAGING_SYSTEM]: "marqs", - [SemanticAttributes.RUN_ID]: messageId, - }, - } - ); - } - async #callEnqueueMessage(message: OutputPayload, masterQueues: string[]) { - const concurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); - const envConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); - const taskConcurrencyKey = this.keys.taskIdentifierCurrentConcurrencyKeyFromQueue( - message.queue, - message.taskIdentifier - ); - const projectConcurrencyKey = this.keys.projectCurrentConcurrencyKeyFromQueue(message.queue); + const queueKey = message.queue; + const messageKey = this.keys.messageKey(message.orgId, message.runId); + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); + 
const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); + const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); + + const queueName = message.queue; + const messageId = message.runId; + const messageData = JSON.stringify(message); + const messageScore = String(message.timestamp); + const $masterQueues = JSON.stringify(masterQueues); + const keyPrefix = this.options.redis.keyPrefix ?? ""; this.logger.debug("Calling enqueueMessage", { - messagePayload: message, - concurrencyKey, - envConcurrencyKey, - masterQueues, + queueKey, + messageKey, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + envQueueKey, + queueName, + messageId, + messageData, + messageScore, + masterQueues: $masterQueues, service: this.name, }); - return this.redis.enqueueMessage( - message.queue, - this.keys.messageKey(message.orgId, message.runId), - concurrencyKey, - envConcurrencyKey, - taskConcurrencyKey, - projectConcurrencyKey, - this.keys.envQueueKeyFromQueue(message.queue), - message.queue, - message.runId, - JSON.stringify(message), - String(message.timestamp), - JSON.stringify(masterQueues), - this.options.redis.keyPrefix ?? 
"" + await this.redis.enqueueMessage( + queueKey, + messageKey, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + envQueueKey, + queueName, + messageId, + messageData, + messageScore, + $masterQueues, + keyPrefix ); } async #callDequeueMessage({ messageQueue, - concurrencyLimitKey, - envConcurrencyLimitKey, - currentConcurrencyKey, - envCurrentConcurrencyKey, - projectCurrentConcurrencyKey, - messageKeyPrefix, - envQueueKey, - taskCurrentConcurrentKeyPrefix, }: { messageQueue: string; - concurrencyLimitKey: string; - envConcurrencyLimitKey: string; - currentConcurrencyKey: string; - envCurrentConcurrencyKey: string; - projectCurrentConcurrencyKey: string; - messageKeyPrefix: string; - envQueueKey: string; - taskCurrentConcurrentKeyPrefix: string; }): Promise { + const queueConcurrencyLimitKey = this.keys.concurrencyLimitKeyFromQueue(messageQueue); + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(messageQueue); + const envConcurrencyLimitKey = this.keys.envConcurrencyLimitKeyFromQueue(messageQueue); + const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(messageQueue); + const messageKeyPrefix = this.keys.messageKeyPrefixFromQueue(messageQueue); + const envQueueKey = this.keys.envQueueKeyFromQueue(messageQueue); + + this.logger.debug("#callDequeueMessage", { + messageQueue, + queueConcurrencyLimitKey, + envConcurrencyLimitKey, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + messageKeyPrefix, + envQueueKey, + }); + const result = await this.redis.dequeueMessage( //keys messageQueue, - concurrencyLimitKey, + queueConcurrencyLimitKey, envConcurrencyLimitKey, - currentConcurrencyKey, + queueCurrentConcurrencyKey, envCurrentConcurrencyKey, - projectCurrentConcurrencyKey, messageKeyPrefix, envQueueKey, - taskCurrentConcurrentKeyPrefix, //args messageQueue, String(Date.now()), @@ -942,35 +918,21 @@ export class RunQueue { }; } - async #callAcknowledgeMessage({ - messageId, - masterQueues, - 
messageKey, - messageQueue, - concurrencyKey, - envConcurrencyKey, - taskConcurrencyKey, - envQueueKey, - projectConcurrencyKey, - }: { - masterQueues: string[]; - messageKey: string; - messageQueue: string; - concurrencyKey: string; - envConcurrencyKey: string; - taskConcurrencyKey: string; - envQueueKey: string; - projectConcurrencyKey: string; - messageId: string; - }) { + async #callAcknowledgeMessage({ message }: { message: OutputPayload }) { + const messageId = message.runId; + const messageKey = this.keys.messageKey(message.orgId, messageId); + const messageQueue = message.queue; + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); + const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); + const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); + const masterQueues = message.masterQueues; + this.logger.debug("Calling acknowledgeMessage", { messageKey, messageQueue, - concurrencyKey, - envConcurrencyKey, - projectConcurrencyKey, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, envQueueKey, - taskConcurrencyKey, messageId, masterQueues, service: this.name, @@ -979,11 +941,9 @@ export class RunQueue { return this.redis.acknowledgeMessage( messageKey, messageQueue, - concurrencyKey, - envConcurrencyKey, - projectConcurrencyKey, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, envQueueKey, - taskConcurrencyKey, messageId, messageQueue, JSON.stringify(masterQueues), @@ -991,6 +951,70 @@ export class RunQueue { ); } + async #callNackMessage({ message, retryAt }: { message: OutputPayload; retryAt?: number }) { + const messageId = message.runId; + const messageKey = this.keys.messageKey(message.orgId, message.runId); + const messageQueue = message.queue; + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); + const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); + const envQueueKey = 
this.keys.envQueueKeyFromQueue(message.queue); + + const nextRetryDelay = calculateNextRetryDelay(this.retryOptions, message.attempt); + const messageScore = retryAt ?? (nextRetryDelay ? Date.now() + nextRetryDelay : Date.now()); + + this.logger.debug("Calling nackMessage", { + messageKey, + messageQueue, + masterQueues: message.masterQueues, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + envQueueKey, + messageId, + messageScore, + attempt: message.attempt, + service: this.name, + }); + + await this.redis.nackMessage( + //keys + messageKey, + messageQueue, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + envQueueKey, + //args + messageId, + messageQueue, + JSON.stringify(message), + String(messageScore), + JSON.stringify(message.masterQueues), + this.options.redis.keyPrefix ?? "" + ); + } + + async #callMoveToDeadLetterQueue({ message }: { message: OutputPayload }) { + const messageId = message.runId; + const messageKey = this.keys.messageKey(message.orgId, message.runId); + const messageQueue = message.queue; + const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); + const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); + const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); + const deadLetterQueueKey = this.keys.deadLetterQueueKeyFromQueue(message.queue); + + await this.redis.moveToDeadLetterQueue( + messageKey, + messageQueue, + queueCurrentConcurrencyKey, + envCurrentConcurrencyKey, + envQueueKey, + deadLetterQueueKey, + messageId, + messageQueue, + JSON.stringify(message.masterQueues), + this.options.redis.keyPrefix ?? 
"" + ); + } + #callUpdateGlobalConcurrencyLimits({ envConcurrencyLimitKey, envConcurrencyLimit, @@ -1006,15 +1030,13 @@ export class RunQueue { #registerCommands() { this.redis.defineCommand("enqueueMessage", { - numberOfKeys: 7, + numberOfKeys: 5, lua: ` -local queue = KEYS[1] +local queueKey = KEYS[1] local messageKey = KEYS[2] -local concurrencyKey = KEYS[3] -local envConcurrencyKey = KEYS[4] -local taskConcurrencyKey = KEYS[5] -local projectConcurrencyKey = KEYS[6] -local envQueueKey = KEYS[7] +local queueCurrentConcurrencyKey = KEYS[3] +local envCurrentConcurrencyKey = KEYS[4] +local envQueueKey = KEYS[5] local queueName = ARGV[1] local messageId = ARGV[2] @@ -1027,13 +1049,13 @@ local keyPrefix = ARGV[6] redis.call('SET', messageKey, messageData) -- Add the message to the queue -redis.call('ZADD', queue, messageScore, messageId) +redis.call('ZADD', queueKey, messageScore, messageId) -- Add the message to the env queue redis.call('ZADD', envQueueKey, messageScore, messageId) -- Rebalance the parent queues -local earliestMessage = redis.call('ZRANGE', queue, 0, 0, 'WITHSCORES') +local earliestMessage = redis.call('ZRANGE', queueKey, 0, 0, 'WITHSCORES') for _, parentQueue in ipairs(parentQueues) do local prefixedParentQueue = keyPrefix .. 
parentQueue @@ -1045,27 +1067,23 @@ for _, parentQueue in ipairs(parentQueues) do end -- Update the concurrency keys -redis.call('SREM', concurrencyKey, messageId) -redis.call('SREM', envConcurrencyKey, messageId) -redis.call('SREM', taskConcurrencyKey, messageId) -redis.call('SREM', projectConcurrencyKey, messageId) +redis.call('SREM', queueCurrentConcurrencyKey, messageId) +redis.call('SREM', envCurrentConcurrencyKey, messageId) `, }); this.redis.defineCommand("dequeueMessage", { - numberOfKeys: 9, + numberOfKeys: 7, lua: ` -local childQueue = KEYS[1] -local concurrencyLimitKey = KEYS[2] +local queueKey = KEYS[1] +local queueConcurrencyLimitKey = KEYS[2] local envConcurrencyLimitKey = KEYS[3] -local currentConcurrencyKey = KEYS[4] +local queueCurrentConcurrencyKey = KEYS[4] local envCurrentConcurrencyKey = KEYS[5] -local projectConcurrencyKey = KEYS[6] -local messageKeyPrefix = KEYS[7] -local envQueueKey = KEYS[8] -local taskCurrentConcurrentKeyPrefix = KEYS[9] +local messageKeyPrefix = KEYS[6] +local envQueueKey = KEYS[7] -local childQueueName = ARGV[1] +local queueName = ARGV[1] local currentTime = tonumber(ARGV[2]) local defaultEnvConcurrencyLimit = ARGV[3] local keyPrefix = ARGV[4] @@ -1073,22 +1091,24 @@ local keyPrefix = ARGV[4] -- Check current env concurrency against the limit local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0') local envConcurrencyLimit = tonumber(redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit) +local totalEnvConcurrencyLimit = envConcurrencyLimit -if envCurrentConcurrency >= envConcurrencyLimit then +if envCurrentConcurrency >= totalEnvConcurrencyLimit then return nil end -- Check current queue concurrency against the limit -local currentConcurrency = tonumber(redis.call('SCARD', currentConcurrencyKey) or '0') -local concurrencyLimit = tonumber(redis.call('GET', concurrencyLimitKey) or '1000000') +local queueCurrentConcurrency = tonumber(redis.call('SCARD', 
queueCurrentConcurrencyKey) or '0') +local queueConcurrencyLimit = math.min(tonumber(redis.call('GET', queueConcurrencyLimitKey) or '1000000'), envConcurrencyLimit) +local totalQueueConcurrencyLimit = queueConcurrencyLimit -- Check condition only if concurrencyLimit exists -if currentConcurrency >= concurrencyLimit then +if queueCurrentConcurrency >= totalQueueConcurrencyLimit then return nil end -- Attempt to dequeue the next message -local messages = redis.call('ZRANGEBYSCORE', childQueue, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, 1) +local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, 1) if #messages == 0 then return nil @@ -1102,28 +1122,20 @@ local messageKey = messageKeyPrefix .. messageId local messagePayload = redis.call('GET', messageKey) local decodedPayload = cjson.decode(messagePayload); --- Extract taskIdentifier -local taskIdentifier = decodedPayload.taskIdentifier - --- Perform SADD with taskIdentifier and messageId -local taskConcurrencyKey = taskCurrentConcurrentKeyPrefix .. taskIdentifier - -- Update concurrency -redis.call('ZREM', childQueue, messageId) +redis.call('ZREM', queueKey, messageId) redis.call('ZREM', envQueueKey, messageId) -redis.call('SADD', currentConcurrencyKey, messageId) +redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) -redis.call('SADD', projectConcurrencyKey, messageId) -redis.call('SADD', taskConcurrencyKey, messageId) -- Rebalance the parent queues -local earliestMessage = redis.call('ZRANGE', childQueue, 0, 0, 'WITHSCORES') +local earliestMessage = redis.call('ZRANGE', queueKey, 0, 0, 'WITHSCORES') for _, parentQueue in ipairs(decodedPayload.masterQueues) do local prefixedParentQueue = keyPrefix .. 
parentQueue if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, childQueueName) + redis.call('ZREM', prefixedParentQueue, queueName) else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], childQueueName) + redis.call('ZADD', prefixedParentQueue, earliestMessage[2], queueName) end end @@ -1132,16 +1144,14 @@ return {messageId, messageScore, messagePayload} -- Return message details }); this.redis.defineCommand("acknowledgeMessage", { - numberOfKeys: 7, + numberOfKeys: 5, lua: ` -- Keys: local messageKey = KEYS[1] -local messageQueue = KEYS[2] -local concurrencyKey = KEYS[3] +local messageQueueKey = KEYS[2] +local queueCurrentConcurrencyKey = KEYS[3] local envCurrentConcurrencyKey = KEYS[4] -local projectCurrentConcurrencyKey = KEYS[5] -local envQueueKey = KEYS[6] -local taskCurrentConcurrencyKey = KEYS[7] +local envQueueKey = KEYS[5] -- Args: local messageId = ARGV[1] @@ -1153,11 +1163,11 @@ local keyPrefix = ARGV[4] redis.call('DEL', messageKey) -- Remove the message from the queue -redis.call('ZREM', messageQueue, messageId) +redis.call('ZREM', messageQueueKey, messageId) redis.call('ZREM', envQueueKey, messageId) -- Rebalance the parent queues -local earliestMessage = redis.call('ZRANGE', messageQueue, 0, 0, 'WITHSCORES') +local earliestMessage = redis.call('ZRANGE', messageQueueKey, 0, 0, 'WITHSCORES') for _, parentQueue in ipairs(parentQueues) do local prefixedParentQueue = keyPrefix .. 
parentQueue if #earliestMessage == 0 then @@ -1168,24 +1178,20 @@ for _, parentQueue in ipairs(parentQueues) do end -- Update the concurrency keys -redis.call('SREM', concurrencyKey, messageId) +redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) -redis.call('SREM', projectCurrentConcurrencyKey, messageId) -redis.call('SREM', taskCurrentConcurrencyKey, messageId) `, }); this.redis.defineCommand("nackMessage", { - numberOfKeys: 7, + numberOfKeys: 5, lua: ` -- Keys: local messageKey = KEYS[1] local messageQueueKey = KEYS[2] -local concurrencyKey = KEYS[3] -local envConcurrencyKey = KEYS[4] -local projectConcurrencyKey = KEYS[5] -local envQueueKey = KEYS[6] -local taskConcurrencyKey = KEYS[7] +local queueCurrentConcurrencyKey = KEYS[3] +local envCurrentConcurrencyKey = KEYS[4] +local envQueueKey = KEYS[5] -- Args: local messageId = ARGV[1] @@ -1199,10 +1205,8 @@ local keyPrefix = ARGV[6] redis.call('SET', messageKey, messageData) -- Update the concurrency keys -redis.call('SREM', concurrencyKey, messageId) -redis.call('SREM', envConcurrencyKey, messageId) -redis.call('SREM', projectConcurrencyKey, messageId) -redis.call('SREM', taskConcurrencyKey, messageId) +redis.call('SREM', queueCurrentConcurrencyKey, messageId) +redis.call('SREM', envCurrentConcurrencyKey, messageId) -- Enqueue the message into the queue redis.call('ZADD', messageQueueKey, messageScore, messageId) @@ -1222,17 +1226,15 @@ end }); this.redis.defineCommand("moveToDeadLetterQueue", { - numberOfKeys: 8, + numberOfKeys: 6, lua: ` -- Keys: local messageKey = KEYS[1] local messageQueue = KEYS[2] -local concurrencyKey = KEYS[3] +local queueCurrentConcurrencyKey = KEYS[3] local envCurrentConcurrencyKey = KEYS[4] -local projectCurrentConcurrencyKey = KEYS[5] -local envQueueKey = KEYS[6] -local taskCurrentConcurrencyKey = KEYS[7] -local deadLetterQueueKey = KEYS[8] +local envQueueKey = KEYS[5] +local deadLetterQueueKey = KEYS[6] -- Args: local 
messageId = ARGV[1] @@ -1259,56 +1261,88 @@ end redis.call('ZADD', deadLetterQueueKey, tonumber(redis.call('TIME')[1]), messageId) -- Update the concurrency keys -redis.call('SREM', concurrencyKey, messageId) +redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) -redis.call('SREM', projectCurrentConcurrencyKey, messageId) -redis.call('SREM', taskCurrentConcurrencyKey, messageId) `, }); this.redis.defineCommand("releaseConcurrency", { - numberOfKeys: 6, + numberOfKeys: 2, lua: ` -- Keys: -local messageKey = KEYS[1] -local messageQueue = KEYS[2] -local concurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local projectCurrentConcurrencyKey = KEYS[5] -local taskCurrentConcurrencyKey = KEYS[6] +local queueCurrentConcurrencyKey = KEYS[1] +local envCurrentConcurrencyKey = KEYS[2] + +-- Args: +local messageId = ARGV[1] + +-- Update the concurrency keys +redis.call('SREM', queueCurrentConcurrencyKey, messageId) +redis.call('SREM', envCurrentConcurrencyKey, messageId) +`, + }); + + this.redis.defineCommand("releaseEnvConcurrency", { + numberOfKeys: 1, + lua: ` +-- Keys: +local envCurrentConcurrencyKey = KEYS[1] -- Args: local messageId = ARGV[1] -- Update the concurrency keys -if concurrencyKey ~= "" then - redis.call('SREM', concurrencyKey, messageId) -end redis.call('SREM', envCurrentConcurrencyKey, messageId) -redis.call('SREM', projectCurrentConcurrencyKey, messageId) -redis.call('SREM', taskCurrentConcurrencyKey, messageId) `, }); this.redis.defineCommand("reacquireConcurrency", { - numberOfKeys: 6, + numberOfKeys: 4, lua: ` -- Keys: -local messageKey = KEYS[1] -local messageQueue = KEYS[2] -local concurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local projectCurrentConcurrencyKey = KEYS[5] -local taskCurrentConcurrencyKey = KEYS[6] +local queueCurrentConcurrencyKey = KEYS[1] +local envCurrentConcurrencyKey = KEYS[2] +local queueConcurrencyLimitKey = KEYS[3] +local 
envConcurrencyLimitKey = KEYS[4] -- Args: local messageId = ARGV[1] +local defaultEnvConcurrencyLimit = ARGV[2] + +-- Check if the message is already in either current concurrency set +local isInQueueConcurrency = redis.call('SISMEMBER', queueCurrentConcurrencyKey, messageId) == 1 +local isInEnvConcurrency = redis.call('SISMEMBER', envCurrentConcurrencyKey, messageId) == 1 + +-- If it's already in both sets, we're done +if isInQueueConcurrency and isInEnvConcurrency then + return true +end + +-- Check current env concurrency against the limit +local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0') +local envConcurrencyLimit = tonumber(redis.call('GET', envConcurrencyLimitKey) or defaultEnvConcurrencyLimit) +local totalEnvConcurrencyLimit = envConcurrencyLimit + +if envCurrentConcurrency >= totalEnvConcurrencyLimit then + return false +end + +-- Check current queue concurrency against the limit +if not isInQueueConcurrency then + local queueCurrentConcurrency = tonumber(redis.call('SCARD', queueCurrentConcurrencyKey) or '0') + local queueConcurrencyLimit = math.min(tonumber(redis.call('GET', queueConcurrencyLimitKey) or '1000000'), envConcurrencyLimit) + local totalQueueConcurrencyLimit = queueConcurrencyLimit + + if queueCurrentConcurrency >= totalQueueConcurrencyLimit then + return false + end +end -- Update the concurrency keys -redis.call('SADD', concurrencyKey, messageId) +redis.call('SADD', queueCurrentConcurrencyKey, messageId) redis.call('SADD', envCurrentConcurrencyKey, messageId) -redis.call('SADD', projectCurrentConcurrencyKey, messageId) -redis.call('SADD', taskCurrentConcurrencyKey, messageId) + +return true `, }); @@ -1333,10 +1367,8 @@ declare module "@internal/redis" { //keys queue: string, messageKey: string, - concurrencyKey: string, - envConcurrencyKey: string, - taskConcurrencyKey: string, - projectConcurrencyKey: string, + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, envQueueKey: 
string, //args queueName: string, @@ -1351,14 +1383,12 @@ declare module "@internal/redis" { dequeueMessage( //keys childQueue: string, - concurrencyLimitKey: string, + queueConcurrencyLimitKey: string, envConcurrencyLimitKey: string, - currentConcurrencyKey: string, - envConcurrencyKey: string, - projectConcurrencyKey: string, + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, messageKeyPrefix: string, envQueueKey: string, - taskCurrentConcurrentKeyPrefix: string, //args childQueueName: string, currentTime: string, @@ -1372,9 +1402,7 @@ declare module "@internal/redis" { messageQueue: string, concurrencyKey: string, envConcurrencyKey: string, - projectConcurrencyKey: string, envQueueKey: string, - taskConcurrencyKey: string, messageId: string, messageQueueName: string, masterQueues: string, @@ -1385,11 +1413,9 @@ declare module "@internal/redis" { nackMessage( messageKey: string, messageQueue: string, - concurrencyKey: string, - envConcurrencyKey: string, - projectConcurrencyKey: string, + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, envQueueKey: string, - taskConcurrencyKey: string, messageId: string, messageQueueName: string, messageData: string, @@ -1402,11 +1428,9 @@ declare module "@internal/redis" { moveToDeadLetterQueue( messageKey: string, messageQueue: string, - concurrencyKey: string, - envConcurrencyKey: string, - projectConcurrencyKey: string, + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, envQueueKey: string, - taskConcurrencyKey: string, deadLetterQueueKey: string, messageId: string, messageQueueName: string, @@ -1416,29 +1440,28 @@ declare module "@internal/redis" { ): Result; releaseConcurrency( - messageKey: string, - messageQueue: string, - concurrencyKey: string, - envConcurrencyKey: string, - projectConcurrencyKey: string, - taskConcurrencyKey: string, + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, messageId: string, - masterQueues: string, 
callback?: Callback ): Result; - reacquireConcurrency( - messageKey: string, - messageQueue: string, - concurrencyKey: string, - envConcurrencyKey: string, - projectConcurrencyKey: string, - taskConcurrencyKey: string, + releaseEnvConcurrency( + envCurrentConcurrencyKey: string, messageId: string, - masterQueues: string, callback?: Callback ): Result; + reacquireConcurrency( + queueCurrentConcurrencyKey: string, + envCurrentConcurrencyKey: string, + queueConcurrencyLimitKey: string, + envConcurrencyLimitKey: string, + messageId: string, + defaultEnvConcurrencyLimit: string, + callback?: Callback + ): Result; + updateGlobalConcurrencyLimits( envConcurrencyLimitKey: string, envConcurrencyLimit: string, diff --git a/internal-packages/run-engine/src/run-queue/keyProducer.ts b/internal-packages/run-engine/src/run-queue/keyProducer.ts index cebdacea5c..6c840bd212 100644 --- a/internal-packages/run-engine/src/run-queue/keyProducer.ts +++ b/internal-packages/run-engine/src/run-queue/keyProducer.ts @@ -1,5 +1,5 @@ import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; -import { EnvDescriptor, RunQueueKeyProducer } from "./types.js"; +import { EnvDescriptor, QueueDescriptor, RunQueueKeyProducer } from "./types.js"; const constants = { CURRENT_CONCURRENCY_PART: "currentConcurrency", @@ -12,7 +12,7 @@ const constants = { CONCURRENCY_KEY_PART: "ck", TASK_PART: "task", MESSAGE_PART: "message", - RESERVE_CONCURRENCY_PART: "reserveConcurrency", + DEAD_LETTER_QUEUE_PART: "deadLetter", } as const; export class RunQueueFullKeyProducer implements RunQueueKeyProducer { @@ -110,8 +110,13 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { } envConcurrencyLimitKeyFromQueue(queue: string) { - const { orgId, envId } = this.descriptorFromQueue(queue); - return `{${constants.ORG_PART}:${orgId}}:${constants.ENV_PART}:${envId}:${constants.CONCURRENCY_LIMIT_PART}`; + const { orgId, projectId, envId } = this.descriptorFromQueue(queue); + + return 
this.envConcurrencyLimitKey({ + orgId, + projectId, + envId, + }); } envCurrentConcurrencyKeyFromQueue(queue: string) { @@ -146,67 +151,6 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { } } - envReserveConcurrencyKey(env: EnvDescriptor): string; - envReserveConcurrencyKey(env: MinimalAuthenticatedEnvironment): string; - envReserveConcurrencyKey( - envOrDescriptor: EnvDescriptor | MinimalAuthenticatedEnvironment - ): string { - if ("id" in envOrDescriptor) { - return [ - this.orgKeySection(envOrDescriptor.organization.id), - this.projKeySection(envOrDescriptor.project.id), - this.envKeySection(envOrDescriptor.id), - constants.RESERVE_CONCURRENCY_PART, - ].join(":"); - } else { - return [ - this.orgKeySection(envOrDescriptor.orgId), - this.projKeySection(envOrDescriptor.projectId), - this.envKeySection(envOrDescriptor.envId), - constants.RESERVE_CONCURRENCY_PART, - ].join(":"); - } - } - - taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queue: string) { - const { orgId, projectId } = this.descriptorFromQueue(queue); - - return `${[this.orgKeySection(orgId), this.projKeySection(projectId), constants.TASK_PART] - .filter(Boolean) - .join(":")}:`; - } - - taskIdentifierCurrentConcurrencyKeyFromQueue(queue: string, taskIdentifier: string) { - return `${this.taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queue)}${taskIdentifier}`; - } - - taskIdentifierCurrentConcurrencyKey( - env: MinimalAuthenticatedEnvironment, - taskIdentifier: string - ): string { - return [ - this.orgKeySection(env.organization.id), - this.projKeySection(env.project.id), - constants.TASK_PART, - taskIdentifier, - ].join(":"); - } - - projectCurrentConcurrencyKey(env: MinimalAuthenticatedEnvironment): string { - return [ - this.orgKeySection(env.organization.id), - this.projKeySection(env.project.id), - constants.CURRENT_CONCURRENCY_PART, - ].join(":"); - } - - projectCurrentConcurrencyKeyFromQueue(queue: string): string { - const { orgId, projectId } = 
this.descriptorFromQueue(queue); - return `${this.orgKeySection(orgId)}:${this.projKeySection(projectId)}:${ - constants.CURRENT_CONCURRENCY_PART - }`; - } - messageKeyPrefixFromQueue(queue: string) { const { orgId } = this.descriptorFromQueue(queue); return `${this.orgKeySection(orgId)}:${constants.MESSAGE_PART}:`; @@ -230,7 +174,32 @@ export class RunQueueFullKeyProducer implements RunQueueKeyProducer { return this.descriptorFromQueue(queue).projectId; } - descriptorFromQueue(queue: string) { + deadLetterQueueKey(env: MinimalAuthenticatedEnvironment): string; + deadLetterQueueKey(env: EnvDescriptor): string; + deadLetterQueueKey(envOrDescriptor: EnvDescriptor | MinimalAuthenticatedEnvironment): string { + if ("id" in envOrDescriptor) { + return [ + this.orgKeySection(envOrDescriptor.organization.id), + this.projKeySection(envOrDescriptor.project.id), + this.envKeySection(envOrDescriptor.id), + constants.DEAD_LETTER_QUEUE_PART, + ].join(":"); + } else { + return [ + this.orgKeySection(envOrDescriptor.orgId), + this.projKeySection(envOrDescriptor.projectId), + this.envKeySection(envOrDescriptor.envId), + constants.DEAD_LETTER_QUEUE_PART, + ].join(":"); + } + } + deadLetterQueueKeyFromQueue(queue: string): string { + const descriptor = this.descriptorFromQueue(queue); + + return this.deadLetterQueueKey(descriptor); + } + + descriptorFromQueue(queue: string): QueueDescriptor { const parts = queue.split(":"); return { orgId: parts[1].replace("{", "").replace("}", ""), diff --git a/internal-packages/run-engine/src/run-queue/tests/ack.test.ts b/internal-packages/run-engine/src/run-queue/tests/ack.test.ts new file mode 100644 index 0000000000..8fc6da7dd7 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/ack.test.ts @@ -0,0 +1,169 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { 
FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageDev: InputPayload = { + runId: "r4321", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: "e4321", + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunQueue.acknowledgeMessage", () => { + redisTest("acknowledging a message clears all concurrency", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + // Enqueue and dequeue a message to get it into processing + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); + expect(dequeued.length).toBe(1); + + // Verify concurrency is set + const queueConcurrency = await 
queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrency).toBe(1); + + const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrency).toBe(1); + + // Acknowledge the message + await queue.acknowledgeMessage(messageDev.orgId, messageDev.runId); + + // Verify all concurrency is cleared + const queueConcurrencyAfter = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrencyAfter).toBe(0); + + const envConcurrencyAfter = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrencyAfter).toBe(0); + } finally { + await queue.quit(); + } + }); + + redisTest("acknowledging a message removes it from the queue", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + // Enqueue message + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + // Verify queue lengths + const queueLength = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(queueLength).toBe(1); + + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength).toBe(1); + + // Dequeue the message + const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); + expect(dequeued.length).toBe(1); + + // Verify queue is empty after dequeue + const queueLengthAfterDequeue = await queue.lengthOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + 
expect(queueLengthAfterDequeue).toBe(0); + + const envQueueLengthAfterDequeue = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLengthAfterDequeue).toBe(0); + + // Acknowledge the message + await queue.acknowledgeMessage(messageDev.orgId, messageDev.runId); + + // Verify queue remains empty + const queueLengthAfterAck = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(queueLengthAfterAck).toBe(0); + + const envQueueLengthAfterAck = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLengthAfterAck).toBe(0); + } finally { + await queue.quit(); + } + }); +}); diff --git a/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts b/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts new file mode 100644 index 0000000000..cb23b0dd39 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts @@ -0,0 +1,267 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageDev: InputPayload = { + runId: "r4321", + taskIdentifier: "task/my-task", + 
orgId: "o1234", + projectId: "p1234", + environmentId: "e4321", + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunQueue.dequeueMessageFromMasterQueue", () => { + redisTest("dequeuing a message from a master queue", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + //initial queue length + const result = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(result).toBe(0); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength).toBe(0); + + //initial oldest message + const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); + expect(oldestScore).toBe(undefined); + + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + //enqueue message + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + //queue length + const result2 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(result2).toBe(1); + + const envQueueLength2 = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength2).toBe(1); + + //oldest message + const oldestScore2 = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); + expect(oldestScore2).toBe(messageDev.timestamp); + + //concurrencies + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrency).toBe(0); + + const envConcurrency = await 
queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrency).toBe(0); + + const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); + expect(dequeued.length).toBe(1); + expect(dequeued[0].messageId).toEqual(messageDev.runId); + expect(dequeued[0].message.orgId).toEqual(messageDev.orgId); + expect(dequeued[0].message.version).toEqual("1"); + expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]); + + //concurrencies + const queueConcurrencyAfter = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrencyAfter).toBe(1); + + const envConcurrencyAfter = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrencyAfter).toBe(1); + + //queue length + const result3 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(result3).toBe(0); + const envQueueLength3 = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength3).toBe(0); + } finally { + await queue.quit(); + } + }); + + redisTest( + "should not dequeue when env current concurrency equals env concurrency limit", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + // Set env concurrency limit to 1 + await queue.updateEnvConcurrencyLimits({ + ...authenticatedEnvDev, + maximumConcurrencyLimit: 1, + }); + + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + // Enqueue first message + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + // Dequeue first message to occupy 
the concurrency + const dequeued1 = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 10 + ); + expect(dequeued1.length).toBe(1); + + // Enqueue second message + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: { ...messageDev, runId: "r4322" }, + masterQueues: ["main", envMasterQueue], + }); + + // Try to dequeue second message + const dequeued2 = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 10 + ); + expect(dequeued2.length).toBe(0); + + const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrency).toBe(1); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "should respect queue concurrency limits when dequeuing", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + // Set queue concurrency limit to 1 + await queue.updateQueueConcurrencyLimits(authenticatedEnvDev, messageDev.queue, 1); + + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + // Enqueue two messages + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: { ...messageDev, runId: "r4322" }, + masterQueues: ["main", envMasterQueue], + }); + + // Dequeue first message + const dequeued1 = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 10 + ); + expect(dequeued1.length).toBe(1); + + // Try to dequeue second message + const dequeued2 = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 
10 + ); + expect(dequeued2.length).toBe(0); + + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrency).toBe(1); + } finally { + await queue.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts b/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts new file mode 100644 index 0000000000..573ac1485b --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts @@ -0,0 +1,113 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageDev: InputPayload = { + runId: "r4321", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: "e4321", + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunQueue.enqueueMessage", () => { + redisTest("should add the message to the queue", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new 
FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + //initial queue length + const result = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(result).toBe(0); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength).toBe(0); + + //initial oldest message + const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); + expect(oldestScore).toBe(undefined); + + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + //enqueue message + const enqueueResult = await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + expect(enqueueResult).toBe(undefined); + + //queue length + const result2 = await queue.lengthOfQueue(authenticatedEnvDev, messageDev.queue); + expect(result2).toBe(1); + + const envQueueLength2 = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength2).toBe(1); + + //oldest message + const oldestScore2 = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); + expect(oldestScore2).toBe(messageDev.timestamp); + + //concurrencies + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueConcurrency).toBe(0); + + const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); + expect(envConcurrency).toBe(0); + } finally { + await queue.quit(); + } + }); +}); diff --git a/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.test.ts b/internal-packages/run-engine/src/run-queue/tests/fairQueueSelectionStrategy.test.ts similarity index 94% rename from 
internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.test.ts rename to internal-packages/run-engine/src/run-queue/tests/fairQueueSelectionStrategy.test.ts index b07a47db88..3230ba73a0 100644 --- a/internal-packages/run-engine/src/run-queue/fairQueueSelectionStrategy.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/fairQueueSelectionStrategy.test.ts @@ -1,10 +1,10 @@ -import { createRedisClient, RedisOptions } from "@internal/redis"; import { redisTest } from "@internal/testcontainers"; import { describe, expect, vi } from "vitest"; -import { RUN_QUEUE_RESUME_PRIORITY_TIMESTAMP_OFFSET } from "./constants.js"; -import { FairQueueSelectionStrategy } from "./fairQueueSelectionStrategy.js"; -import { RunQueueFullKeyProducer } from "./keyProducer.js"; -import { EnvQueues, RunQueueKeyProducer } from "./types.js"; +import { RUN_QUEUE_RESUME_PRIORITY_TIMESTAMP_OFFSET } from "../constants.js"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { EnvQueues, RunQueueKeyProducer } from "../types.js"; +import { createRedisClient, RedisOptions } from "@internal/redis"; vi.setConfig({ testTimeout: 60_000 }); // 30 seconds timeout @@ -76,54 +76,6 @@ describe("FairDequeuingStrategy", () => { expect(result).toHaveLength(0); }); - redisTest( - "should give extra concurrency when the env has reserve concurrency", - async ({ redisOptions: redis }) => { - const keyProducer = new RunQueueFullKeyProducer(); - const strategy = new FairQueueSelectionStrategy({ - redis, - keys: keyProducer, - defaultEnvConcurrencyLimit: 2, - parentQueueLimit: 100, - seed: "test-seed-3", - }); - - await setupQueue({ - redis, - keyProducer, - parentQueue: "parent-queue", - score: Date.now() - 1000, - queueId: "queue-1", - orgId: "org-1", - projectId: "proj-1", - envId: "env-1", - }); - - await setupConcurrency({ - redis, - keyProducer, - env: { - envId: "env-1", - projectId: 
"proj-1", - orgId: "org-1", - currentConcurrency: 2, - limit: 2, - reserveConcurrency: 1, - }, - }); - - const result = await strategy.distributeFairQueuesFromParentQueue( - "parent-queue", - "consumer-1" - ); - expect(result).toHaveLength(1); - expect(result[0]).toEqual({ - envId: "env-1", - queues: [keyProducer.queueKey("org-1", "proj-1", "env-1", "queue-1")], - }); - } - ); - redisTest("should respect parentQueueLimit", async ({ redisOptions: redis }) => { const keyProducer = new RunQueueFullKeyProducer(); const strategy = new FairQueueSelectionStrategy({ @@ -259,8 +211,8 @@ describe("FairDequeuingStrategy", () => { console.log("Second distribution took", distribute2Duration, "ms"); - // Make sure the second call is more than 9 times faster than the first - expect(distribute2Duration).toBeLessThan(distribute1Duration / 9); + // Make sure the second call is more than 6 times faster than the first + expect(distribute2Duration).toBeLessThan(distribute1Duration / 6); const startDistribute3 = performance.now(); @@ -274,7 +226,7 @@ describe("FairDequeuingStrategy", () => { console.log("Third distribution took", distribute3Duration, "ms"); // Make sure the third call is more than 4 times the second - expect(distribute3Duration).toBeGreaterThan(distribute2Duration * 4); + expect(distribute3Duration).toBeGreaterThan(distribute2Duration * 2); } ); @@ -1119,7 +1071,6 @@ type SetupConcurrencyOptions = { orgId: string; currentConcurrency: number; limit?: number; - reserveConcurrency?: number; }; }; @@ -1145,19 +1096,6 @@ async function setupConcurrency({ redis, keyProducer, env }: SetupConcurrencyOpt await $redis.sadd(envCurrentKey, ...dummyJobs); } - - if (env.reserveConcurrency && env.reserveConcurrency > 0) { - // Set reserved concurrency by adding dummy members to the set - const envReservedKey = keyProducer.envReserveConcurrencyKey(env); - - // Add dummy reserved job IDs to simulate reserved concurrency - const dummyJobs = Array.from( - { length: env.reserveConcurrency 
}, - (_, i) => `dummy-reserved-job-${i}-${Date.now()}` - ); - - await $redis.sadd(envReservedKey, ...dummyJobs); - } } /** diff --git a/internal-packages/run-engine/src/run-queue/keyProducer.test.ts b/internal-packages/run-engine/src/run-queue/tests/keyProducer.test.ts similarity index 78% rename from internal-packages/run-engine/src/run-queue/keyProducer.test.ts rename to internal-packages/run-engine/src/run-queue/tests/keyProducer.test.ts index 0f6b14e17d..88bbd55177 100644 --- a/internal-packages/run-engine/src/run-queue/keyProducer.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/keyProducer.test.ts @@ -1,6 +1,6 @@ import { describe } from "node:test"; import { expect, it } from "vitest"; -import { RunQueueFullKeyProducer } from "./keyProducer.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; describe("KeyProducer", () => { it("queueConcurrencyLimitKey", () => { @@ -163,73 +163,6 @@ describe("KeyProducer", () => { expect(key).toBe("{org:o1234}:proj:p1234:env:e1234:queue:task/task-name:currentConcurrency"); }); - it("taskIdentifierCurrentConcurrencyKeyPrefixFromQueue", () => { - const keyProducer = new RunQueueFullKeyProducer(); - const queueKey = keyProducer.queueKey( - { - id: "e1234", - type: "PRODUCTION", - maximumConcurrencyLimit: 10, - project: { id: "p1234" }, - organization: { id: "o1234" }, - }, - "task/task-name" - ); - const key = keyProducer.taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queueKey); - expect(key).toBe("{org:o1234}:proj:p1234:task:"); - }); - - it("taskIdentifierCurrentConcurrencyKeyFromQueue", () => { - const keyProducer = new RunQueueFullKeyProducer(); - const queueKey = keyProducer.queueKey( - { - id: "e1234", - type: "PRODUCTION", - maximumConcurrencyLimit: 10, - project: { id: "p1234" }, - organization: { id: "o1234" }, - }, - "task/task-name" - ); - const key = keyProducer.taskIdentifierCurrentConcurrencyKeyFromQueue(queueKey, "task-name"); - 
expect(key).toBe("{org:o1234}:proj:p1234:task:task-name"); - }); - - it("taskIdentifierCurrentConcurrencyKey", () => { - const keyProducer = new RunQueueFullKeyProducer(); - const key = keyProducer.taskIdentifierCurrentConcurrencyKey( - { - id: "e1234", - type: "PRODUCTION", - maximumConcurrencyLimit: 10, - project: { id: "p1234" }, - organization: { id: "o1234" }, - }, - "task-name" - ); - expect(key).toBe("{org:o1234}:proj:p1234:task:task-name"); - }); - - it("projectCurrentConcurrencyKey", () => { - const keyProducer = new RunQueueFullKeyProducer(); - const key = keyProducer.projectCurrentConcurrencyKey({ - id: "e1234", - type: "PRODUCTION", - maximumConcurrencyLimit: 10, - project: { id: "p1234" }, - organization: { id: "o1234" }, - }); - expect(key).toBe("{org:o1234}:proj:p1234:currentConcurrency"); - }); - - it("projectCurrentConcurrencyKeyFromQueue", () => { - const keyProducer = new RunQueueFullKeyProducer(); - const key = keyProducer.projectCurrentConcurrencyKeyFromQueue( - "{org:o1234}:proj:p1234:currentConcurrency" - ); - expect(key).toBe("{org:o1234}:proj:p1234:currentConcurrency"); - }); - it("disabledConcurrencyLimitKeyFromQueue", () => { const keyProducer = new RunQueueFullKeyProducer(); const queueKey = keyProducer.queueKey( @@ -259,7 +192,7 @@ describe("KeyProducer", () => { "task/task-name" ); const key = keyProducer.envConcurrencyLimitKeyFromQueue(queueKey); - expect(key).toBe("{org:o1234}:env:e1234:concurrency"); + expect(key).toBe("{org:o1234}:proj:p1234:env:e1234:concurrency"); }); it("envCurrentConcurrencyKeyFromQueue", () => { diff --git a/internal-packages/run-engine/src/run-queue/tests/nack.test.ts b/internal-packages/run-engine/src/run-queue/tests/nack.test.ts new file mode 100644 index 0000000000..f7b8aa1449 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/nack.test.ts @@ -0,0 +1,217 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from 
"@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; +import { setTimeout } from "node:timers/promises"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvDev = { + id: "e1234", + type: "DEVELOPMENT" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageDev: InputPayload = { + runId: "r4321", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: "e4321", + environmentType: "DEVELOPMENT", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunQueue.nackMessage", () => { + redisTest("nacking a message clears all concurrency", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + // Enqueue message with reserve concurrency + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + // Dequeue message + const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", 
envMasterQueue, 10); + expect(dequeued.length).toBe(1); + + // Verify current concurrency is set + const queueCurrentConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueCurrentConcurrency).toBe(1); + + const envCurrentConcurrency = await queue.currentConcurrencyOfEnvironment( + authenticatedEnvDev + ); + expect(envCurrentConcurrency).toBe(1); + + // Nack the message + await queue.nackMessage({ + orgId: messageDev.orgId, + messageId: messageDev.runId, + }); + + // Verify all concurrency is cleared + const queueCurrentConcurrencyAfterNack = await queue.currentConcurrencyOfQueue( + authenticatedEnvDev, + messageDev.queue + ); + expect(queueCurrentConcurrencyAfterNack).toBe(0); + + const envCurrentConcurrencyAfterNack = await queue.currentConcurrencyOfEnvironment( + authenticatedEnvDev + ); + expect(envCurrentConcurrencyAfterNack).toBe(0); + + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength).toBe(1); + + const message = await queue.readMessage(messageDev.orgId, messageDev.runId); + expect(message?.attempt).toBe(1); + + //wait for the retry backoff delay before the message becomes available again + await setTimeout(300); + + // Now we should be able to dequeue it again + const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); + expect(dequeued2.length).toBe(1); + } finally { + await queue.quit(); + } + }); + + redisTest( + "nacking a message with maxAttempts reached should be moved to dead letter queue", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + retryOptions: { + ...testOptions.retryOptions, + maxAttempts: 2, // Set lower for testing + }, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: 
"runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + const envMasterQueue = `env:${authenticatedEnvDev.id}`; + + await queue.enqueueMessage({ + env: authenticatedEnvDev, + message: messageDev, + masterQueues: ["main", envMasterQueue], + }); + + const dequeued = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 10 + ); + expect(dequeued.length).toBe(1); + + await queue.nackMessage({ + orgId: messageDev.orgId, + messageId: messageDev.runId, + }); + + // Wait for any requeue delay + await setTimeout(300); + + // Message should not be requeued as max attempts reached + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLength).toBe(1); + + const message = await queue.readMessage(messageDev.orgId, messageDev.runId); + expect(message?.attempt).toBe(1); + + // Now we dequeue and nack again, and it should be moved to dead letter queue + const dequeued3 = await queue.dequeueMessageFromMasterQueue( + "test_12345", + envMasterQueue, + 10 + ); + expect(dequeued3.length).toBe(1); + + const envQueueLengthDequeue = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLengthDequeue).toBe(0); + + const deadLetterQueueLengthBefore = await queue.lengthOfDeadLetterQueue( + authenticatedEnvDev + ); + expect(deadLetterQueueLengthBefore).toBe(0); + + await queue.nackMessage({ + orgId: messageDev.orgId, + messageId: messageDev.runId, + }); + + const envQueueLengthAfterNack = await queue.lengthOfEnvQueue(authenticatedEnvDev); + expect(envQueueLengthAfterNack).toBe(0); + + const deadLetterQueueLengthAfterNack = await queue.lengthOfDeadLetterQueue( + authenticatedEnvDev + ); + expect(deadLetterQueueLengthAfterNack).toBe(1); + } finally { + await queue.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts b/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts new file 
mode 100644 index 0000000000..a9c0386ca5 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts @@ -0,0 +1,328 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; +import { MessageNotFoundError } from "../errors.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + enableRebalancing: false, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvProd = { + id: "e1234", + type: "PRODUCTION" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageProd: InputPayload = { + runId: "r1234", + taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: "e1234", + environmentType: "PRODUCTION", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +describe("RunQueue.reacquireConcurrency", () => { + redisTest( + "It should return true if we can reacquire the concurrency", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.updateEnvConcurrencyLimits({ + ...authenticatedEnvProd, + 
maximumConcurrencyLimit: 1, + }); + + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); + expect(messages.length).toBe(1); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + // First, release the concurrency + await queue.releaseAllConcurrency(authenticatedEnvProd.organization.id, messageProd.runId); + + //reacquire the concurrency + const result = await queue.reacquireConcurrency( + authenticatedEnvProd.organization.id, + messageProd.runId + ); + expect(result).toBe(true); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "It should return true if the run is already being counted as concurrency", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.updateEnvConcurrencyLimits({ + ...authenticatedEnvProd, + maximumConcurrencyLimit: 1, + }); + + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); + expect(messages.length).toBe(1); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, 
messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + //reacquire the concurrency + const result = await queue.reacquireConcurrency( + authenticatedEnvProd.organization.id, + messageProd.runId + ); + expect(result).toBe(true); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "It should return true if the run is already being counted as concurrency", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.updateEnvConcurrencyLimits({ + ...authenticatedEnvProd, + maximumConcurrencyLimit: 1, + }); + + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); + expect(messages.length).toBe(1); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + //reacquire the concurrency + const result = await queue.reacquireConcurrency( + authenticatedEnvProd.organization.id, + messageProd.runId + ); + expect(result).toBe(true); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await 
queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "It should return false if the run is not in the current concurrency set and there is no capacity in the environment", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.updateEnvConcurrencyLimits({ + ...authenticatedEnvProd, + maximumConcurrencyLimit: 1, + }); + + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 1); + expect(messages.length).toBe(1); + expect(messages[0].message.runId).toBe(messageProd.runId); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + // Enqueue a second message + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: { + ...messageProd, + runId: "r1235", + queue: "task/my-task-2", + }, + masterQueues: "main", + }); + + //reacquire the concurrency + const result = await queue.reacquireConcurrency( + authenticatedEnvProd.organization.id, + "r1235" + ); + expect(result).toBe(false); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, "task/my-task-2")).toBe( + 0 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + await 
queue.quit(); + } + } + ); + + redisTest("It should throw an error if the message is not found", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await expect( + queue.reacquireConcurrency(authenticatedEnvProd.organization.id, "r1235") + ).rejects.toThrow(MessageNotFoundError); + } finally { + await queue.quit(); + } + }); +}); diff --git a/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts b/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts new file mode 100644 index 0000000000..63873a54b3 --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts @@ -0,0 +1,152 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + enableRebalancing: false, + logger: new Logger("RunQueue", "warn"), + retryOptions: { + maxAttempts: 5, + factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), +}; + +const authenticatedEnvProd = { + id: "e1234", + type: "PRODUCTION" as const, + maximumConcurrencyLimit: 10, + project: { id: "p1234" }, + organization: { id: "o1234" }, +}; + +const messageProd: InputPayload = { + runId: "r1234", + 
taskIdentifier: "task/my-task", + orgId: "o1234", + projectId: "p1234", + environmentId: "e1234", + environmentType: "PRODUCTION", + queue: "task/my-task", + timestamp: Date.now(), + attempt: 0, +}; + +describe("RunQueue.releaseConcurrency", () => { + redisTest( + "It should release the concurrency on the queue and the env", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); + expect(messages.length).toBe(1); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + //release the concurrency + await queue.releaseAllConcurrency(authenticatedEnvProd.organization.id, messageProd.runId); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 0 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "it shouldn't affect the current concurrency if the run hasn't been dequeued", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: 
"runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + masterQueues: "main", + }); + + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: { ...messageProd, runId: "r1235" }, + masterQueues: "main", + }); + + const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 1); + expect(messages.length).toBe(1); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + + //release the concurrency + await queue.releaseAllConcurrency(authenticatedEnvProd.organization.id, "r1235"); + + //concurrencies + expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( + 1 + ); + expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); + } finally { + await queue.quit(); + } + } + ); +}); diff --git a/internal-packages/run-engine/src/run-queue/types.ts b/internal-packages/run-engine/src/run-queue/types.ts index 0eaa048f78..12627f375e 100644 --- a/internal-packages/run-engine/src/run-queue/types.ts +++ b/internal-packages/run-engine/src/run-queue/types.ts @@ -65,21 +65,8 @@ export interface RunQueueKeyProducer { envConcurrencyLimitKey(env: EnvDescriptor): string; envConcurrencyLimitKey(env: MinimalAuthenticatedEnvironment): string; - envReserveConcurrencyKey(env: EnvDescriptor): string; - envReserveConcurrencyKey(env: MinimalAuthenticatedEnvironment): string; - envConcurrencyLimitKeyFromQueue(queue: string): string; envCurrentConcurrencyKeyFromQueue(queue: string): string; - //task concurrency - taskIdentifierCurrentConcurrencyKey( - env: MinimalAuthenticatedEnvironment, - taskIdentifier: string - ): string; - taskIdentifierCurrentConcurrencyKeyPrefixFromQueue(queue: string): string; - 
taskIdentifierCurrentConcurrencyKeyFromQueue(queue: string, taskIdentifier: string): string; - //project concurrency - projectCurrentConcurrencyKey(env: MinimalAuthenticatedEnvironment): string; - projectCurrentConcurrencyKeyFromQueue(queue: string): string; //message payload messageKeyPrefixFromQueue(queue: string): string; messageKey(orgId: string, messageId: string): string; @@ -88,6 +75,10 @@ export interface RunQueueKeyProducer { envIdFromQueue(queue: string): string; projectIdFromQueue(queue: string): string; descriptorFromQueue(queue: string): QueueDescriptor; + + deadLetterQueueKey(env: MinimalAuthenticatedEnvironment): string; + deadLetterQueueKey(env: EnvDescriptor): string; + deadLetterQueueKeyFromQueue(queue: string): string; } export type EnvQueues = { diff --git a/internal-packages/run-engine/tsconfig.test.json b/internal-packages/run-engine/tsconfig.test.json index b68d234bd7..d8c7d1c638 100644 --- a/internal-packages/run-engine/tsconfig.test.json +++ b/internal-packages/run-engine/tsconfig.test.json @@ -1,5 +1,5 @@ { - "include": ["src/**/*.test.ts"], + "include": ["src/**/*.test.ts", "src/run-queue/tests/dequeueMessageFromMasterQueue.ts"], "references": [{ "path": "./tsconfig.src.json" }], "compilerOptions": { "composite": true, diff --git a/internal-packages/run-engine/vitest.config.ts b/internal-packages/run-engine/vitest.config.ts index e10e77f70e..1d779c0957 100644 --- a/internal-packages/run-engine/vitest.config.ts +++ b/internal-packages/run-engine/vitest.config.ts @@ -2,7 +2,6 @@ import { defineConfig } from "vitest/config"; export default defineConfig({ test: { - reporters: process.env.GITHUB_ACTIONS ? 
["verbose", "github-actions"] : ["verbose"], include: ["**/*.test.ts"], globals: true, isolate: true, @@ -12,5 +11,9 @@ export default defineConfig({ singleThread: true, }, }, + testTimeout: 60_000, + coverage: { + provider: "v8", + }, }, }); diff --git a/internal-packages/run-queue/src/run-queue/tests/nack.test.ts b/internal-packages/run-queue/src/run-queue/tests/nack.test.ts new file mode 100644 index 0000000000..c553fce412 --- /dev/null +++ b/internal-packages/run-queue/src/run-queue/tests/nack.test.ts @@ -0,0 +1,11 @@ +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; +import { InputPayload } from "../types.js"; +import { createRedisClient } from "@internal/redis"; + +// ... existing code ... diff --git a/internal-packages/testcontainers/src/setup.ts b/internal-packages/testcontainers/src/setup.ts index a51e24eadd..b77663f062 100644 --- a/internal-packages/testcontainers/src/setup.ts +++ b/internal-packages/testcontainers/src/setup.ts @@ -69,7 +69,11 @@ export async function setupBackgroundWorker( environment: AuthenticatedEnvironment, taskIdentifier: string | string[], machineConfig?: MachineConfig, - retryOptions?: RetryOptions + retryOptions?: RetryOptions, + queueOptions?: { + releaseConcurrencyOnWaitpoint?: boolean; + concurrencyLimit?: number | null; + } ) { const worker = await prisma.backgroundWorker.create({ data: { @@ -115,10 +119,17 @@ export async function setupBackgroundWorker( data: { friendlyId: generateFriendlyId("queue"), name: queueName, - concurrencyLimit: 10, + concurrencyLimit: + typeof queueOptions?.concurrencyLimit === "undefined" + ? 
10 + : queueOptions.concurrencyLimit, runtimeEnvironmentId: worker.runtimeEnvironmentId, projectId: worker.projectId, type: "VIRTUAL", + releaseConcurrencyOnWaitpoint: + typeof queueOptions?.releaseConcurrencyOnWaitpoint === "boolean" + ? queueOptions.releaseConcurrencyOnWaitpoint + : undefined, }, }); } diff --git a/packages/cli-v3/src/entryPoints/dev-run-controller.ts b/packages/cli-v3/src/entryPoints/dev-run-controller.ts index 68c051d8f0..8807915d53 100644 --- a/packages/cli-v3/src/entryPoints/dev-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/dev-run-controller.ts @@ -449,6 +449,7 @@ export class DevRunController { return; } case "RUN_CREATED": + case "QUEUED_EXECUTING": case "QUEUED": { logger.debug("Status change not handled", { status: snapshot.executionStatus }); return; diff --git a/packages/cli-v3/src/entryPoints/managed-run-controller.ts b/packages/cli-v3/src/entryPoints/managed-run-controller.ts index 5eab5839f7..540a599f53 100644 --- a/packages/cli-v3/src/entryPoints/managed-run-controller.ts +++ b/packages/cli-v3/src/entryPoints/managed-run-controller.ts @@ -627,6 +627,7 @@ class ManagedRunController { return; } case "RUN_CREATED": + case "QUEUED_EXECUTING": case "QUEUED": { console.log("Status change not handled", { status: snapshot.executionStatus }); return; diff --git a/packages/core/src/v3/schemas/api.ts b/packages/core/src/v3/schemas/api.ts index ff2028f7c3..48be14e602 100644 --- a/packages/core/src/v3/schemas/api.ts +++ b/packages/core/src/v3/schemas/api.ts @@ -119,6 +119,7 @@ export const TriggerTaskRequestBody = z.object({ test: z.boolean().optional(), ttl: z.string().or(z.number().nonnegative().int()).optional(), priority: z.number().optional(), + releaseConcurrency: z.boolean().optional(), }) .optional(), }); @@ -956,6 +957,9 @@ export const WaitForDurationRequestBody = z.object({ * This means after that time if you pass the same idempotency key again, you will get a new waitpoint. 
*/ idempotencyKeyTTL: z.string().optional(), + + releaseConcurrency: z.boolean().optional(), + date: z.coerce.date(), }); export type WaitForDurationRequestBody = z.infer; diff --git a/packages/core/src/v3/schemas/runEngine.ts b/packages/core/src/v3/schemas/runEngine.ts index db0c9221cd..392d29d823 100644 --- a/packages/core/src/v3/schemas/runEngine.ts +++ b/packages/core/src/v3/schemas/runEngine.ts @@ -6,6 +6,7 @@ import type * as DB_TYPES from "@trigger.dev/database"; export const TaskRunExecutionStatus = { RUN_CREATED: "RUN_CREATED", QUEUED: "QUEUED", + QUEUED_EXECUTING: "QUEUED_EXECUTING", PENDING_EXECUTING: "PENDING_EXECUTING", EXECUTING: "EXECUTING", EXECUTING_WITH_WAITPOINTS: "EXECUTING_WITH_WAITPOINTS", diff --git a/packages/core/src/v3/types/tasks.ts b/packages/core/src/v3/types/tasks.ts index 921712ec3e..7a4f279151 100644 --- a/packages/core/src/v3/types/tasks.ts +++ b/packages/core/src/v3/types/tasks.ts @@ -824,8 +824,16 @@ export type TriggerOptions = { version?: string; }; -export type TriggerAndWaitOptions = Omit; - +export type TriggerAndWaitOptions = Omit & { + /** + * If set to true, this will cause the waitpoint to release the current run from the queue's concurrency. + * + * This is useful if you want to allow other runs to execute while the child task is executing + * + * @default false + */ + releaseConcurrency?: boolean; +}; export type BatchTriggerOptions = { /** * If no idempotencyKey is set on an individual item in the batch, it will use this key on each item + the array index. 
diff --git a/packages/trigger-sdk/src/v3/shared.ts b/packages/trigger-sdk/src/v3/shared.ts index d2e083aba3..6d4b4606d5 100644 --- a/packages/trigger-sdk/src/v3/shared.ts +++ b/packages/trigger-sdk/src/v3/shared.ts @@ -1345,6 +1345,7 @@ async function triggerAndWait_internal=6.9.0'} dependencies: - '@ampproject/remapping': 2.2.1 + '@ampproject/remapping': 2.3.0 '@babel/code-frame': 7.24.7 '@babel/generator': 7.24.7 '@babel/helper-compilation-targets': 7.25.2 '@babel/helper-module-transforms': 7.25.2(@babel/core@7.24.5) '@babel/helpers': 7.25.6 - '@babel/parser': 7.24.7 + '@babel/parser': 7.26.8 '@babel/template': 7.24.7 '@babel/traverse': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 convert-source-map: 2.0.0 - debug: 4.3.7 + debug: 4.4.0 gensync: 1.0.0-beta.2 json5: 2.2.3 semver: 6.3.1 @@ -3582,7 +3585,7 @@ packages: resolution: {integrity: sha512-oipXieGC3i45Y1A41t4tAqpnEZWgB/lC6Ehh6+rOviR5XWpTtMmLN+fGjz9vOiNRt0p6RtO6DtD0pdU3vpqdSA==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 '@jridgewell/gen-mapping': 0.3.5 '@jridgewell/trace-mapping': 0.3.25 jsesc: 2.5.2 @@ -3591,7 +3594,7 @@ packages: resolution: {integrity: sha512-VPC82gr1seXOpkjAAKoLhP50vx4vGNlF4msF64dSFq1P8RfB+QAuJWGHPXXPc8QyfVWwwB/TNNU4+ayZmHNbZw==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.25.6 + '@babel/types': 7.26.8 '@jridgewell/gen-mapping': 0.3.5 '@jridgewell/trace-mapping': 0.3.25 jsesc: 2.5.2 @@ -3612,7 +3615,7 @@ packages: resolution: {integrity: sha512-LvBTxu8bQSQkcyKOU+a1btnNFQ1dMAd0R6PyW3arXes06F6QLWLIrd681bxRPIXlrMGR3XYnW9JyML7dP3qgxg==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-builder-binary-assignment-operator-visitor@7.18.9: @@ -3620,7 +3623,7 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/helper-explode-assignable-expression': 7.18.6 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true 
/@babel/helper-compilation-targets@7.22.15: @@ -3712,7 +3715,7 @@ packages: '@babel/core': 7.22.17 '@babel/helper-compilation-targets': 7.22.15 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.7 + debug: 4.4.0 lodash.debounce: 4.0.8 resolve: 1.22.8 semver: 6.3.1 @@ -3728,7 +3731,7 @@ packages: '@babel/core': 7.22.17 '@babel/helper-compilation-targets': 7.22.15 '@babel/helper-plugin-utils': 7.24.0 - debug: 4.3.7 + debug: 4.4.0 lodash.debounce: 4.0.8 resolve: 1.22.8 transitivePeerDependencies: @@ -3748,13 +3751,13 @@ packages: resolution: {integrity: sha512-DoiN84+4Gnd0ncbBOM9AZENV4a5ZiL39HYMyZJGZ/AZEykHYdJw0wW3kdcsh9/Kn+BRXHLkkklZ51ecPKmI1CQ==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-explode-assignable-expression@7.18.6: resolution: {integrity: sha512-eyAYAsQmB80jNfg4baAtLeWAQHfHFiR483rzFK+BhETlGZaQC9bsfrugfXDCbRHLQbIA7U5NxhhOxN7p/dWIcg==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-function-name@7.22.5: @@ -3762,7 +3765,7 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/template': 7.22.15 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-function-name@7.23.0: @@ -3770,7 +3773,7 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/template': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-function-name@7.24.7: @@ -3778,47 +3781,47 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/template': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-hoist-variables@7.22.5: resolution: {integrity: sha512-wGjk9QZVzvknA6yKIUURb8zY3grXCcOZt+/7Wcy8O2uctxhplmUPkOdlgoNhmdVee2c92JXbf1xpMtVNbfoxRw==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-hoist-variables@7.24.7: resolution: {integrity: 
sha512-MJJwhkoGy5c4ehfoRyrJ/owKeMl19U54h27YYftT0o2teQ3FJ3nQUf/I3LlJsX4l3qlw7WRXUmiyajvHXoTubQ==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-member-expression-to-functions@7.21.5: resolution: {integrity: sha512-nIcGfgwpH2u4n9GG1HpStW5Ogx7x7ekiFHbjjFRKXbn5zUvqO9ZgotCO4x1aNbKn/x/xOUaXEhyNHCwtFCpxWg==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-member-expression-to-functions@7.23.0: resolution: {integrity: sha512-6gfrPwh7OuT6gZyJZvd6WbTfrqAo7vm4xCzAXOusKqq/vWdKXphTpj5klHKNmRUU6/QRGlBsyU9mAIPaWHlqJA==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-module-imports@7.22.15: resolution: {integrity: sha512-0pYVBnDKZO2fnSPCrgM/6WMc7eS20Fbok+0r88fp+YtWVLZrp4CkafFGIp+W0VKw4a22sgebPT99y+FDNMdP4w==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-module-imports@7.24.7: resolution: {integrity: sha512-8AyH3C+74cgCVVXow/myrynrAGv+nTVg5vKu2nZph9x7RcRwzmh0VFallJuFTZ9mx6u4eSdXZfcOzSqTUm0HCA==} engines: {node: '>=6.9.0'} dependencies: '@babel/traverse': 7.25.6 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color dev: false @@ -3844,7 +3847,7 @@ packages: '@babel/helper-module-imports': 7.22.15 '@babel/helper-simple-access': 7.22.5 '@babel/helper-split-export-declaration': 7.22.6 - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-validator-identifier': 7.25.9 /@babel/helper-module-transforms@7.25.2(@babel/core@7.24.5): resolution: {integrity: sha512-BjyRAbix6j/wv83ftcVJmBt72QtHI56C7JXZoG2xATiLpmoC7dpd8WnkikExHDVPpi/3qCmO6WY1EaXOluiecQ==} @@ -3855,7 +3858,7 @@ packages: '@babel/core': 7.24.5 '@babel/helper-module-imports': 7.24.7 '@babel/helper-simple-access': 7.24.7 - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-validator-identifier': 7.25.9 
'@babel/traverse': 7.25.6 transitivePeerDependencies: - supports-color @@ -3879,14 +3882,14 @@ packages: resolution: {integrity: sha512-HP59oD9/fEHQkdcbgFCnbmgH5vIQTJbxh2yf+CdM89/glUNnuzr87Q8GIjGEnOktTROemO0Pe0iPAYbqZuOUiA==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-optimise-call-expression@7.22.5: resolution: {integrity: sha512-HBwaojN0xFRx4yIvpwGqxiV2tUfl7401jlok564NgB9EHS1y6QT17FmKWm4ztqjeVdXLuC4fSvHc5ePpQjoTbw==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-plugin-utils@7.22.5: @@ -3909,7 +3912,7 @@ packages: '@babel/helper-annotate-as-pure': 7.22.5 '@babel/helper-environment-visitor': 7.22.20 '@babel/helper-wrap-function': 7.20.5 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color dev: true @@ -3923,7 +3926,7 @@ packages: '@babel/helper-optimise-call-expression': 7.22.5 '@babel/template': 7.24.7 '@babel/traverse': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color dev: true @@ -3944,14 +3947,14 @@ packages: resolution: {integrity: sha512-n0H99E/K+Bika3++WNL17POvo4rKWZ7lZEp1Q+fStVbUi8nxPQEBOlTmCOxW/0JsS56SKKQ+ojAe2pHKJHN35w==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-simple-access@7.24.7: resolution: {integrity: sha512-zBAIvbCMh5Ts+b86r/CjU+4XGYIs+R1j951gxI3KmmxBMhCg4oQMsv6ZXQ64XOm/cvzfU1FmoCyt6+owc5QMYg==} engines: {node: '>=6.9.0'} dependencies: '@babel/traverse': 7.25.6 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color dev: false @@ -3960,37 +3963,32 @@ packages: resolution: {integrity: sha512-5y1JYeNKfvnT8sZcK9DVRtpTbGiomYIHviSP3OQWmDPU3DeH4a1ZlT/N2lyQ5P8egjcRaT/Y9aNqUxK0WsnIIg==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true 
/@babel/helper-skip-transparent-expression-wrappers@7.22.5: resolution: {integrity: sha512-tK14r66JZKiC43p8Ki33yLBVJKlQDFoA8GYN67lWCDCqoL6EMMSuM9b+Iff2jHaM/RRFYl7K+iiru7hbRqNx8Q==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/helper-split-export-declaration@7.22.6: resolution: {integrity: sha512-AsUnxuLhRYsisFiaJwvp1QF+I3KjD5FOxut14q/GzovUe6orHLesW2C7d754kRm53h5gqrz6sFl6sxc4BVtE/g==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-split-export-declaration@7.24.7: resolution: {integrity: sha512-oy5V7pD+UvfkEATUKvIjvIAH/xCzfsFVw7ygW2SI6NClZzquT+mwdTfgfdbUiceh6iQO0CHtCPsyze/MZ2YbAA==} engines: {node: '>=6.9.0'} dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 /@babel/helper-string-parser@7.24.7: resolution: {integrity: sha512-7MbVt6xrwFQbunH2DNQsAP5sTGxfqQtErvBIvIMi6EQnbgUOuVYanvREcmFrOPhoXBrTtjhhP+lW+o5UfK+tDg==} engines: {node: '>=6.9.0'} - /@babel/helper-string-parser@7.24.8: - resolution: {integrity: sha512-pO9KhhRcuUyGnJWwyEgnRJTSIZHiT+vMD0kPeD+so0l7mxkMT19g3pjY9GTnHySck/hDzq+dtW/4VgnMkippsQ==} - engines: {node: '>=6.9.0'} - dev: false - /@babel/helper-string-parser@7.25.9: resolution: {integrity: sha512-4A/SCr/2KLd5jrtOMFzaKjVtAei3+2r/NChoBNoZ3EyP/+GlhoaEGoWOZUmFmoITP7zOJyHIMm+DYRd8o3PvHA==} engines: {node: '>=6.9.0'} @@ -4024,7 +4022,7 @@ packages: '@babel/helper-function-name': 7.24.7 '@babel/template': 7.24.7 '@babel/traverse': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color dev: true @@ -4035,7 +4033,7 @@ packages: dependencies: '@babel/template': 7.22.15 '@babel/traverse': 7.24.7 - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 transitivePeerDependencies: - supports-color @@ -4044,7 +4042,7 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/template': 7.25.0 - '@babel/types': 7.25.6 + '@babel/types': 7.26.8 dev: false /@babel/helpers@7.26.7: @@ -4059,7 
+4057,7 @@ packages: resolution: {integrity: sha512-C/BaXcnnvBCmHTpz/VGZ8jgtE2aYlW4hxDhseJAWZb7gqGM/qtCK6iZUb0TyKFf7BOUsBH7Q7fkRsDRhg1XklQ==} engines: {node: '>=6.9.0'} dependencies: - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-validator-identifier': 7.25.9 chalk: 2.4.2 js-tokens: 4.0.0 @@ -4067,7 +4065,7 @@ packages: resolution: {integrity: sha512-EStJpq4OuY8xYfhGVXngigBJRWxftKX9ksiGDnmlY3o7B/V7KIAc9X4oiK87uPJSc/vs5L869bem5fhZa8caZw==} engines: {node: '>=6.9.0'} dependencies: - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-validator-identifier': 7.25.9 chalk: 2.4.2 js-tokens: 4.0.0 picocolors: 1.1.1 @@ -4085,7 +4083,7 @@ packages: engines: {node: '>=6.0.0'} hasBin: true dependencies: - '@babel/types': 7.25.6 + '@babel/types': 7.26.8 dev: false /@babel/parser@7.24.5: @@ -4093,7 +4091,7 @@ packages: engines: {node: '>=6.0.0'} hasBin: true dependencies: - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 dev: false /@babel/parser@7.24.7: @@ -4103,14 +4101,6 @@ packages: dependencies: '@babel/types': 7.24.7 - /@babel/parser@7.25.6: - resolution: {integrity: sha512-trGdfBdbD0l1ZPmcJ83eNxB9rbEax4ALFTF7fN386TMYbeCQbyme5cOEXQhbGXKebwGaB/J52w1mrklMcbgy6Q==} - engines: {node: '>=6.0.0'} - hasBin: true - dependencies: - '@babel/types': 7.25.6 - dev: false - /@babel/parser@7.26.8: resolution: {integrity: sha512-TZIQ25pkSoaKEYYaHbbxkfL36GNsQ6iFiBbeuzAkLnXayKR1yP1zFe+NxuZWWsUyvt8icPU9CCq0sgWGXR1GEw==} engines: {node: '>=6.0.0'} @@ -4748,7 +4738,7 @@ packages: '@babel/helper-hoist-variables': 7.22.5 '@babel/helper-module-transforms': 7.22.17(@babel/core@7.22.17) '@babel/helper-plugin-utils': 7.24.0 - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-validator-identifier': 7.25.9 dev: true /@babel/plugin-transform-modules-umd@7.18.6(@babel/core@7.22.17): @@ -4845,7 +4835,7 @@ packages: '@babel/helper-module-imports': 7.22.15 '@babel/helper-plugin-utils': 7.24.0 '@babel/plugin-syntax-jsx': 7.22.5(@babel/core@7.22.17) - '@babel/types': 
7.24.7 + '@babel/types': 7.26.8 dev: true /@babel/plugin-transform-react-pure-annotations@7.18.6(@babel/core@7.22.17): @@ -5078,7 +5068,7 @@ packages: '@babel/helper-plugin-utils': 7.24.0 '@babel/plugin-proposal-unicode-property-regex': 7.18.6(@babel/core@7.22.17) '@babel/plugin-transform-dotall-regex': 7.18.6(@babel/core@7.22.17) - '@babel/types': 7.24.7 + '@babel/types': 7.26.8 esutils: 2.0.3 dev: true @@ -5141,24 +5131,24 @@ packages: engines: {node: '>=6.9.0'} dependencies: '@babel/code-frame': 7.22.13 - '@babel/parser': 7.24.7 - '@babel/types': 7.24.7 + '@babel/parser': 7.26.8 + '@babel/types': 7.26.8 /@babel/template@7.24.7: resolution: {integrity: sha512-jYqfPrU9JTF0PmPy1tLYHW4Mp4KlgxJD9l2nP9fD6yT/ICi554DmrWBAEYpIelzjHf1msDP3PxJIRt/nFNfBig==} engines: {node: '>=6.9.0'} dependencies: '@babel/code-frame': 7.24.7 - '@babel/parser': 7.24.7 - '@babel/types': 7.24.7 + '@babel/parser': 7.26.8 + '@babel/types': 7.26.8 /@babel/template@7.25.0: resolution: {integrity: sha512-aOOgh1/5XzKvg1jvVz7AVrx2piJ2XBi227DHmbY6y+bM9H2FlN+IfecYu4Xl0cNiiVejlsCri89LUsbj8vJD9Q==} engines: {node: '>=6.9.0'} dependencies: '@babel/code-frame': 7.24.7 - '@babel/parser': 7.25.6 - '@babel/types': 7.25.6 + '@babel/parser': 7.26.8 + '@babel/types': 7.26.8 dev: false /@babel/template@7.26.8: @@ -5198,9 +5188,9 @@ packages: '@babel/helper-function-name': 7.24.7 '@babel/helper-hoist-variables': 7.24.7 '@babel/helper-split-export-declaration': 7.24.7 - '@babel/parser': 7.24.7 - '@babel/types': 7.24.7 - debug: 4.3.7 + '@babel/parser': 7.26.8 + '@babel/types': 7.26.8 + debug: 4.4.0 globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -5211,10 +5201,10 @@ packages: dependencies: '@babel/code-frame': 7.24.7 '@babel/generator': 7.25.6 - '@babel/parser': 7.25.6 + '@babel/parser': 7.26.8 '@babel/template': 7.25.0 - '@babel/types': 7.25.6 - debug: 4.3.7 + '@babel/types': 7.26.8 + debug: 4.4.0 globals: 11.12.0 transitivePeerDependencies: - supports-color @@ -5239,8 +5229,8 @@ packages: 
resolution: {integrity: sha512-+j7a5c253RfKh8iABBhywc8NSfP5LURe7Uh4qpsh6jc+aLJguvmIUBdjSdEMQv2bENrCR5MfRdjGo7vzS/ob7w==} engines: {node: '>=6.9.0'} dependencies: - '@babel/helper-string-parser': 7.24.7 - '@babel/helper-validator-identifier': 7.24.7 + '@babel/helper-string-parser': 7.25.9 + '@babel/helper-validator-identifier': 7.25.9 to-fast-properties: 2.0.0 /@babel/types@7.24.7: @@ -5251,15 +5241,6 @@ packages: '@babel/helper-validator-identifier': 7.24.7 to-fast-properties: 2.0.0 - /@babel/types@7.25.6: - resolution: {integrity: sha512-/l42B1qxpG6RdfYf343Uw1vmDjeNhneUXtzhojE7pDgfpEypmRhI6j1kr17XCVv4Cgl9HdAiQY2x0GwKm7rWCw==} - engines: {node: '>=6.9.0'} - dependencies: - '@babel/helper-string-parser': 7.24.8 - '@babel/helper-validator-identifier': 7.24.7 - to-fast-properties: 2.0.0 - dev: false - /@babel/types@7.26.8: resolution: {integrity: sha512-eUuWapzEGWFEpHFxgEaBG8e3n6S8L3MSu0oda755rOfabWPnh0Our1AozNFVUxGFIhbKgd1ksprsoDGMinTOTA==} engines: {node: '>=6.9.0'} @@ -5270,6 +5251,11 @@ packages: /@balena/dockerignore@1.0.2: resolution: {integrity: sha512-wMue2Sy4GAVTk6Ic4tJVcnfdau+gx2EnG7S+uAEe+TWJFqE4YoWN4/H8MSLj4eYJKxGg26lZwboEniNiNwZQ6Q==} + /@bcoe/v8-coverage@1.0.2: + resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==} + engines: {node: '>=18'} + dev: true + /@bufbuild/protobuf@1.10.0: resolution: {integrity: sha512-QDdVFLoN93Zjg36NoQPZfsVH9tZew7wKDKyV5qRdj8ntT4wQCOradQjRaTdwMhWUYsgKsvCINKKm87FdEk96Ag==} dev: false @@ -7929,6 +7915,11 @@ packages: minipass: 7.1.2 dev: false + /@istanbuljs/schema@0.1.3: + resolution: {integrity: sha512-ZXRY4jNvVgSVQ8DL3LTcakaAtXwTVUxE81hslsyD2AtoXW/wVob10HkOJ1X/pAlcI7D+2YoZKg5do8G/w6RYgA==} + engines: {node: '>=8'} + dev: true + /@jest/schemas@29.6.3: resolution: {integrity: sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==} engines: {node: ^14.15.0 || ^16.10.0 || >=18.0.0} @@ -9830,7 +9821,7 @@ packages: engines: 
{node: '>=18'} hasBin: true dependencies: - debug: 4.3.7 + debug: 4.4.0 extract-zip: 2.0.1 progress: 2.0.3 proxy-agent: 6.4.0 @@ -17817,6 +17808,32 @@ packages: resolution: {integrity: sha512-17kVyLq3ePTKOkveHxXuIJZtGYs+cSoev7BlP+Lf4916qfDhk/HBjvlYDe8egrea7LNPHKwSZJK/bzZC+Q6AwQ==} dev: true + /@vitest/coverage-v8@3.0.8(vitest@3.0.8): + resolution: {integrity: sha512-y7SAKsQirsEJ2F8bulBck4DoluhI2EEgTimHd6EEUgJBGKy9tC25cpywh1MH4FvDGoG2Unt7+asVd1kj4qOSAw==} + peerDependencies: + '@vitest/browser': 3.0.8 + vitest: 3.0.8 + peerDependenciesMeta: + '@vitest/browser': + optional: true + dependencies: + '@ampproject/remapping': 2.3.0 + '@bcoe/v8-coverage': 1.0.2 + debug: 4.4.0 + istanbul-lib-coverage: 3.2.2 + istanbul-lib-report: 3.0.1 + istanbul-lib-source-maps: 5.0.6 + istanbul-reports: 3.1.7 + magic-string: 0.30.17 + magicast: 0.3.5 + std-env: 3.8.1 + test-exclude: 7.0.1 + tinyrainbow: 2.0.0 + vitest: 3.0.8(@types/node@20.14.14) + transitivePeerDependencies: + - supports-color + dev: true + /@vitest/expect@0.28.5: resolution: {integrity: sha512-gqTZwoUTwepwGIatnw4UKpQfnoyV0Z9Czn9+Lo2/jLIt4/AXLTn+oVZxlQ7Ng8bzcNkR+3DqLJ08kNr8jRmdNQ==} dependencies: @@ -17850,12 +17867,44 @@ packages: tinyrainbow: 1.2.0 dev: true + /@vitest/expect@3.0.8: + resolution: {integrity: sha512-Xu6TTIavTvSSS6LZaA3EebWFr6tsoXPetOWNMOlc7LO88QVVBwq2oQWBoDiLCN6YTvNYsGSjqOO8CAdjom5DCQ==} + dependencies: + '@vitest/spy': 3.0.8 + '@vitest/utils': 3.0.8 + chai: 5.2.0 + tinyrainbow: 2.0.0 + dev: true + + /@vitest/mocker@3.0.8(vite@5.2.7): + resolution: {integrity: sha512-n3LjS7fcW1BCoF+zWZxG7/5XvuYH+lsFg+BDwwAz0arIwHQJFUEsKBQ0BLU49fCxuM/2HSeBPHQD8WjgrxMfow==} + peerDependencies: + msw: ^2.4.9 + vite: ^5.0.0 || ^6.0.0 + peerDependenciesMeta: + msw: + optional: true + vite: + optional: true + dependencies: + '@vitest/spy': 3.0.8 + estree-walker: 3.0.3 + magic-string: 0.30.17 + vite: 5.2.7(@types/node@20.14.14) + dev: true + /@vitest/pretty-format@2.0.5: resolution: {integrity: 
sha512-h8k+1oWHfwTkyTkb9egzwNMfJAEx4veaPSnMeKbVSjp4euqGSbQlm5+6VHwTr7u4FJslVVsUG5nopCaAYdOmSQ==} dependencies: tinyrainbow: 1.2.0 dev: true + /@vitest/pretty-format@3.0.8: + resolution: {integrity: sha512-BNqwbEyitFhzYMYHUVbIvepOyeQOSFA/NeJMIP9enMntkkxLgOcgABH6fjyXG85ipTgvero6noreavGIqfJcIg==} + dependencies: + tinyrainbow: 2.0.0 + dev: true + /@vitest/runner@0.28.5: resolution: {integrity: sha512-NKkHtLB+FGjpp5KmneQjTcPLWPTDfB7ie+MmF1PnUBf/tGe2OjGxWyB62ySYZ25EYp9krR5Bw0YPLS/VWh1QiA==} dependencies: @@ -17887,6 +17936,13 @@ packages: pathe: 1.1.2 dev: true + /@vitest/runner@3.0.8: + resolution: {integrity: sha512-c7UUw6gEcOzI8fih+uaAXS5DwjlBaCJUo7KJ4VvJcjL95+DSR1kova2hFuRt3w41KZEFcOEiq098KkyrjXeM5w==} + dependencies: + '@vitest/utils': 3.0.8 + pathe: 2.0.3 + dev: true + /@vitest/snapshot@1.4.0: resolution: {integrity: sha512-saAFnt5pPIA5qDGxOHxJ/XxhMFKkUSBJmVt5VgDsAqPTX6JP326r5C/c9UuCMPoXNzuudTPsYDZCoJ5ilpqG2A==} dependencies: @@ -17911,6 +17967,14 @@ packages: pathe: 1.1.2 dev: true + /@vitest/snapshot@3.0.8: + resolution: {integrity: sha512-x8IlMGSEMugakInj44nUrLSILh/zy1f2/BgH0UeHpNyOocG18M9CWVIFBaXPt8TrqVZWmcPjwfG/ht5tnpba8A==} + dependencies: + '@vitest/pretty-format': 3.0.8 + magic-string: 0.30.17 + pathe: 2.0.3 + dev: true + /@vitest/spy@0.28.5: resolution: {integrity: sha512-7if6rsHQr9zbmvxN7h+gGh2L9eIIErgf8nSKYDlg07HHimCxp4H6I/X/DPXktVPPLQfiZ1Cw2cbDIx9fSqDjGw==} dependencies: @@ -17935,6 +17999,12 @@ packages: tinyspy: 3.0.0 dev: true + /@vitest/spy@3.0.8: + resolution: {integrity: sha512-MR+PzJa+22vFKYb934CejhR4BeRpMSoxkvNoDit68GQxRLSf11aT6CTj3XaqUU9rxgWJFnqicN/wxw6yBRkI1Q==} + dependencies: + tinyspy: 3.0.2 + dev: true + /@vitest/utils@0.28.5: resolution: {integrity: sha512-UyZdYwdULlOa4LTUSwZ+Paz7nBHGTT72jKwdFSV4IjHF1xsokp+CabMdhjvVhYwkLfO88ylJT46YMilnkSARZA==} dependencies: @@ -17972,6 +18042,14 @@ packages: tinyrainbow: 1.2.0 dev: true + /@vitest/utils@3.0.8: + resolution: {integrity: 
sha512-nkBC3aEhfX2PdtQI/QwAWp8qZWwzASsU4Npbcd5RdMPBSSLCpkZp52P3xku3s3uA0HIEhGvEcF8rNkBsz9dQ4Q==} + dependencies: + '@vitest/pretty-format': 3.0.8 + loupe: 3.1.3 + tinyrainbow: 2.0.0 + dev: true + /@vue/compiler-core@3.4.38: resolution: {integrity: sha512-8IQOTCWnLFqfHzOGm9+P8OPSEDukgg3Huc92qSG49if/xI2SAwLHQO2qaPQbjCWPBcQoO1WYfXfTACUrWV3c5A==} dependencies: @@ -17996,7 +18074,7 @@ packages: '@vue/compiler-ssr': 3.4.38 '@vue/shared': 3.4.38 estree-walker: 2.0.2 - magic-string: 0.30.11 + magic-string: 0.30.17 postcss: 8.4.44 source-map-js: 1.2.0 @@ -18298,7 +18376,7 @@ packages: resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==} engines: {node: '>= 6.0.0'} dependencies: - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color dev: false @@ -18307,7 +18385,7 @@ packages: resolution: {integrity: sha512-o/zjMZRhJxny7OyEF+Op8X+efiELC7k7yOjMzgfzVqOzXqkBkWI79YoTdOtsuWd5BWhAGAuOY/Xa6xpiaWXiNg==} engines: {node: '>= 14'} dependencies: - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color @@ -18315,7 +18393,7 @@ packages: resolution: {integrity: sha512-H0TSyFNDMomMNJQBn8wFV5YC/2eJ+VXECwOadZJT554xP6cODZHPX3H9QMQECxvrgiSOP1pHjy1sMWQVYJOUOA==} engines: {node: '>= 14'} dependencies: - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color @@ -19118,7 +19196,7 @@ packages: resolution: {integrity: sha512-fdRxJkQ9MUSEi4jH2DcV3FAPFktk0wefilxrwNyUuWpoWawQGN7G7cB+fOYTtFfI6XNkFgwqJ/D3G18BoJJ/jg==} engines: {node: '>= 10.0.0'} dependencies: - '@babel/types': 7.25.6 + '@babel/types': 7.26.8 dev: false /bail@2.0.2: @@ -19530,7 +19608,7 @@ packages: /capnp-ts@0.7.0: resolution: {integrity: sha512-XKxXAC3HVPv7r674zP0VC3RTXz+/JKhfyw94ljvF80yynK6VkTnqE3jMuN8b3dUVmmc43TjyxjW4KTsmB3c86g==} dependencies: - debug: 4.3.7 + debug: 4.4.0 tslib: 2.6.2 transitivePeerDependencies: - supports-color @@ -19556,7 +19634,7 @@ packages: assertion-error: 1.1.0 check-error: 1.0.3 
deep-eql: 4.1.3 - get-func-name: 2.0.0 + get-func-name: 2.0.2 loupe: 2.3.7 pathval: 1.1.1 type-detect: 4.0.8 @@ -19586,6 +19664,17 @@ packages: pathval: 2.0.0 dev: true + /chai@5.2.0: + resolution: {integrity: sha512-mCuXncKXk5iCLhfhwTc0izo0gtEmpz5CtG2y8GiOINBlMVS6v8TMRc5TaLWKS6692m9+dVVfzgeVxR5UxWHTYw==} + engines: {node: '>=12'} + dependencies: + assertion-error: 2.0.1 + check-error: 2.1.1 + deep-eql: 5.0.2 + loupe: 3.1.1 + pathval: 2.0.0 + dev: true + /chalk@2.4.2: resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==} engines: {node: '>=4'} @@ -20890,7 +20979,7 @@ packages: resolution: {integrity: sha512-f0ReSURdM3pcKPNS30mxOHSbaFLcknGmQjwSfmbcdOw1XWKXVhukM3NJHhr7NpY9BIyyWQb0EBo3KQvvuU5egQ==} engines: {node: '>= 8.0'} dependencies: - debug: 4.3.7 + debug: 4.4.0 readable-stream: 3.6.0 split-ca: 1.0.1 ssh2: 1.16.0 @@ -21319,6 +21408,10 @@ packages: /es-module-lexer@1.3.1: resolution: {integrity: sha512-JUFAyicQV9mXc3YRxPnDlrfBKpqt6hUYzz9/boprUJHs4e4KVr3XwOF70doO6gwXUor6EWZJAyWAfKki84t20Q==} + /es-module-lexer@1.6.0: + resolution: {integrity: sha512-qqnD1yMU6tk/jnaMosogGySTZP8YtUgAffA9nMN+E/rjxcfRQ6IEk7IiozUjgxKoFHBGjTLnrHB/YC45r/59EQ==} + dev: true + /es-object-atoms@1.0.0: resolution: {integrity: sha512-MZ4iQ6JwHOBQjahnjwaC1ZtIBH+2ohjamzAO3oaHcXYup7qxjF2fixyH+Q71voWHeOkI2q/TnJao/KfXYIZWbw==} engines: {node: '>= 0.4'} @@ -22510,6 +22603,11 @@ packages: resolution: {integrity: sha512-eNTPlAD67BmP31LDINZ3U7HSF8l57TxOY2PmBJ1shpCvpnxBF93mWCE8YHBnXs8qiUZJc9WDcWIeC3a2HIAMfw==} engines: {node: '>=6'} + /expect-type@1.2.0: + resolution: {integrity: sha512-80F22aiJ3GLyVnS/B3HzgR6RelZVumzj9jkL0Rhz4h0xYbNW9PjlQz5h3J/SShErbXBc295vseR4/MIbVmUbeA==} + engines: {node: '>=12.0.0'} + dev: true + /exponential-backoff@3.1.1: resolution: {integrity: sha512-dX7e/LHVJ6W3DE1MHWi9S1EYzDESENfLrYohG2G++ovZrYOkm4Knwa0mc1cn84xJOR4KEU0WSchhLbd0UklbHw==} dev: false @@ -22622,7 +22720,7 @@ packages: engines: {node: '>= 10.17.0'} 
hasBin: true dependencies: - debug: 4.3.7 + debug: 4.4.0 get-stream: 5.2.0 yauzl: 2.10.0 optionalDependencies: @@ -23162,10 +23260,6 @@ packages: resolution: {integrity: sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==} engines: {node: 6.* || 8.* || >= 10.*} - /get-func-name@2.0.0: - resolution: {integrity: sha512-Hm0ixYtaSZ/V7C8FJrtZIuBBI+iSgL+1Aq82zSu8VQNB4S3Gk8e7Qs3VwBDJAhmRZcFqkl3tQu36g/Foh5I5ig==} - dev: true - /get-func-name@2.0.2: resolution: {integrity: sha512-8vXOvuE167CtIc3OyItco7N/dpRtBbYOsPsXCz7X/PMnlGjYjSGuZJgM1Y7mmew7BKf9BqvLX2tnOVy1BBUsxQ==} dev: true @@ -23276,7 +23370,7 @@ packages: dependencies: basic-ftp: 5.0.3 data-uri-to-buffer: 5.0.1 - debug: 4.3.7 + debug: 4.4.0 fs-extra: 8.1.0 transitivePeerDependencies: - supports-color @@ -23340,6 +23434,18 @@ packages: path-scurry: 1.10.1 dev: false + /glob@10.4.5: + resolution: {integrity: sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg==} + hasBin: true + dependencies: + foreground-child: 3.1.1 + jackspeak: 3.4.3 + minimatch: 9.0.5 + minipass: 7.1.2 + package-json-from-dist: 1.0.0 + path-scurry: 1.11.1 + dev: true + /glob@11.0.0: resolution: {integrity: sha512-9UiX/Bl6J2yaBbxKoEBRm4Cipxgok8kQYcOPEhScPwebu2I0HoQOuYdIO6S3hLuWoZgpDpwQZMzTFxgpkyT76g==} engines: {node: 20 || >=22} @@ -23508,7 +23614,7 @@ packages: '@types/node': 20.14.14 '@types/semver': 7.5.1 chalk: 4.1.2 - debug: 4.3.7 + debug: 4.4.0 interpret: 3.1.1 semver: 7.6.3 tslib: 2.6.2 @@ -23762,7 +23868,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.0 - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color @@ -23771,7 +23877,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color dev: false @@ -23790,7 +23896,7 @@ packages: engines: {node: '>= 6'} dependencies: agent-base: 6.0.2 - debug: 4.3.7 + debug: 4.4.0 
transitivePeerDependencies: - supports-color dev: false @@ -23800,7 +23906,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color @@ -23809,7 +23915,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 transitivePeerDependencies: - supports-color dev: false @@ -24448,6 +24554,39 @@ packages: resolution: {integrity: sha512-Yljz7ffyPbrLpLngrMtZ7NduUgVvi6wG9RJ9IUcyCd59YQ911PBJphODUcbOVbqYfxe1wuYf/LJ8PauMRwsM/g==} dev: false + /istanbul-lib-coverage@3.2.2: + resolution: {integrity: sha512-O8dpsF+r0WV/8MNRKfnmrtCWhuKjxrq2w+jpzBL5UZKTi2LeVWnWOmWRxFlesJONmc+wLAGvKQZEOanko0LFTg==} + engines: {node: '>=8'} + dev: true + + /istanbul-lib-report@3.0.1: + resolution: {integrity: sha512-GCfE1mtsHGOELCU8e/Z7YWzpmybrx/+dSTfLrvY8qRmaY6zXTKWn6WQIjaAFw069icm6GVMNkgu0NzI4iPZUNw==} + engines: {node: '>=10'} + dependencies: + istanbul-lib-coverage: 3.2.2 + make-dir: 4.0.0 + supports-color: 7.2.0 + dev: true + + /istanbul-lib-source-maps@5.0.6: + resolution: {integrity: sha512-yg2d+Em4KizZC5niWhQaIomgf5WlL4vOOjZ5xGCmF8SnPE/mDWWXgvRExdcpCgh9lLRRa1/fSYp2ymmbJ1pI+A==} + engines: {node: '>=10'} + dependencies: + '@jridgewell/trace-mapping': 0.3.25 + debug: 4.4.0 + istanbul-lib-coverage: 3.2.2 + transitivePeerDependencies: + - supports-color + dev: true + + /istanbul-reports@3.1.7: + resolution: {integrity: sha512-BewmUXImeuRk2YY0PVbxgKAysvhRPUQE0h5QRM++nVWyubKGV0l8qQ5op8+B2DOmwSe63Jivj0BjkPQVf8fP5g==} + engines: {node: '>=8'} + dependencies: + html-escaper: 2.0.2 + istanbul-lib-report: 3.0.1 + dev: true + /jackspeak@2.3.6: resolution: {integrity: sha512-N3yCS/NegsOBokc8GAdM8UcmfsKiSS8cipheD/nivzr700H+nsMOxJjQnvwOcRYVuFkdH0wGUvW2WbXGmrZGbQ==} engines: {node: '>=14'} @@ -24456,6 +24595,14 @@ packages: optionalDependencies: '@pkgjs/parseargs': 0.11.0 + /jackspeak@3.4.3: + resolution: {integrity: 
sha512-OGlZQpz2yfahA/Rd1Y8Cd9SIEsqvXkLVoSw/cgwhnhFMDbsQFeZYoJJ7bIZBS9BcamUW96asq/npPWugM+RQBw==} + dependencies: + '@isaacs/cliui': 8.0.2 + optionalDependencies: + '@pkgjs/parseargs': 0.11.0 + dev: true + /jackspeak@4.0.1: resolution: {integrity: sha512-cub8rahkh0Q/bw1+GxP7aeSe29hHHn2V4m29nnDlvCdlgU+3UGxkZp7Z53jLUdpX3jdTO0nJZUDl3xvbWc2Xog==} engines: {node: 20 || >=22} @@ -25066,6 +25213,10 @@ packages: get-func-name: 2.0.2 dev: true + /loupe@3.1.3: + resolution: {integrity: sha512-kkIp7XSkP78ZxJEsSxW3712C6teJVoeHHwgo9zJ380de7IYyJ2ISlxojcH2pC5OFLewESmnRi/+XCDIEEVyoug==} + dev: true + /lowercase-keys@1.0.1: resolution: {integrity: sha512-G2Lj61tXDnVFFOi8VZds+SoQjtQC3dgokKdDG2mTm1tx4m50NUHBOZSBwQQHyy0V12A0JTG4icfZQH+xPyh8VA==} engines: {node: '>=0.10.0'} @@ -25080,6 +25231,10 @@ packages: resolution: {integrity: sha512-IJ4uwUTi2qCccrioU6g9g/5rvvVl13bsdczUUcqbciD9iLr095yj8DQKdObriEvuNSx325N1rV1O0sJFszx75g==} engines: {node: 14 || >=16.14} + /lru-cache@10.4.3: + resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} + dev: true + /lru-cache@11.0.0: resolution: {integrity: sha512-Qv32eSV1RSCfhY3fpPE2GNZ8jgM9X7rdAfemLWqTUxwiyIC4jJ6Sy0fZ8H+oLWevO6i4/bizg7c8d8i6bxrzbA==} engines: {node: 20 || >=22} @@ -25145,6 +25300,12 @@ packages: resolution: {integrity: sha512-+Wri9p0QHMy+545hKww7YAu5NyzF8iomPL/RQazugQ9+Ez4Ic3mERMd8ZTX5rfK944j+560ZJi8iAwgak1Ac7A==} dependencies: '@jridgewell/sourcemap-codec': 1.5.0 + dev: true + + /magic-string@0.30.17: + resolution: {integrity: sha512-sNPKHvyjVf7gyjwS4xGTaW/mCnF8wnjtifKBEhxfZ7E/S8tQ0rssrwGNn6q8JH/ohItJfSQp9mBtQYuTlH5QnA==} + dependencies: + '@jridgewell/sourcemap-codec': 1.5.0 /magic-string@0.30.8: resolution: {integrity: sha512-ISQTe55T2ao7XtlAStud6qwYPZjE4GK1S/BeVPus4jrq6JuOnQ00YKQC581RWhR122W7msZV263KzVeLoqidyQ==} @@ -25160,6 +25321,14 @@ packages: source-map-js: 1.2.0 dev: false + /magicast@0.3.5: + resolution: {integrity: 
sha512-L0WhttDl+2BOsybvEOLK7fW3UA0OQ0IQ2d6Zl2x/a6vVRs3bAY0ECOSHHeL5jD+SbOpOCUEi0y1DgHEn9Qn1AQ==} + dependencies: + '@babel/parser': 7.26.8 + '@babel/types': 7.26.8 + source-map-js: 1.2.0 + dev: true + /make-dir@3.1.0: resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==} engines: {node: '>=8'} @@ -25722,7 +25891,7 @@ packages: resolution: {integrity: sha512-6Mj0yHLdUZjHnOPgr5xfWIMqMWS12zDN6iws9SLuSz76W8jTtAv24MN4/CL7gJrl5vtxGInkkqDv/JIoRsQOvA==} dependencies: '@types/debug': 4.1.12 - debug: 4.3.7 + debug: 4.4.0 decode-named-character-reference: 1.0.2 micromark-core-commonmark: 1.0.6 micromark-factory-space: 1.0.0 @@ -25879,6 +26048,13 @@ packages: dependencies: brace-expansion: 2.0.1 + /minimatch@9.0.5: + resolution: {integrity: sha512-G6T0ZX48xgozx7587koeX9Ys2NYy6Gmv//P89sEte9V9whIapMNF4idKxnW2QtCcLiTWlb/wfCabAtAFWhhBow==} + engines: {node: '>=16 || 14 >=14.17'} + dependencies: + brace-expansion: 2.0.1 + dev: true + /minimist-options@4.1.0: resolution: {integrity: sha512-Q4r8ghd80yhO/0j1O3B2BjweX3fiHg9cdOwjJd2J76Q135c+NDxGCqdYKQ1SKBuFfgWbAUzBfvYjPUEeNgqN1A==} engines: {node: '>= 6'} @@ -27097,7 +27273,7 @@ packages: dependencies: '@tootallnate/quickjs-emscripten': 0.23.0 agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 get-uri: 6.0.1 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 @@ -27248,6 +27424,14 @@ packages: lru-cache: 10.0.1 minipass: 7.1.2 + /path-scurry@1.11.1: + resolution: {integrity: sha512-Xa4Nw17FS9ApQFJ9umLiJS4orGjm7ZzwUrwamcGQuHSzDyth9boKDaycYdDcZDuqYATXw4HFXgaqWTctW/v1HA==} + engines: {node: '>=16 || 14 >=14.18'} + dependencies: + lru-cache: 10.4.3 + minipass: 7.1.2 + dev: true + /path-scurry@2.0.0: resolution: {integrity: sha512-ypGJsmGtdXUOeM5u93TyeIEfEhM6s+ljAhrk5vAvSx8uyY/02OvrZnA0YNGUrPXfpJMgI1ODd3nwz8Npx4O4cg==} engines: {node: 20 || >=22} @@ -27290,6 +27474,10 @@ packages: /pathe@1.1.2: resolution: {integrity: 
sha512-whLdWMYL2TwI08hn8/ZqAbrVemu0LNaNNJZX73O6qaIdCTfXutsLhMkjdENX0qhsQ9uIimo4/aQOmXkoon2nDQ==} + /pathe@2.0.3: + resolution: {integrity: sha512-WUjGcAqP1gQacoQe+OBJsFA7Ld4DyXuUIjZ5cc75cLHvJ7dtNsTugphxIADwspS+AraAUePCKrSVtPLFj/F88w==} + dev: true + /pathval@1.1.1: resolution: {integrity: sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==} dev: true @@ -28175,7 +28363,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 http-proxy-agent: 7.0.2 https-proxy-agent: 7.0.5 lru-cache: 7.18.3 @@ -28232,7 +28420,7 @@ packages: dependencies: '@puppeteer/browsers': 2.4.0 chromium-bidi: 0.6.5(devtools-protocol@0.0.1342118) - debug: 4.3.7 + debug: 4.4.0 devtools-protocol: 0.0.1342118 typed-query-selector: 2.12.0 ws: 8.18.0 @@ -29276,7 +29464,7 @@ packages: remix-auth: ^3.6.0 dependencies: '@remix-run/server-runtime': 2.1.0(typescript@5.5.4) - debug: 4.3.7 + debug: 4.4.0 remix-auth: 3.6.0(@remix-run/react@2.1.0)(@remix-run/server-runtime@2.1.0) transitivePeerDependencies: - supports-color @@ -29396,7 +29584,7 @@ packages: resolution: {integrity: sha512-OScOjQjrrjhAdFpQmnkE/qbIBGCRFhQB/YaJhcC3CPOlmhe7llnW46Ac1J5+EjcNXOTnDdpF96Erw/yedsGksQ==} engines: {node: '>=8.6.0'} dependencies: - debug: 4.3.7 + debug: 4.4.0 module-details-from-path: 1.0.3 resolve: 1.22.8 transitivePeerDependencies: @@ -29795,7 +29983,7 @@ packages: resolution: {integrity: sha512-v67WcEouB5GxbTWL/4NeToqcZiAWEq90N888fczVArY8A79J0L4FD7vj5hm3eUMua5EpoQ59wa/oovY6TLvRUA==} engines: {node: '>= 18'} dependencies: - debug: 4.3.7 + debug: 4.4.0 destroy: 1.2.0 encodeurl: 2.0.0 escape-html: 1.0.3 @@ -30175,7 +30363,7 @@ packages: engines: {node: '>= 14'} dependencies: agent-base: 7.1.1 - debug: 4.3.7 + debug: 4.4.0 socks: 2.8.3 transitivePeerDependencies: - supports-color @@ -30783,7 +30971,7 @@ packages: estree-walker: 3.0.3 is-reference: 3.0.1 locate-character: 3.0.0 - magic-string: 0.30.11 + magic-string: 0.30.17 periscopic: 
3.1.0 /swr@2.2.5(react@18.3.1): @@ -31161,6 +31349,15 @@ packages: commander: 2.20.3 source-map-support: 0.5.21 + /test-exclude@7.0.1: + resolution: {integrity: sha512-pFYqmTw68LXVjeWJMST4+borgQP2AyMNbg1BpZh9LbyhUeNkeaPF9gzfPGUAnSMV3qPYdWUwDIjjCLiSDOl7vg==} + engines: {node: '>=18'} + dependencies: + '@istanbuljs/schema': 0.1.3 + glob: 10.4.5 + minimatch: 9.0.5 + dev: true + /testcontainers@10.13.1: resolution: {integrity: sha512-JBbOhxmygj/ouH/47GnoVNt+c55Telh/45IjVxEbDoswsLchVmJiuKiw/eF6lE5i7LN+/99xsrSCttI3YRtirg==} dependencies: @@ -31262,7 +31459,6 @@ packages: /tinyexec@0.3.2: resolution: {integrity: sha512-KQQR9yN7R5+OSwaK0XQoj22pwHoTlgYqmUscPYoknOoWCWfj/5/ABTMRi69FrKU5ffPVh5QcFikpWJI/P1ocHA==} - dev: false /tinyglobby@0.2.10: resolution: {integrity: sha512-Zc+8eJlFMvgatPZTl6A9L/yht8QqdmUNtURHaKZLmKBE12hNPSrqNkUp2cs3M/UKmNVVAMFQYSjYIVHDjW5zew==} @@ -31302,11 +31498,21 @@ packages: engines: {node: ^18.0.0 || >=20.0.0} dev: true + /tinypool@1.0.2: + resolution: {integrity: sha512-al6n+QEANGFOMf/dmUMsuS5/r9B06uwlyNjZZql/zv8J7ybHCgoihBNORZCY2mzUuAnomQa2JdhyHKzZxPCrFA==} + engines: {node: ^18.0.0 || >=20.0.0} + dev: true + /tinyrainbow@1.2.0: resolution: {integrity: sha512-weEDEq7Z5eTHPDh4xjX789+fHfF+P8boiFB+0vbWzpbnbsEr/GRaohi/uMKxg8RZMXnl1ItAi/IUHWMsjDV7kQ==} engines: {node: '>=14.0.0'} dev: true + /tinyrainbow@2.0.0: + resolution: {integrity: sha512-op4nsTR47R6p0vMUUoYl/a+ljLFVtlfaXkLQmqfLR1qHma1h/ysYk4hEXZ880bf2CYgTskvTa/e196Vd5dDQXw==} + engines: {node: '>=14.0.0'} + dev: true + /tinyspy@1.0.2: resolution: {integrity: sha512-bSGlgwLBYf7PnUsQ6WOc6SJ3pGOcd+d8AA6EUnLDDM0kWEstC1JIlSZA3UNliDXhd9ABoS7hiRBDCu+XP/sf1Q==} engines: {node: '>=14.0.0'} @@ -31322,6 +31528,11 @@ packages: engines: {node: '>=14.0.0'} dev: true + /tinyspy@3.0.2: + resolution: {integrity: sha512-n1cw8k1k0x4pgA2+9XrOkFydTerNcJ1zWCO5Nn9scWHTD+5tp8dghT2x1uduQePZTZgd3Tupf+x9BxJjeJi77Q==} + engines: {node: '>=14.0.0'} + dev: true + /tmp-promise@3.0.3: resolution: {integrity: 
sha512-RwM7MoPojPxsOBYnyd2hy0bxtIlVrihNs9pj5SUvY8Zz1sQcQG2tG1hSr8PDxfgEB8RNKDhqbIlroIarSNDNsQ==} dependencies: @@ -32530,7 +32741,7 @@ packages: hasBin: true dependencies: cac: 6.7.14 - debug: 4.3.7 + debug: 4.4.0 mlly: 1.7.1 pathe: 1.1.2 picocolors: 1.1.1 @@ -32554,7 +32765,7 @@ packages: hasBin: true dependencies: cac: 6.7.14 - debug: 4.3.7 + debug: 4.4.0 mlly: 1.7.1 pathe: 1.1.2 picocolors: 1.1.1 @@ -32578,7 +32789,7 @@ packages: hasBin: true dependencies: cac: 6.7.14 - debug: 4.3.7 + debug: 4.4.0 pathe: 1.1.2 picocolors: 1.1.1 vite: 5.2.7(@types/node@20.14.14) @@ -32620,7 +32831,7 @@ packages: hasBin: true dependencies: cac: 6.7.14 - debug: 4.3.7 + debug: 4.4.0 pathe: 1.1.2 tinyrainbow: 1.2.0 vite: 5.2.7(@types/node@20.14.14) @@ -32635,6 +32846,27 @@ packages: - terser dev: true + /vite-node@3.0.8(@types/node@20.14.14): + resolution: {integrity: sha512-6PhR4H9VGlcwXZ+KWCdMqbtG649xCPZqfI9j2PsK1FcXgEzro5bGHcVKFCTqPLaNKZES8Evqv4LwvZARsq5qlg==} + engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} + hasBin: true + dependencies: + cac: 6.7.14 + debug: 4.4.0 + es-module-lexer: 1.6.0 + pathe: 2.0.3 + vite: 5.2.7(@types/node@20.14.14) + transitivePeerDependencies: + - '@types/node' + - less + - lightningcss + - sass + - stylus + - sugarss + - supports-color + - terser + dev: true + /vite-tsconfig-paths@4.0.5(typescript@5.5.4): resolution: {integrity: sha512-/L/eHwySFYjwxoYt1WRJniuK/jPv+WGwgRGBYx3leciR5wBeqntQpUE6Js6+TJemChc+ter7fDBKieyEWDx4yQ==} dependencies: @@ -33045,6 +33277,66 @@ packages: - terser dev: true + /vitest@3.0.8(@types/node@20.14.14): + resolution: {integrity: sha512-dfqAsNqRGUc8hB9OVR2P0w8PZPEckti2+5rdZip0WIz9WW0MnImJ8XiR61QhqLa92EQzKP2uPkzenKOAHyEIbA==} + engines: {node: ^18.0.0 || ^20.0.0 || >=22.0.0} + hasBin: true + peerDependencies: + '@edge-runtime/vm': '*' + '@types/debug': ^4.1.12 + '@types/node': ^18.0.0 || ^20.0.0 || >=22.0.0 + '@vitest/browser': 3.0.8 + '@vitest/ui': 3.0.8 + happy-dom: '*' + jsdom: '*' + peerDependenciesMeta: + 
'@edge-runtime/vm': + optional: true + '@types/debug': + optional: true + '@types/node': + optional: true + '@vitest/browser': + optional: true + '@vitest/ui': + optional: true + happy-dom: + optional: true + jsdom: + optional: true + dependencies: + '@types/node': 20.14.14 + '@vitest/expect': 3.0.8 + '@vitest/mocker': 3.0.8(vite@5.2.7) + '@vitest/pretty-format': 3.0.8 + '@vitest/runner': 3.0.8 + '@vitest/snapshot': 3.0.8 + '@vitest/spy': 3.0.8 + '@vitest/utils': 3.0.8 + chai: 5.2.0 + debug: 4.4.0 + expect-type: 1.2.0 + magic-string: 0.30.17 + pathe: 2.0.3 + std-env: 3.8.1 + tinybench: 2.9.0 + tinyexec: 0.3.2 + tinypool: 1.0.2 + tinyrainbow: 2.0.0 + vite: 5.2.7(@types/node@20.14.14) + vite-node: 3.0.8(@types/node@20.14.14) + why-is-node-running: 2.3.0 + transitivePeerDependencies: + - less + - lightningcss + - msw + - sass + - stylus + - sugarss + - supports-color + - terser + dev: true + /vue@3.4.38(typescript@5.5.4): resolution: {integrity: sha512-f0ZgN+mZ5KFgVv9wz0f4OgVKukoXtS3nwET4c2vLBGQR50aI8G0cqbFtLlX9Yiyg3LFGBitruPHt2PxwTduJEw==} peerDependencies: diff --git a/references/hello-world/package.json b/references/hello-world/package.json index b8c3a8ad1b..005e19ed26 100644 --- a/references/hello-world/package.json +++ b/references/hello-world/package.json @@ -7,5 +7,8 @@ }, "dependencies": { "@trigger.dev/sdk": "workspace:*" + }, + "scripts": { + "dev": "trigger dev" } } \ No newline at end of file diff --git a/references/hello-world/src/trigger/example.ts b/references/hello-world/src/trigger/example.ts index e3e3a1c9a4..688ffa96cb 100644 --- a/references/hello-world/src/trigger/example.ts +++ b/references/hello-world/src/trigger/example.ts @@ -22,7 +22,12 @@ export const parentTask = task({ id: "parent", run: async (payload: any, { ctx }) => { logger.log("Hello, world from the parent", { payload }); - await childTask.triggerAndWait({ message: "Hello, world!" }); + await childTask.triggerAndWait( + { message: "Hello, world!" 
}, + { + releaseConcurrency: true, + } + ); }, }); diff --git a/references/hello-world/src/trigger/waits.ts b/references/hello-world/src/trigger/waits.ts index 0749cde17d..f3a84a7a52 100644 --- a/references/hello-world/src/trigger/waits.ts +++ b/references/hello-world/src/trigger/waits.ts @@ -73,7 +73,12 @@ export const waitForDuration = task({ }) => { const idempotency = idempotencyKey ? await idempotencyKeys.create(idempotencyKey) : undefined; - await wait.for({ seconds: duration, idempotencyKey: idempotency, idempotencyKeyTTL }); + await wait.for({ + seconds: duration, + idempotencyKey: idempotency, + idempotencyKeyTTL, + releaseConcurrency: true, + }); await wait.until({ date: new Date(Date.now() + duration * 1000) }); await retry.fetch("https://example.com/404", { method: "GET" }); diff --git a/references/test-tasks/src/trigger/test-reserve-concurrency-system.ts b/references/test-tasks/src/trigger/test-reserve-concurrency-system.ts index 05b8eba9a4..4a0b044791 100644 --- a/references/test-tasks/src/trigger/test-reserve-concurrency-system.ts +++ b/references/test-tasks/src/trigger/test-reserve-concurrency-system.ts @@ -1,4 +1,4 @@ -import { logger, task } from "@trigger.dev/sdk/v3"; +import { batch, logger, task } from "@trigger.dev/sdk/v3"; import assert from "assert"; import { getEnvironmentStats, @@ -293,8 +293,10 @@ export const testEnvReserveConcurrency = task({ })) ); + const retrievedHoldBatch = await batch.retrieve(holdBatch.batchId); + // Wait for the hold tasks to be executing - await Promise.all(holdBatch.runs.map((run) => waitForRunStatus(run.id, ["EXECUTING"]))); + await Promise.all(retrievedHoldBatch.runs.map((run) => waitForRunStatus(run, ["EXECUTING"]))); // Now we will trigger a parent task that will trigger a child task const parentRun = await genericParentTask.trigger( @@ -341,7 +343,7 @@ export const testEnvReserveConcurrency = task({ ); // Wait for the hold tasks to be completed - await Promise.all(holdBatch.runs.map((run) => 
waitForRunStatus(run.id, ["COMPLETED"]))); + await Promise.all(retrievedHoldBatch.runs.map((run) => waitForRunStatus(run, ["COMPLETED"]))); await updateEnvironmentConcurrencyLimit(ctx.environment.id, 100); diff --git a/references/test-tasks/src/utils.ts b/references/test-tasks/src/utils.ts index 6bbbc38ab8..5fa43866d7 100644 --- a/references/test-tasks/src/utils.ts +++ b/references/test-tasks/src/utils.ts @@ -74,9 +74,7 @@ const EnvironmentStatsResponseBody = z.object({ id: z.string(), concurrencyLimit: z.number(), currentConcurrency: z.number(), - reserveConcurrency: z.number(), queueConcurrency: z.number().optional(), - queueReserveConcurrency: z.number().optional(), queueCurrentConcurrency: z.number().optional(), });