diff --git a/.vscode/launch.json b/.vscode/launch.json index b82a2d5228..6d08392086 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -146,7 +146,7 @@ "type": "node-terminal", "request": "launch", "name": "Debug RunQueue tests", - "command": "pnpm run test ./src/run-queue/index.test.ts", + "command": "pnpm run test ./src/run-queue/index.test.ts --run", "cwd": "${workspaceFolder}/internal-packages/run-engine", "sourceMaps": true }, diff --git a/apps/coordinator/tsconfig.json b/apps/coordinator/tsconfig.json index 15cdfe9c1e..e03fd02412 100644 --- a/apps/coordinator/tsconfig.json +++ b/apps/coordinator/tsconfig.json @@ -1,6 +1,6 @@ { "compilerOptions": { - "target": "es2018", + "target": "es2020", "module": "commonjs", "esModuleInterop": true, "resolveJsonModule": true, diff --git a/apps/docker-provider/tsconfig.json b/apps/docker-provider/tsconfig.json index 3a866dd2b8..f87adfc2d7 100644 --- a/apps/docker-provider/tsconfig.json +++ b/apps/docker-provider/tsconfig.json @@ -1,6 +1,6 @@ { "compilerOptions": { - "target": "es2018", + "target": "es2020", "module": "commonjs", "esModuleInterop": true, "forceConsistentCasingInFileNames": true, diff --git a/apps/kubernetes-provider/tsconfig.json b/apps/kubernetes-provider/tsconfig.json index 4635e17647..6ec7865b64 100644 --- a/apps/kubernetes-provider/tsconfig.json +++ b/apps/kubernetes-provider/tsconfig.json @@ -1,6 +1,6 @@ { "compilerOptions": { - "target": "es2018", + "target": "es2020", "module": "commonjs", "esModuleInterop": true, "forceConsistentCasingInFileNames": true, diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index 02377169a3..8aaff96ce5 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -324,6 +324,10 @@ const EnvironmentSchema = z.object({ INTERNAL_OTEL_TRACE_DISABLED: z.string().default("0"), INTERNAL_OTEL_LOG_EXPORTER_URL: z.string().optional(), + INTERNAL_OTEL_METRIC_EXPORTER_URL: z.string().optional(), + 
INTERNAL_OTEL_METRIC_EXPORTER_AUTH_HEADERS: z.string().optional(), + INTERNAL_OTEL_METRIC_EXPORTER_ENABLED: z.string().default("0"), + INTERNAL_OTEL_METRIC_EXPORTER_INTERVAL_MS: z.coerce.number().int().default(30_000), ORG_SLACK_INTEGRATION_CLIENT_ID: z.string().optional(), ORG_SLACK_INTEGRATION_CLIENT_SECRET: z.string().optional(), @@ -460,8 +464,12 @@ const EnvironmentSchema = z.object({ RUN_ENGINE_QUEUE_AGE_RANDOMIZATION_BIAS: z.coerce.number().default(0.25), RUN_ENGINE_REUSE_SNAPSHOT_COUNT: z.coerce.number().int().default(0), RUN_ENGINE_MAXIMUM_ENV_COUNT: z.coerce.number().int().optional(), + RUN_ENGINE_RUN_QUEUE_SHARD_COUNT: z.coerce.number().int().default(4), RUN_ENGINE_WORKER_SHUTDOWN_TIMEOUT_MS: z.coerce.number().int().default(60_000), RUN_ENGINE_RETRY_WARM_START_THRESHOLD_MS: z.coerce.number().int().default(30_000), + RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS: z.coerce.number().int().default(200), + RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS: z.coerce.number().int().default(10), + RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS: z.coerce.number().int().default(500), RUN_ENGINE_WORKER_REDIS_HOST: z .string() @@ -617,6 +625,7 @@ const EnvironmentSchema = z.object({ RUN_ENGINE_RELEASE_CONCURRENCY_BATCH_SIZE: z.coerce.number().int().default(10), RUN_ENGINE_WORKER_ENABLED: z.string().default("1"), + RUN_ENGINE_WORKER_LOG_LEVEL: z.enum(["log", "error", "warn", "info", "debug"]).default("info"), /** How long should the presence ttl last */ DEV_PRESENCE_SSE_TIMEOUT: z.coerce.number().int().default(30_000), diff --git a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts index 3caad87910..262184dcb9 100644 --- a/apps/webapp/app/presenters/v3/SpanPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/SpanPresenter.server.ts @@ -145,8 +145,7 @@ export class SpanPresenter extends BasePresenter { }, }, engine: true, - masterQueue: true, - secondaryMasterQueue: true, + workerQueue: true, error: true, output: 
true, outputType: true, @@ -364,8 +363,7 @@ export class SpanPresenter extends BasePresenter { maxDurationInSeconds: getMaxDuration(run.maxDurationInSeconds), batch: run.batch ? { friendlyId: run.batch.friendlyId } : undefined, engine: run.engine, - masterQueue: run.masterQueue, - secondaryMasterQueue: run.secondaryMasterQueue, + workerQueue: run.workerQueue, spanId: run.spanId, isCached: !!span.originalRun, }; diff --git a/apps/webapp/app/routes/admin.api.v1.migrate-legacy-master-queues.ts b/apps/webapp/app/routes/admin.api.v1.migrate-legacy-master-queues.ts new file mode 100644 index 0000000000..b960287c92 --- /dev/null +++ b/apps/webapp/app/routes/admin.api.v1.migrate-legacy-master-queues.ts @@ -0,0 +1,37 @@ +import { ActionFunctionArgs, json } from "@remix-run/server-runtime"; +import { prisma } from "~/db.server"; +import { authenticateApiRequestWithPersonalAccessToken } from "~/services/personalAccessToken.server"; +import { engine } from "~/v3/runEngine.server"; + +export async function action({ request }: ActionFunctionArgs) { + // Next authenticate the request + const authenticationResult = await authenticateApiRequestWithPersonalAccessToken(request); + + if (!authenticationResult) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + const user = await prisma.user.findUnique({ + where: { + id: authenticationResult.userId, + }, + }); + + if (!user) { + return json({ error: "Invalid or Missing API key" }, { status: 401 }); + } + + if (!user.admin) { + return json({ error: "You must be an admin to perform this action" }, { status: 403 }); + } + + try { + await engine.migrateLegacyMasterQueues(); + + return json({ + success: true, + }); + } catch (error) { + return json({ error: error instanceof Error ? 
error.message : error }, { status: 400 }); + } +} diff --git a/apps/webapp/app/routes/engine.v1.dev.dequeue.ts b/apps/webapp/app/routes/engine.v1.dev.dequeue.ts index 048ccdbca7..b7aa926a97 100644 --- a/apps/webapp/app/routes/engine.v1.dev.dequeue.ts +++ b/apps/webapp/app/routes/engine.v1.dev.dequeue.ts @@ -1,92 +1,22 @@ import { json } from "@remix-run/server-runtime"; -import { DequeuedMessage, DevDequeueRequestBody, MachineResources } from "@trigger.dev/core/v3"; -import { BackgroundWorkerId } from "@trigger.dev/core/v3/isomorphic"; -import { env } from "~/env.server"; +import { DevDequeueRequestBody } from "@trigger.dev/core/v3"; import { createActionApiRoute } from "~/services/routeBuilders/apiBuilder.server"; import { engine } from "~/v3/runEngine.server"; const { action } = createActionApiRoute( { - body: DevDequeueRequestBody, + body: DevDequeueRequestBody, // Even though we don't use it, we need to keep it for backwards compatibility maxContentLength: 1024 * 10, // 10KB method: "POST", }, - async ({ authentication, body }) => { - //we won't return more runs than this in one API call - let maxDequeueCount = env.DEV_DEQUEUE_MAX_RUNS_PER_PULL; - - //we can't use more than the max resources - const availableResources = body.maxResources ?? 
{ - cpu: 8, - memory: 16, - }; - - let dequeuedMessages: DequeuedMessage[] = []; - - //we need to check the current worker, because a run might have been locked to it - const workers = body.oldWorkers.concat(body.currentWorker); - - //first we want to clear out old runs - for (const worker of workers) { - //dequeue - const latestResult = await engine.dequeueFromBackgroundWorkerMasterQueue({ - consumerId: authentication.environment.id, - //specific version - backgroundWorkerId: BackgroundWorkerId.toId(worker), - maxRunCount: maxDequeueCount, - maxResources: availableResources, - }); - - //add runs to the array - dequeuedMessages.push(...latestResult); - - //update availableResources - const consumedResources = latestResult.reduce( - (acc, r) => { - return { - cpu: acc.cpu + r.run.machine.cpu, - memory: acc.memory + r.run.machine.memory, - }; - }, - { cpu: 0, memory: 0 } - ); - updateAvailableResources(availableResources, consumedResources); - - //update maxDequeueCount - maxDequeueCount -= latestResult.length; - - //if we have no resources left, we exit the loop - if (!hasAvailableResources(availableResources)) break; - //we've already dequeued the max number of runs - if (maxDequeueCount <= 0) break; - } - - //dequeue from the current version if we still have space - if (hasAvailableResources(availableResources) && maxDequeueCount > 0) { - const latestResult = await engine.dequeueFromEnvironmentMasterQueue({ - consumerId: authentication.environment.id, - //current dev version (no specific version specified) - environmentId: authentication.environment.id, - maxRunCount: maxDequeueCount, - maxResources: availableResources, - }); - dequeuedMessages.push(...latestResult); - } + async ({ authentication }) => { + const dequeuedMessages = await engine.dequeueFromEnvironmentWorkerQueue({ + consumerId: authentication.environment.id, + environmentId: authentication.environment.id, + }); return json({ dequeuedMessages }, { status: 200 }); } ); -function 
updateAvailableResources( - availableResources: MachineResources, - resources: MachineResources -) { - availableResources.cpu -= resources.cpu; - availableResources.memory -= resources.memory; -} - -function hasAvailableResources(availableResources: MachineResources) { - return availableResources.cpu > 0 && availableResources.memory > 0; -} - export { action }; diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts index 76b0f0d3f9..b3495b1371 100644 --- a/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts +++ b/apps/webapp/app/routes/engine.v1.worker-actions.deployments.$deploymentFriendlyId.dequeue.ts @@ -1,10 +1,9 @@ import { json, TypedResponse } from "@remix-run/server-runtime"; -import { CURRENT_DEPLOYMENT_LABEL } from "@trigger.dev/core/v3/isomorphic"; import { WorkerApiDequeueResponseBody } from "@trigger.dev/core/v3/workers"; import { z } from "zod"; -import { $replica, prisma } from "~/db.server"; import { createLoaderWorkerApiRoute } from "~/services/routeBuilders/apiBuilder.server"; +// Keep this route for backwards compatibility export const loader = createLoaderWorkerApiRoute( { params: z.object({ @@ -14,55 +13,7 @@ export const loader = createLoaderWorkerApiRoute( maxRunCount: z.coerce.number().optional(), }), }, - async ({ - authenticatedWorker, - params, - searchParams, - }): Promise> => { - const deployment = await $replica.workerDeployment.findUnique({ - where: { - friendlyId: params.deploymentFriendlyId, - }, - include: { - worker: true, - }, - }); - - if (!deployment) { - throw new Error("Deployment not found"); - } - - if (!deployment.worker) { - throw new Error("Worker not found"); - } - - const dequeuedMessages = (await isCurrentDeployment(deployment.id, deployment.environmentId)) - ? 
await authenticatedWorker.dequeueFromEnvironment( - deployment.worker.id, - deployment.environmentId - ) - : await authenticatedWorker.dequeueFromVersion( - deployment.worker.id, - searchParams.maxRunCount - ); - - return json(dequeuedMessages); + async (): Promise> => { + return json([]); } ); - -async function isCurrentDeployment(deploymentId: string, environmentId: string): Promise { - const promotion = await prisma.workerDeploymentPromotion.findUnique({ - where: { - environmentId_label: { - environmentId, - label: CURRENT_DEPLOYMENT_LABEL, - }, - }, - }); - - if (!promotion) { - return false; - } - - return promotion.deploymentId === deploymentId; -} diff --git a/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts b/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts index 43eea55317..bdc6456236 100644 --- a/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts +++ b/apps/webapp/app/routes/engine.v1.worker-actions.dequeue.ts @@ -7,14 +7,9 @@ import { createActionWorkerApiRoute } from "~/services/routeBuilders/apiBuilder. 
export const action = createActionWorkerApiRoute( { - body: WorkerApiDequeueRequestBody, + body: WorkerApiDequeueRequestBody, // Even though we don't use it, we need to keep it for backwards compatibility }, - async ({ authenticatedWorker, body }): Promise> => { - return json( - await authenticatedWorker.dequeue({ - maxResources: body.maxResources, - maxRunCount: body.maxRunCount, - }) - ); + async ({ authenticatedWorker }): Promise> => { + return json(await authenticatedWorker.dequeue()); } ); diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx index d0fa15bf54..e8d527a6c2 100644 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.runs.$runParam.spans.$spanParam/route.tsx @@ -700,12 +700,8 @@ function RunBody({ {isAdmin && ( <> - Primary master queue - {run.masterQueue} - - - Secondary master queue - {run.secondaryMasterQueue ?? 
"–"} + Worker queue + {run.workerQueue} )} diff --git a/apps/webapp/app/runEngine/concerns/queues.server.ts b/apps/webapp/app/runEngine/concerns/queues.server.ts index 7701135516..c8bda40d22 100644 --- a/apps/webapp/app/runEngine/concerns/queues.server.ts +++ b/apps/webapp/app/runEngine/concerns/queues.server.ts @@ -196,9 +196,9 @@ export class DefaultQueueManager implements QueueManager { }; } - async getMasterQueue(environment: AuthenticatedEnvironment): Promise { + async getWorkerQueue(environment: AuthenticatedEnvironment): Promise { if (environment.type === "DEVELOPMENT") { - return; + return environment.id; } const workerGroupService = new WorkerGroupService({ diff --git a/apps/webapp/app/runEngine/services/triggerTask.server.ts b/apps/webapp/app/runEngine/services/triggerTask.server.ts index a696758404..29db094bfd 100644 --- a/apps/webapp/app/runEngine/services/triggerTask.server.ts +++ b/apps/webapp/app/runEngine/services/triggerTask.server.ts @@ -234,7 +234,7 @@ export class RunEngineTriggerTaskService { lockedQueueId, }); - const masterQueue = await this.queueConcern.getMasterQueue(environment); + const workerQueue = await this.queueConcern.getWorkerQueue(environment); try { return await this.traceEventConcern.traceRun(triggerRequest, async (event) => { @@ -271,7 +271,7 @@ export class RunEngineTriggerTaskService { concurrencyKey: body.options?.concurrencyKey, queue: queueName, lockedQueueId, - masterQueue: masterQueue, + workerQueue, isTest: body.options?.test ?? false, delayUntil, queuedAt: delayUntil ? 
undefined : new Date(), diff --git a/apps/webapp/app/runEngine/types.ts b/apps/webapp/app/runEngine/types.ts index e953f53169..f9853215b4 100644 --- a/apps/webapp/app/runEngine/types.ts +++ b/apps/webapp/app/runEngine/types.ts @@ -66,7 +66,7 @@ export interface QueueManager { ): Promise; getQueueName(request: TriggerTaskRequest): Promise; validateQueueLimits(env: AuthenticatedEnvironment): Promise; - getMasterQueue(env: AuthenticatedEnvironment): Promise; + getWorkerQueue(env: AuthenticatedEnvironment): Promise; } export interface PayloadProcessor { diff --git a/apps/webapp/app/services/deleteProject.server.ts b/apps/webapp/app/services/deleteProject.server.ts index 8af67330a2..8f069aaf14 100644 --- a/apps/webapp/app/services/deleteProject.server.ts +++ b/apps/webapp/app/services/deleteProject.server.ts @@ -41,15 +41,9 @@ export class DeleteProjectService { } // Delete all queues from the RunEngine 2 prod master queues - const workerGroups = await this.#prismaClient.workerInstanceGroup.findMany({ - select: { - masterQueue: true, - }, - }); - const engineMasterQueues = workerGroups.map((group) => group.masterQueue); - for (const masterQueue of engineMasterQueues) { + for (const environment of project.environments) { await engine.removeEnvironmentQueuesFromMasterQueue({ - masterQueue, + runtimeEnvironmentId: environment.id, organizationId: project.organization.id, projectId: project.id, }); diff --git a/apps/webapp/app/v3/runEngine.server.ts b/apps/webapp/app/v3/runEngine.server.ts index bd619391e1..bd5983ce2a 100644 --- a/apps/webapp/app/v3/runEngine.server.ts +++ b/apps/webapp/app/v3/runEngine.server.ts @@ -4,7 +4,7 @@ import { prisma } from "~/db.server"; import { env } from "~/env.server"; import { singleton } from "~/utils/singleton"; import { allMachines } from "./machinePresets.server"; -import { tracer } from "./tracer.server"; +import { tracer, meter } from "./tracer.server"; export const engine = singleton("RunEngine", createRunEngine); @@ -13,6 +13,7 @@ 
export type { RunEngine }; function createRunEngine() { const engine = new RunEngine({ prisma, + logLevel: env.RUN_ENGINE_WORKER_LOG_LEVEL, worker: { disabled: env.RUN_ENGINE_WORKER_ENABLED === "0", workers: env.RUN_ENGINE_WORKER_COUNT, @@ -58,6 +59,11 @@ function createRunEngine() { maximumEnvCount: env.RUN_ENGINE_MAXIMUM_ENV_COUNT, tracer, }, + shardCount: env.RUN_ENGINE_RUN_QUEUE_SHARD_COUNT, + processWorkerQueueDebounceMs: env.RUN_ENGINE_PROCESS_WORKER_QUEUE_DEBOUNCE_MS, + dequeueBlockingTimeoutSeconds: env.RUN_ENGINE_DEQUEUE_BLOCKING_TIMEOUT_SECONDS, + masterQueueConsumersIntervalMs: env.RUN_ENGINE_MASTER_QUEUE_CONSUMERS_INTERVAL_MS, + masterQueueConsumersDisabled: env.RUN_ENGINE_WORKER_ENABLED === "0", }, runLock: { redis: { @@ -71,6 +77,7 @@ function createRunEngine() { }, }, tracer, + meter, heartbeatTimeoutsMs: { PENDING_EXECUTING: env.RUN_ENGINE_TIMEOUT_PENDING_EXECUTING, PENDING_CANCEL: env.RUN_ENGINE_TIMEOUT_PENDING_CANCEL, diff --git a/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts b/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts index ecdb49724e..ecc4f6251a 100644 --- a/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts +++ b/apps/webapp/app/v3/services/worker/workerGroupTokenService.server.ts @@ -1,5 +1,5 @@ import { customAlphabet } from "nanoid"; -import { WithRunEngine, WithRunEngineOptions } from "../baseService.server"; +import { ServiceValidationError, WithRunEngine, WithRunEngineOptions } from "../baseService.server"; import { createHash, timingSafeEqual } from "crypto"; import { logger } from "~/services/logger.server"; import { @@ -537,19 +537,11 @@ export class AuthenticatedWorkerInstance extends WithRunEngine { }); } - async dequeue({ - maxRunCount = 10, - maxResources, - }: { - maxRunCount?: number; - maxResources?: MachineResources; - } = {}): Promise { + async dequeue(): Promise { if (this.type === WorkerInstanceGroupType.MANAGED) { - return await 
this._engine.dequeueFromMasterQueue({ + return await this._engine.dequeueFromWorkerQueue({ consumerId: this.workerInstanceId, - masterQueue: this.masterQueue, - maxRunCount, - maxResources, + workerQueue: this.masterQueue, workerId: this.workerInstanceId, runnerId: this.runnerId, }); @@ -572,51 +564,21 @@ export class AuthenticatedWorkerInstance extends WithRunEngine { }); if (this.isLatestDeployment) { - return await this._engine.dequeueFromEnvironmentMasterQueue({ + return await this._engine.dequeueFromEnvironmentWorkerQueue({ consumerId: this.workerInstanceId, environmentId: this.environment.id, - maxRunCount, - backgroundWorkerId: this.backgroundWorkerId, workerId: this.workerInstanceId, runnerId: this.runnerId, }); } - return await this._engine.dequeueFromBackgroundWorkerMasterQueue({ - consumerId: this.workerInstanceId, - backgroundWorkerId: this.backgroundWorkerId, - maxRunCount, - workerId: this.workerInstanceId, - runnerId: this.runnerId, - }); - } - - /** Allows managed workers to dequeue from a specific version */ - async dequeueFromVersion( - backgroundWorkerId: string, - maxRunCount = 1 - ): Promise { - if (this.type !== WorkerInstanceGroupType.MANAGED) { - logger.error("[AuthenticatedWorkerInstance] Worker instance is not managed", { - ...this.toJSON(), - }); - return []; - } - - return await this._engine.dequeueFromBackgroundWorkerMasterQueue({ - consumerId: this.workerInstanceId, - backgroundWorkerId, - maxRunCount, - workerId: this.workerInstanceId, - runnerId: this.runnerId, - }); + throw new ServiceValidationError("Unmanaged workers cannot dequeue from a specific version"); } /** Allows managed workers to dequeue from a specific environment */ async dequeueFromEnvironment( backgroundWorkerId: string, - environmentId: string, - maxRunCount = 1 + environmentId: string ): Promise { if (this.type !== WorkerInstanceGroupType.MANAGED) { logger.error("[AuthenticatedWorkerInstance] Worker instance is not managed", { @@ -625,11 +587,10 @@ export class 
AuthenticatedWorkerInstance extends WithRunEngine { return []; } - return await this._engine.dequeueFromEnvironmentMasterQueue({ + return await this._engine.dequeueFromEnvironmentWorkerQueue({ consumerId: this.workerInstanceId, backgroundWorkerId, environmentId, - maxRunCount, workerId: this.workerInstanceId, runnerId: this.runnerId, }); diff --git a/apps/webapp/app/v3/tracer.server.ts b/apps/webapp/app/v3/tracer.server.ts index fc676eb424..cf47df248b 100644 --- a/apps/webapp/app/v3/tracer.server.ts +++ b/apps/webapp/app/v3/tracer.server.ts @@ -11,6 +11,8 @@ import { Tracer, diag, trace, + metrics, + Meter, } from "@opentelemetry/api"; import { logs, SeverityNumber } from "@opentelemetry/api-logs"; import { OTLPTraceExporter } from "@opentelemetry/exporter-trace-otlp-http"; @@ -19,6 +21,12 @@ import { BatchLogRecordProcessor, LoggerProvider } from "@opentelemetry/sdk-logs import { type Instrumentation, registerInstrumentations } from "@opentelemetry/instrumentation"; import { ExpressInstrumentation } from "@opentelemetry/instrumentation-express"; import { HttpInstrumentation } from "@opentelemetry/instrumentation-http"; +import { + MeterProvider, + ConsoleMetricExporter, + PeriodicExportingMetricReader, +} from "@opentelemetry/sdk-metrics"; +import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-proto"; import { Resource } from "@opentelemetry/resources"; import { BatchSpanProcessor, @@ -30,7 +38,10 @@ import { TraceIdRatioBasedSampler, } from "@opentelemetry/sdk-trace-base"; import { NodeTracerProvider } from "@opentelemetry/sdk-trace-node"; -import { SEMRESATTRS_SERVICE_NAME } from "@opentelemetry/semantic-conventions"; +import { + SEMRESATTRS_SERVICE_INSTANCE_ID, + SEMRESATTRS_SERVICE_NAME, +} from "@opentelemetry/semantic-conventions"; import { PrismaInstrumentation } from "@prisma/instrumentation"; import { env } from "~/env.server"; import { AuthenticatedEnvironment } from "~/services/apiAuth.server"; @@ -38,9 +49,13 @@ import { singleton } 
from "~/utils/singleton"; import { LoggerSpanExporter } from "./telemetry/loggerExporter.server"; import { logger } from "~/services/logger.server"; import { flattenAttributes } from "@trigger.dev/core/v3"; +import { randomUUID } from "node:crypto"; +import { prisma } from "~/db.server"; export const SEMINTATTRS_FORCE_RECORDING = "forceRecording"; +const SERVICE_INSTANCE_ID = randomUUID(); + class CustomWebappSampler implements Sampler { constructor(private readonly _baseSampler: Sampler) {} @@ -83,7 +98,12 @@ class CustomWebappSampler implements Sampler { } } -export const { tracer, logger: otelLogger, provider } = singleton("tracer", getTracer); +export const { + tracer, + logger: otelLogger, + provider, + meter, +} = singleton("opentelemetry", setupTelemetry); export async function startActiveSpan( name: string, @@ -148,7 +168,7 @@ export async function emitWarnLog(message: string, params: Record g.key === "prisma_pool_connections_busy")?.value ?? 0; + const free = gauges.find((g) => g.key === "prisma_pool_connections_idle")?.value ?? 0; + const total = + gauges.find((g) => g.key === "prisma_pool_connections_open")?.value ?? busy + free; // fallback compute + + return { total, busy, free }; + } + + meter.addBatchObservableCallback( + async (res) => { + const { total, busy, free } = await readPoolCounters(); + res.observe(totalGauge, total); + res.observe(busyGauge, busy); + res.observe(freeGauge, free); + }, + [totalGauge, busyGauge, freeGauge] + ); +} + const SemanticEnvResources = { ENV_ID: "$trigger.env.id", ENV_TYPE: "$trigger.env.type", @@ -300,3 +390,33 @@ function parseInternalTraceHeaders(): Record | undefined { return; } } + +function parseInternalMetricsHeaders(): Record | undefined { + try { + return env.INTERNAL_OTEL_METRIC_EXPORTER_AUTH_HEADERS + ? 
(JSON.parse(env.INTERNAL_OTEL_METRIC_EXPORTER_AUTH_HEADERS) as Record) + : undefined; + } catch { + return; + } +} + +function createMetricsExporter() { + if (env.INTERNAL_OTEL_METRIC_EXPORTER_URL) { + const headers = parseInternalMetricsHeaders() ?? {}; + + console.log( + `🔦 Tracer: OTLP metric exporter enabled to ${ + env.INTERNAL_OTEL_METRIC_EXPORTER_URL + } with headers: ${Object.keys(headers)}` + ); + + return new OTLPMetricExporter({ + url: env.INTERNAL_OTEL_METRIC_EXPORTER_URL, + timeoutMillis: 30_000, + headers, + }); + } else { + return new ConsoleMetricExporter(); + } +} diff --git a/apps/webapp/package.json b/apps/webapp/package.json index 126590f175..9d4fb449c6 100644 --- a/apps/webapp/package.json +++ b/apps/webapp/package.json @@ -61,12 +61,14 @@ "@opentelemetry/core": "1.25.1", "@opentelemetry/exporter-logs-otlp-http": "0.52.1", "@opentelemetry/exporter-trace-otlp-http": "0.52.1", + "@opentelemetry/exporter-metrics-otlp-proto": "0.52.1", "@opentelemetry/instrumentation": "0.52.1", "@opentelemetry/instrumentation-express": "^0.36.1", "@opentelemetry/instrumentation-http": "0.52.1", "@opentelemetry/resources": "1.25.1", "@opentelemetry/sdk-logs": "0.52.1", "@opentelemetry/sdk-node": "0.52.1", + "@opentelemetry/sdk-metrics": "1.25.1", "@opentelemetry/sdk-trace-base": "1.25.1", "@opentelemetry/sdk-trace-node": "1.25.1", "@opentelemetry/semantic-conventions": "1.25.1", diff --git a/apps/webapp/test/engine/triggerTask.test.ts b/apps/webapp/test/engine/triggerTask.test.ts index 7b1804578d..ed7e02df6d 100644 --- a/apps/webapp/test/engine/triggerTask.test.ts +++ b/apps/webapp/test/engine/triggerTask.test.ts @@ -32,6 +32,7 @@ import { ValidationResult, } from "~/runEngine/types"; import { RunEngineTriggerTaskService } from "../../app/runEngine/services/triggerTask.server"; +import { setTimeout } from "node:timers/promises"; vi.setConfig({ testTimeout: 30_000 }); // 30 seconds timeout @@ -490,6 +491,8 @@ describe("RunEngineTriggerTaskService", () => { }, queue: 
{ redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 100, }, runLock: { redis: redisOptions, @@ -569,11 +572,12 @@ describe("RunEngineTriggerTaskService", () => { ); expect(queueLength).toBe(1); + await setTimeout(500); + // Now we need to dequeue the run so so we can trigger a subtask - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: result?.run.masterQueue!, - maxRunCount: 1, + workerQueue: result?.run.workerQueue!, }); expect(dequeued.length).toBe(1); @@ -606,11 +610,12 @@ describe("RunEngineTriggerTaskService", () => { }, }); + await setTimeout(500); + // Okay, now lets dequeue the subtask - const dequeuedSubtask = await engine.dequeueFromMasterQueue({ + const dequeuedSubtask = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: subtaskResult?.run.masterQueue!, - maxRunCount: 1, + workerQueue: subtaskResult?.run.workerQueue!, }); expect(dequeuedSubtask.length).toBe(1); @@ -649,6 +654,8 @@ describe("RunEngineTriggerTaskService", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 100, }, runLock: { redis: redisOptions, @@ -722,11 +729,12 @@ describe("RunEngineTriggerTaskService", () => { expect(parentResult?.run.queue).toBe(`task/${taskIdentifier1}`); expect(parentResult?.run.lockedQueueId).toBeDefined(); + await setTimeout(500); + // Dequeue the parent run to simulate it running - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentResult?.run.masterQueue!, - maxRunCount: 1, + workerQueue: parentResult?.run.workerQueue!, }); expect(dequeued.length).toBe(1); @@ -923,11 +931,12 @@ describe("RunEngineTriggerTaskService", () => { expect(parentResult?.run.queue).toBe(`task/${taskIdentifier1}`); 
expect(parentResult?.run.lockedQueueId).toBeDefined(); + await setTimeout(500); + // Dequeue the parent run to simulate it running - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentResult?.run.masterQueue!, - maxRunCount: 1, + workerQueue: parentResult?.run.workerQueue!, }); expect(dequeued.length).toBe(1); @@ -980,6 +989,8 @@ describe("RunEngineTriggerTaskService", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 100, }, runLock: { redis: redisOptions, @@ -1053,11 +1064,12 @@ describe("RunEngineTriggerTaskService", () => { expect(parentResult?.run.queue).toBe(`task/${taskIdentifier1}`); expect(parentResult?.run.lockedQueueId).toBeDefined(); + await setTimeout(500); + // Dequeue the parent run to simulate it running - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentResult?.run.masterQueue!, - maxRunCount: 1, + workerQueue: parentResult?.run.workerQueue!, }); expect(dequeued.length).toBe(1); @@ -1109,6 +1121,8 @@ describe("RunEngineTriggerTaskService", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 100, }, runLock: { redis: redisOptions, @@ -1186,11 +1200,12 @@ describe("RunEngineTriggerTaskService", () => { ); expect(queueLength).toBe(1); + await setTimeout(500); + // Now we need to dequeue the run so so we can trigger a subtask - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: result?.run.masterQueue!, - maxRunCount: 1, + workerQueue: result?.run.workerQueue!, }); expect(dequeued.length).toBe(1); @@ -1223,11 +1238,12 @@ describe("RunEngineTriggerTaskService", () => { }, }); + await setTimeout(500); + // Okay, now lets 
dequeue the subtask - const dequeuedSubtask = await engine.dequeueFromMasterQueue({ + const dequeuedSubtask = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: subtaskResult?.run.masterQueue!, - maxRunCount: 1, + workerQueue: subtaskResult?.run.workerQueue!, }); expect(dequeuedSubtask.length).toBe(1); diff --git a/internal-packages/database/prisma/schema.prisma b/internal-packages/database/prisma/schema.prisma index 016bb7ca64..934a72a737 100644 --- a/internal-packages/database/prisma/schema.prisma +++ b/internal-packages/database/prisma/schema.prisma @@ -1764,7 +1764,8 @@ model TaskRun { lockedQueueId String? /// The main queue that this run is part of - masterQueue String @default("main") + workerQueue String @default("main") @map("masterQueue") + /// @deprecated secondaryMasterQueue String? /// From engine v2+ this will be defined after a run has been dequeued (starting at 1) diff --git a/internal-packages/run-engine/src/engine/index.ts b/internal-packages/run-engine/src/engine/index.ts index 5aad070707..620eac3f64 100644 --- a/internal-packages/run-engine/src/engine/index.ts +++ b/internal-packages/run-engine/src/engine/index.ts @@ -1,5 +1,5 @@ import { createRedisClient, Redis } from "@internal/redis"; -import { startSpan, trace, Tracer } from "@internal/tracing"; +import { getMeter, Meter, startSpan, trace, Tracer } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { CheckpointInput, @@ -7,12 +7,11 @@ import { CreateCheckpointResult, DequeuedMessage, ExecutionResult, - MachineResources, RunExecutionData, StartRunAttemptResult, TaskRunExecutionResult, } from "@trigger.dev/core/v3"; -import { BatchId, RunId, WaitpointId } from "@trigger.dev/core/v3/isomorphic"; +import { RunId, WaitpointId } from "@trigger.dev/core/v3/isomorphic"; import { Prisma, PrismaClient, @@ -37,12 +36,13 @@ import { DelayedRunSystem } from "./systems/delayedRunSystem.js"; import { DequeueSystem } from 
"./systems/dequeueSystem.js"; import { EnqueueSystem } from "./systems/enqueueSystem.js"; import { + executionDataFromSnapshot, ExecutionSnapshotSystem, - getLatestExecutionSnapshot, getExecutionSnapshotsSince, - executionDataFromSnapshot, + getLatestExecutionSnapshot, } from "./systems/executionSnapshotSystem.js"; import { PendingVersionSystem } from "./systems/pendingVersionSystem.js"; +import { RaceSimulationSystem } from "./systems/raceSimulationSystem.js"; import { ReleaseConcurrencySystem } from "./systems/releaseConcurrencySystem.js"; import { RunAttemptSystem } from "./systems/runAttemptSystem.js"; import { SystemResources } from "./systems/systems.js"; @@ -50,14 +50,14 @@ import { TtlSystem } from "./systems/ttlSystem.js"; import { WaitpointSystem } from "./systems/waitpointSystem.js"; import { EngineWorker, HeartbeatTimeouts, RunEngineOptions, TriggerParams } from "./types.js"; import { workerCatalog } from "./workerCatalog.js"; -import { RaceSimulationSystem } from "./systems/raceSimulationSystem.js"; export class RunEngine { private runLockRedis: Redis; private runLock: RunLocker; private worker: EngineWorker; - private logger = new Logger("RunEngine", "debug"); + private logger: Logger; private tracer: Tracer; + private meter: Meter; private heartbeatTimeouts: HeartbeatTimeouts; prisma: PrismaClient; @@ -77,6 +77,7 @@ export class RunEngine { raceSimulationSystem: RaceSimulationSystem = new RaceSimulationSystem(); constructor(private readonly options: RunEngineOptions) { + this.logger = options.logger ?? new Logger("RunEngine", this.options.logLevel ?? "info"); this.prisma = options.prisma; this.runLockRedis = createRedisClient( { @@ -96,6 +97,7 @@ export class RunEngine { redis: this.runLockRedis, logger: this.logger, tracer: trace.getTracer("RunLocker"), + meter: options.meter, }); const keys = new RunQueueFullKeyProducer(); @@ -110,13 +112,26 @@ export class RunEngine { defaultEnvConcurrencyLimit: options.queue?.defaultEnvConcurrency ?? 
10, }), defaultEnvConcurrency: options.queue?.defaultEnvConcurrency ?? 10, - logger: new Logger("RunQueue", "debug"), + logger: new Logger("RunQueue", this.options.logLevel ?? "info"), redis: { ...options.queue.redis, keyPrefix: `${options.queue.redis.keyPrefix}runqueue:` }, retryOptions: options.queue?.retryOptions, + workerOptions: { + disabled: options.worker.disabled, + concurrency: options.worker, + pollIntervalMs: options.worker.pollIntervalMs, + immediatePollIntervalMs: options.worker.immediatePollIntervalMs, + shutdownTimeoutMs: options.worker.shutdownTimeoutMs, + }, + shardCount: options.queue?.shardCount, + masterQueueConsumersDisabled: options.queue?.masterQueueConsumersDisabled, + masterQueueConsumersIntervalMs: options.queue?.masterQueueConsumersIntervalMs, + processWorkerQueueDebounceMs: options.queue?.processWorkerQueueDebounceMs, + dequeueBlockingTimeoutSeconds: options.queue?.dequeueBlockingTimeoutSeconds, + meter: options.meter, }); this.worker = new Worker({ - name: "worker", + name: "run-engine-worker", redisOptions: { ...options.worker.redis, keyPrefix: `${options.worker.redis.keyPrefix}worker:`, @@ -176,6 +191,7 @@ export class RunEngine { } this.tracer = options.tracer; + this.meter = options.meter ?? getMeter("run-engine"); const defaultHeartbeatTimeouts: HeartbeatTimeouts = { PENDING_EXECUTING: 60_000, @@ -195,6 +211,7 @@ export class RunEngine { eventBus: this.eventBus, logger: this.logger, tracer: this.tracer, + meter: this.meter, runLock: this.runLock, runQueue: this.runQueue, raceSimulationSystem: this.raceSimulationSystem, @@ -228,6 +245,7 @@ export class RunEngine { pollInterval: options.releaseConcurrency?.pollInterval ?? 1000, batchSize: options.releaseConcurrency?.batchSize ?? 
10, tracer: this.tracer, + meter: this.meter, }, }); @@ -318,7 +336,7 @@ export class RunEngine { sdkVersion, cliVersion, concurrencyKey, - masterQueue, + workerQueue, queue, lockedQueueId, isTest, @@ -359,22 +377,6 @@ export class RunEngine { async (span) => { const status = delayUntil ? "DELAYED" : "PENDING"; - let secondaryMasterQueue: string | undefined = undefined; - - if (environment.type === "DEVELOPMENT") { - // In dev we use the environment id as the master queue, or the locked worker id - masterQueue = this.#environmentMasterQueueKey(environment.id); - if (lockedToVersionId) { - masterQueue = this.#backgroundWorkerQueueKey(lockedToVersionId); - } - } else { - // For deployed runs, we add the env/worker id as the secondary master queue - secondaryMasterQueue = this.#environmentMasterQueueKey(environment.id); - if (lockedToVersionId) { - secondaryMasterQueue = this.#backgroundWorkerQueueKey(lockedToVersionId); - } - } - //create run let taskRun: TaskRun; try { @@ -406,8 +408,7 @@ export class RunEngine { concurrencyKey, queue, lockedQueueId, - masterQueue, - secondaryMasterQueue, + workerQueue, isTest, delayUntil, queuedAt, @@ -557,45 +558,46 @@ export class RunEngine { /** * Gets a fairly selected run from the specified master queue, returning the information required to run it. 
* @param consumerId: The consumer that is pulling, allows multiple consumers to pull from the same queue - * @param masterQueue: The shared queue to pull from, can be an individual environment (for dev) + * @param workerQueue: The worker queue to pull from, can be an individual environment (for dev) * @returns */ - async dequeueFromMasterQueue({ + async dequeueFromWorkerQueue({ consumerId, - masterQueue, - maxRunCount, - maxResources, + workerQueue, backgroundWorkerId, workerId, runnerId, tx, }: { consumerId: string; - masterQueue: string; - maxRunCount: number; - maxResources?: MachineResources; + workerQueue: string; backgroundWorkerId?: string; workerId?: string; runnerId?: string; tx?: PrismaClientOrTransaction; }): Promise { - return this.dequeueSystem.dequeueFromMasterQueue({ + // We only do this with "prod" worker queues because we don't want to observe dev (e.g. environment) worker queues + this.runQueue.registerObservableWorkerQueue(workerQueue); + + const dequeuedMessage = await this.dequeueSystem.dequeueFromWorkerQueue({ consumerId, - masterQueue, - maxRunCount, - maxResources, + workerQueue, backgroundWorkerId, workerId, runnerId, tx, }); + + if (!dequeuedMessage) { + return []; + } + + return [dequeuedMessage]; } - async dequeueFromEnvironmentMasterQueue({ + async dequeueFromEnvironmentWorkerQueue({ consumerId, environmentId, - maxRunCount, - maxResources, backgroundWorkerId, workerId, runnerId, @@ -603,47 +605,14 @@ export class RunEngine { }: { consumerId: string; environmentId: string; - maxRunCount: number; - maxResources?: MachineResources; backgroundWorkerId?: string; workerId?: string; runnerId?: string; tx?: PrismaClientOrTransaction; }): Promise { - return this.dequeueFromMasterQueue({ + return this.dequeueFromWorkerQueue({ consumerId, - masterQueue: this.#environmentMasterQueueKey(environmentId), - maxRunCount, - maxResources, - backgroundWorkerId, - workerId, - runnerId, - tx, - }); - } - - async dequeueFromBackgroundWorkerMasterQueue({ - 
consumerId, - backgroundWorkerId, - maxRunCount, - maxResources, - workerId, - runnerId, - tx, - }: { - consumerId: string; - backgroundWorkerId: string; - maxRunCount: number; - maxResources?: MachineResources; - workerId?: string; - runnerId?: string; - tx?: PrismaClientOrTransaction; - }): Promise { - return this.dequeueFromMasterQueue({ - consumerId, - masterQueue: this.#backgroundWorkerQueueKey(backgroundWorkerId), - maxRunCount, - maxResources, + workerQueue: environmentId, backgroundWorkerId, workerId, runnerId, @@ -779,16 +748,16 @@ export class RunEngine { } async removeEnvironmentQueuesFromMasterQueue({ - masterQueue, + runtimeEnvironmentId, organizationId, projectId, }: { - masterQueue: string; + runtimeEnvironmentId: string; organizationId: string; projectId: string; }) { return this.runQueue.removeEnvironmentQueuesFromMasterQueue( - masterQueue, + runtimeEnvironmentId, organizationId, projectId ); @@ -1128,6 +1097,39 @@ export class RunEngine { return this.raceSimulationSystem.registerRacepointForRun({ runId, waitInterval }); } + async migrateLegacyMasterQueues() { + const workerGroups = await this.prisma.workerInstanceGroup.findMany({ + where: { + type: "MANAGED", + }, + select: { + id: true, + name: true, + masterQueue: true, + }, + }); + + this.logger.info("Migrating legacy master queues", { + workerGroups, + }); + + for (const workerGroup of workerGroups) { + this.logger.info("Migrating legacy master queue", { + workerGroupId: workerGroup.id, + workerGroupName: workerGroup.name, + workerGroupMasterQueue: workerGroup.masterQueue, + }); + + await this.runQueue.migrateLegacyMasterQueue(workerGroup.masterQueue); + + this.logger.info("Migrated legacy master queue", { + workerGroupId: workerGroup.id, + workerGroupName: workerGroup.name, + workerGroupMasterQueue: workerGroup.masterQueue, + }); + } + } + async quit() { try { //stop the run queue @@ -1318,12 +1320,4 @@ export class RunEngine { } }); } - - #environmentMasterQueueKey(environmentId: string) { 
- return `master-env:${environmentId}`; - } - - #backgroundWorkerQueueKey(backgroundWorkerId: string) { - return `master-background-worker:${backgroundWorkerId}`; - } } diff --git a/internal-packages/run-engine/src/engine/locking.ts b/internal-packages/run-engine/src/engine/locking.ts index c8e790204f..1e120a3feb 100644 --- a/internal-packages/run-engine/src/engine/locking.ts +++ b/internal-packages/run-engine/src/engine/locking.ts @@ -5,11 +5,27 @@ import { Redis } from "@internal/redis"; import * as redlock from "redlock"; import { tryCatch } from "@trigger.dev/core"; import { Logger } from "@trigger.dev/core/logger"; -import { startSpan, Tracer } from "@internal/tracing"; +import { + startSpan, + Tracer, + Meter, + getMeter, + ValueType, + ObservableResult, + Attributes, + Histogram, +} from "@internal/tracing"; + +const SemanticAttributes = { + LOCK_TYPE: "run_engine.lock.type", + LOCK_RESOURCES: "run_engine.lock.resources", + LOCK_SUCCESS: "run_engine.lock.success", +}; interface LockContext { resources: string; signal: redlock.RedlockAbortSignal; + lockType: string; } export class RunLocker { @@ -17,8 +33,11 @@ export class RunLocker { private asyncLocalStorage: AsyncLocalStorage; private logger: Logger; private tracer: Tracer; + private meter: Meter; + private activeLocks: Map = new Map(); + private lockDurationHistogram: Histogram; - constructor(options: { redis: Redis; logger: Logger; tracer: Tracer }) { + constructor(options: { redis: Redis; logger: Logger; tracer: Tracer; meter?: Meter }) { this.redlock = new Redlock([options.redis], { driftFactor: 0.01, retryCount: 10, @@ -29,6 +48,39 @@ export class RunLocker { this.asyncLocalStorage = new AsyncLocalStorage(); this.logger = options.logger; this.tracer = options.tracer; + this.meter = options.meter ?? 
getMeter("run-engine"); + + const activeLocksObservableGauge = this.meter.createObservableGauge("run_engine.locks.active", { + description: "The number of active locks by type", + unit: "locks", + valueType: ValueType.INT, + }); + + const lockDurationHistogram = this.meter.createHistogram("run_engine.lock.duration", { + description: "The duration of lock operations", + unit: "ms", + valueType: ValueType.DOUBLE, + }); + + activeLocksObservableGauge.addCallback(this.#updateActiveLocksCount.bind(this)); + this.lockDurationHistogram = lockDurationHistogram; + } + + async #updateActiveLocksCount(observableResult: ObservableResult) { + // Group active locks by type + const lockCountsByType = new Map(); + + for (const [_, lockInfo] of this.activeLocks) { + const count = lockCountsByType.get(lockInfo.lockType) || 0; + lockCountsByType.set(lockInfo.lockType, count + 1); + } + + // Report metrics for each lock type + for (const [lockType, count] of lockCountsByType) { + observableResult.observe(count, { + [SemanticAttributes.LOCK_TYPE]: lockType, + }); + } } /** Locks resources using RedLock. It won't lock again if we're already inside a lock with the same resources. 
*/ @@ -54,17 +106,53 @@ export class RunLocker { span.setAttribute("nested", false); // Different resources or not in a lock, proceed with new lock + const lockId = `${name}:${joinedResources}:${Date.now()}`; + const lockStartTime = performance.now(); + const [error, result] = await tryCatch( this.redlock.using(resources, duration, async (signal) => { - const newContext: LockContext = { resources: joinedResources, signal }; + const newContext: LockContext = { + resources: joinedResources, + signal, + lockType: name, + }; - return this.asyncLocalStorage.run(newContext, async () => { - return routine(signal); + // Track active lock + this.activeLocks.set(lockId, { + lockType: name, + resources: resources, }); + + let lockSuccess = true; + try { + return this.asyncLocalStorage.run(newContext, async () => { + return routine(signal); + }); + } catch (lockError) { + lockSuccess = false; + throw lockError; + } finally { + // Record lock duration + const lockDuration = performance.now() - lockStartTime; + this.lockDurationHistogram.record(lockDuration, { + [SemanticAttributes.LOCK_TYPE]: name, + [SemanticAttributes.LOCK_SUCCESS]: lockSuccess.toString(), + }); + + // Remove from active locks when done + this.activeLocks.delete(lockId); + } }) ); if (error) { + // Record failed lock acquisition + const lockDuration = performance.now() - lockStartTime; + this.lockDurationHistogram.record(lockDuration, { + [SemanticAttributes.LOCK_TYPE]: name, + [SemanticAttributes.LOCK_SUCCESS]: "false", + }); + this.logger.error("[RunLocker] Error locking resources", { error, resources, duration }); throw error; } diff --git a/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts b/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts index 6c146d4baa..64fc1911f6 100644 --- a/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts +++ b/internal-packages/run-engine/src/engine/releaseConcurrencyTokenBucketQueue.ts @@ -1,5 
+1,13 @@ import { Callback, createRedisClient, Redis, Result, type RedisOptions } from "@internal/redis"; -import { startSpan, Tracer } from "@internal/tracing"; +import { + startSpan, + Tracer, + Meter, + getMeter, + ValueType, + ObservableResult, + Attributes, +} from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { z } from "zod"; import { setInterval } from "node:timers/promises"; @@ -39,6 +47,7 @@ export type ReleaseConcurrencyQueueOptions = { consumersCount?: number; masterQueuesKey?: string; tracer?: Tracer; + meter?: Meter; logger?: Logger; pollInterval?: number; batchSize?: number; @@ -56,6 +65,7 @@ type QueueItemMetadata = z.infer; export class ReleaseConcurrencyTokenBucketQueue { private redis: Redis; private logger: Logger; + private meter: Meter; private abortController: AbortController; private consumers: ReleaseConcurrencyQueueConsumer[]; private sweeper?: ReleaseConcurrencyReleasingsSweeper; @@ -74,6 +84,7 @@ export class ReleaseConcurrencyTokenBucketQueue { this.redis = createRedisClient(options.redis); this.keyPrefix = options.redis.keyPrefix ?? "re2:release-concurrency-queue:"; this.logger = options.logger ?? new Logger("ReleaseConcurrencyQueue", "debug"); + this.meter = options.meter ?? getMeter("release-concurrency"); this.abortController = new AbortController(); this.consumers = []; @@ -90,11 +101,32 @@ export class ReleaseConcurrencyTokenBucketQueue { factor: options.retry?.backoff?.factor ?? 
2, }; + // Set up OpenTelemetry metrics + const releasingsLengthGauge = this.meter.createObservableGauge( + "release_concurrency.releasings.length", + { + description: "Number of items in the releasings sorted set", + unit: "items", + valueType: ValueType.INT, + } + ); + + const masterQueueLengthGauge = this.meter.createObservableGauge( + "release_concurrency.master_queue.length", + { + description: "Number of items in the master queue sorted set", + unit: "items", + valueType: ValueType.INT, + } + ); + + releasingsLengthGauge.addCallback(this.#updateReleasingsLength.bind(this)); + masterQueueLengthGauge.addCallback(this.#updateMasterQueueLength.bind(this)); + this.#registerCommands(); if (!options.disableConsumers) { this.#startConsumers(); - this.#startMetricsProducer(); this.#startReleasingsSweeper(); } } @@ -104,6 +136,16 @@ export class ReleaseConcurrencyTokenBucketQueue { await this.redis.quit(); } + async #updateReleasingsLength(observableResult: ObservableResult) { + const releasingsLength = await this.redis.zcard(this.#releasingsKey()); + observableResult.observe(releasingsLength); + } + + async #updateMasterQueueLength(observableResult: ObservableResult) { + const masterQueueLength = await this.redis.zcard(this.masterQueuesKey); + observableResult.observe(masterQueueLength); + } + /** * Attempt to release concurrency for a run. 
* @@ -489,30 +531,6 @@ export class ReleaseConcurrencyTokenBucketQueue { } } - async #startMetricsProducer() { - try { - // Produce metrics every 60 seconds, using a tracer span - for await (const _ of setInterval(60_000)) { - const metrics = await this.getQueueMetrics(); - this.logger.info("Queue metrics:", { metrics }); - - await startSpan( - this.options.tracer, - "ReleaseConcurrencyTokenBucketQueue.metrics", - async (span) => {}, - { - attributes: { - ...flattenAttributes(metrics, "queues"), - forceRecording: true, - }, - } - ); - } - } catch (error) { - this.logger.error("Error starting metrics producer:", { error }); - } - } - #calculateBackoffScore(item: QueueItemMetadata): string { const delay = Math.min( this.backoff.maxDelay, diff --git a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts index 8ac46d5e89..0bfcf44c62 100644 --- a/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/dequeueSystem.ts @@ -1,17 +1,17 @@ import { startSpan } from "@internal/tracing"; import { assertExhaustive } from "@trigger.dev/core"; -import { DequeuedMessage, MachineResources, RetryOptions } from "@trigger.dev/core/v3"; -import { getMaxDuration, sanitizeQueueName } from "@trigger.dev/core/v3/isomorphic"; +import { DequeuedMessage, RetryOptions } from "@trigger.dev/core/v3"; +import { getMaxDuration } from "@trigger.dev/core/v3/isomorphic"; import { PrismaClientOrTransaction } from "@trigger.dev/database"; import { getRunWithBackgroundWorkerTasks } from "../db/worker.js"; +import { sendNotificationToWorker } from "../eventBus.js"; import { getMachinePreset } from "../machinePresets.js"; import { isDequeueableExecutionStatus } from "../statuses.js"; import { RunEngineOptions } from "../types.js"; import { ExecutionSnapshotSystem, getLatestExecutionSnapshot } from "./executionSnapshotSystem.js"; +import { ReleaseConcurrencySystem } 
from "./releaseConcurrencySystem.js"; import { RunAttemptSystem } from "./runAttemptSystem.js"; import { SystemResources } from "./systems.js"; -import { sendNotificationToWorker } from "../eventBus.js"; -import { ReleaseConcurrencySystem } from "./releaseConcurrencySystem.js"; export type DequeueSystemOptions = { resources: SystemResources; @@ -35,534 +35,490 @@ export class DequeueSystem { } /** - * Gets a fairly selected run from the specified master queue, returning the information required to run it. + * Gets a fairly selected run from the specified worker queue, returning the information required to run it. * @param consumerId: The consumer that is pulling, allows multiple consumers to pull from the same queue - * @param masterQueue: The shared queue to pull from, can be an individual environment (for dev) + * @param workerQueue: The worker queue to pull from, can be an individual environment (for dev) * @returns */ - async dequeueFromMasterQueue({ + async dequeueFromWorkerQueue({ consumerId, - masterQueue, - maxRunCount, - maxResources, + workerQueue, backgroundWorkerId, workerId, runnerId, tx, }: { consumerId: string; - masterQueue: string; - maxRunCount: number; - maxResources?: MachineResources; + workerQueue: string; backgroundWorkerId?: string; workerId?: string; runnerId?: string; tx?: PrismaClientOrTransaction; - }): Promise { + }): Promise { const prisma = tx ?? 
this.$.prisma; return startSpan( this.$.tracer, - "dequeueFromMasterQueue", + "dequeueFromWorkerQueue", async (span) => { //gets multiple runs from the queue - const messages = await this.$.runQueue.dequeueMessageFromMasterQueue( + const message = await this.$.runQueue.dequeueMessageFromWorkerQueue( consumerId, - masterQueue, - maxRunCount + workerQueue ); - if (messages.length === 0) { - return []; + if (!message) { + return; } - //we can't send more than the max resources - const consumedResources: MachineResources = { - cpu: 0, - memory: 0, - }; - - const dequeuedRuns: DequeuedMessage[] = []; - - for (const message of messages) { - const orgId = message.message.orgId; - const runId = message.messageId; - - span.setAttribute("runId", runId); - - //lock the run so nothing else can modify it - try { - const dequeuedRun = await this.$.runLock.lock( - "dequeueFromMasterQueue", - [runId], - 5000, - async (signal) => { - const snapshot = await getLatestExecutionSnapshot(prisma, runId); - - if (!isDequeueableExecutionStatus(snapshot.executionStatus)) { - // If it's pending executing it will be picked up by the stalled system if there's an issue - if (snapshot.executionStatus === "PENDING_EXECUTING") { - this.$.logger.error( - "RunEngine.dequeueFromMasterQueue(): Run is already PENDING_EXECUTING, removing from queue", - { - runId, - orgId, - } - ); - // remove the run from the queue - await this.$.runQueue.acknowledgeMessage(orgId, runId); - return null; - } + const orgId = message.message.orgId; + const runId = message.messageId; + + span.setAttribute("runId", runId); + + //lock the run so nothing else can modify it + try { + const dequeuedRun = await this.$.runLock.lock( + "dequeueFromWorkerQueue", + [runId], + 5000, + async (signal) => { + const snapshot = await getLatestExecutionSnapshot(prisma, runId); + + if (!isDequeueableExecutionStatus(snapshot.executionStatus)) { + // If it's pending executing it will be picked up by the stalled system if there's an issue + if 
(snapshot.executionStatus === "PENDING_EXECUTING") { + this.$.logger.error( + "RunEngine.dequeueFromWorkerQueue(): Run is already PENDING_EXECUTING, removing from queue", + { + runId, + orgId, + } + ); + // remove the run from the queue + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return; + } + + //create a failed snapshot + await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run: { + id: snapshot.runId, + status: snapshot.runStatus, + }, + snapshot: { + executionStatus: snapshot.executionStatus, + description: + "Tried to dequeue a run that is not in a valid state to be dequeued.", + }, + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + checkpointId: snapshot.checkpointId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints, + error: `Tried to dequeue a run that is not in a valid state to be dequeued.`, + workerId, + runnerId, + }); + + //todo is there a way to recover this, so the run can be retried? + //for example should we update the status to a dequeuable status and nack it?
+ //then at least it has a chance of succeeding and we have the error log above + await this.runAttemptSystem.systemFailure({ + runId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_DEQUEUED_INVALID_STATE", + message: `Task was in the ${snapshot.executionStatus} state when it was dequeued for execution.`, + }, + tx: prisma, + }); + this.$.logger.error( + `RunEngine.dequeueFromWorkerQueue(): Run is not in a valid state to be dequeued: ${runId}\n ${snapshot.id}:${snapshot.executionStatus}` + ); + + return; + } - //create a failed snapshot - await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + if (snapshot.executionStatus === "QUEUED_EXECUTING") { + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { run: { - id: snapshot.runId, + id: runId, status: snapshot.runStatus, + attemptNumber: snapshot.attemptNumber, }, snapshot: { - executionStatus: snapshot.executionStatus, - description: - "Tried to dequeue a run that is not in a valid state to be dequeued.", + executionStatus: "EXECUTING", + description: "Run was continued, whilst still executing.", }, previousSnapshotId: snapshot.id, environmentId: snapshot.environmentId, environmentType: snapshot.environmentType, projectId: snapshot.projectId, organizationId: snapshot.organizationId, - checkpointId: snapshot.checkpointId ?? undefined, - completedWaitpoints: snapshot.completedWaitpoints, - error: `Tried to dequeue a run that is not in a valid state to be dequeued.`, - workerId, - runnerId, - }); + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints.map((waitpoint) => ({ + id: waitpoint.id, + index: waitpoint.index, + })), + } + ); - //todo is there a way to recover this, so the run can be retried? - //for example should we update the status to a dequeuable status and nack it? 
- //then at least it has a chance of succeeding and we have the error log above - await this.runAttemptSystem.systemFailure({ - runId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_DEQUEUED_INVALID_STATE", - message: `Task was in the ${snapshot.executionStatus} state when it was dequeued for execution.`, - }, - tx: prisma, - }); - this.$.logger.error( - `RunEngine.dequeueFromMasterQueue(): Run is not in a valid state to be dequeued: ${runId}\n ${snapshot.id}:${snapshot.executionStatus}` + if (snapshot.previousSnapshotId) { + await this.releaseConcurrencySystem.refillTokensForSnapshot( + snapshot.previousSnapshotId ); - return null; } - if (snapshot.executionStatus === "QUEUED_EXECUTING") { - const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( - prisma, - { - run: { - id: runId, - status: snapshot.runStatus, - attemptNumber: snapshot.attemptNumber, - }, - snapshot: { - executionStatus: "EXECUTING", - description: "Run was continued, whilst still executing.", - }, - previousSnapshotId: snapshot.id, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - projectId: snapshot.projectId, - organizationId: snapshot.organizationId, - batchId: snapshot.batchId ?? 
undefined, - completedWaitpoints: snapshot.completedWaitpoints.map((waitpoint) => ({ - id: waitpoint.id, - index: waitpoint.index, - })), - } - ); - - if (snapshot.previousSnapshotId) { - await this.releaseConcurrencySystem.refillTokensForSnapshot( - snapshot.previousSnapshotId - ); - } - - await sendNotificationToWorker({ - runId, - snapshot: newSnapshot, - eventBus: this.$.eventBus, - }); + await sendNotificationToWorker({ + runId, + snapshot: newSnapshot, + eventBus: this.$.eventBus, + }); - return null; - } + return; + } - const result = await getRunWithBackgroundWorkerTasks( - prisma, - runId, - backgroundWorkerId - ); + const result = await getRunWithBackgroundWorkerTasks( + prisma, + runId, + backgroundWorkerId + ); - if (!result.success) { - switch (result.code) { - case "NO_RUN": { - //this should not happen, the run is unrecoverable so we'll ack it - this.$.logger.error("RunEngine.dequeueFromMasterQueue(): No run found", { - runId, - latestSnapshot: snapshot.id, - }); - await this.$.runQueue.acknowledgeMessage(orgId, runId); - return null; - } - case "RUN_ENVIRONMENT_ARCHIVED": { - //this happens if the preview branch was archived - this.$.logger.warn( - "RunEngine.dequeueFromMasterQueue(): Run environment archived", - { - runId, - latestSnapshot: snapshot.id, - result, - } - ); - await this.$.runQueue.acknowledgeMessage(orgId, runId); - return null; - } - case "NO_WORKER": - case "TASK_NEVER_REGISTERED": - case "QUEUE_NOT_FOUND": - case "TASK_NOT_IN_LATEST": { - this.$.logger.warn(`RunEngine.dequeueFromMasterQueue(): ${result.code}`, { + if (!result.success) { + switch (result.code) { + case "NO_RUN": { + //this should not happen, the run is unrecoverable so we'll ack it + this.$.logger.error("RunEngine.dequeueFromWorkerQueue(): No run found", { + runId, + latestSnapshot: snapshot.id, + }); + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return; + } + case "RUN_ENVIRONMENT_ARCHIVED": { + //this happens if the preview branch was archived + 
this.$.logger.warn( + "RunEngine.dequeueFromWorkerQueue(): Run environment archived", + { runId, latestSnapshot: snapshot.id, result, - }); - - //not deployed yet, so we'll wait for the deploy - await this.#pendingVersion({ - orgId, - runId, - reason: result.message, - statusReason: result.code, - tx: prisma, - }); - return null; - } - case "BACKGROUND_WORKER_MISMATCH": { - this.$.logger.warn( - "RunEngine.dequeueFromMasterQueue(): Background worker mismatch", - { - runId, - latestSnapshot: snapshot.id, - result, - } - ); - - //worker mismatch so put it back in the queue - await this.$.runQueue.nackMessage({ orgId, messageId: runId }); - - return null; - } - default: { - assertExhaustive(result); - } + } + ); + await this.$.runQueue.acknowledgeMessage(orgId, runId); + return; } - } - - //check for a valid deployment if it's not a development environment - if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") { - if (!result.deployment || !result.deployment.imageReference) { - this.$.logger.warn("RunEngine.dequeueFromMasterQueue(): No deployment found", { + case "NO_WORKER": + case "TASK_NEVER_REGISTERED": + case "QUEUE_NOT_FOUND": + case "TASK_NOT_IN_LATEST": { + this.$.logger.warn(`RunEngine.dequeueFromWorkerQueue(): ${result.code}`, { runId, latestSnapshot: snapshot.id, result, }); + //not deployed yet, so we'll wait for the deploy await this.#pendingVersion({ orgId, runId, - reason: "No deployment or deployment image reference found for deployed run", - statusReason: "NO_DEPLOYMENT", + reason: result.message, + statusReason: result.code, tx: prisma, }); - - return null; + return; } - } - - const machinePreset = getMachinePreset({ - machines: this.options.machines.machines, - defaultMachine: this.options.machines.defaultMachine, - config: result.task.machineConfig ?? {}, - run: result.run, - }); - - //increment the consumed resources - consumedResources.cpu += machinePreset.cpu; - consumedResources.memory += machinePreset.memory; - - //are we under the limit? 
- if (maxResources) { - if ( - consumedResources.cpu > maxResources.cpu || - consumedResources.memory > maxResources.memory - ) { - this.$.logger.debug( - "RunEngine.dequeueFromMasterQueue(): Consumed resources over limit, nacking", + case "BACKGROUND_WORKER_MISMATCH": { + this.$.logger.warn( + "RunEngine.dequeueFromWorkerQueue(): Background worker mismatch", { runId, - consumedResources, - maxResources, + latestSnapshot: snapshot.id, + result, } ); - //put it back in the queue where it was - await this.$.runQueue.nackMessage({ - orgId, - messageId: runId, - incrementAttemptCount: false, - retryAt: result.run.createdAt.getTime() - result.run.priorityMs, - }); - return null; + //worker mismatch so put it back in the queue + await this.$.runQueue.nackMessage({ orgId, messageId: runId }); + + return; + } + default: { + assertExhaustive(result); } } + } + + //check for a valid deployment if it's not a development environment + if (result.run.runtimeEnvironment.type !== "DEVELOPMENT") { + if (!result.deployment || !result.deployment.imageReference) { + this.$.logger.warn("RunEngine.dequeueFromWorkerQueue(): No deployment found", { + runId, + latestSnapshot: snapshot.id, + result, + }); + //not deployed yet, so we'll wait for the deploy + await this.#pendingVersion({ + orgId, + runId, + reason: "No deployment or deployment image reference found for deployed run", + statusReason: "NO_DEPLOYMENT", + tx: prisma, + }); - // Check max attempts that can optionally be set when triggering a run - let maxAttempts: number | null | undefined = result.run.maxAttempts; + return; + } + } - // If it's not set, we'll grab it from the task's retry config - if (!maxAttempts) { - const retryConfig = result.task.retryConfig; + const machinePreset = getMachinePreset({ + machines: this.options.machines.machines, + defaultMachine: this.options.machines.defaultMachine, + config: result.task.machineConfig ?? 
{}, + run: result.run, + }); - this.$.logger.debug( - "RunEngine.dequeueFromMasterQueue(): maxAttempts not set, using task's retry config", - { - runId, - task: result.task.id, - rawRetryConfig: retryConfig, - } - ); + // Check max attempts that can optionally be set when triggering a run + let maxAttempts: number | null | undefined = result.run.maxAttempts; - const parsedConfig = RetryOptions.nullable().safeParse(retryConfig); + // If it's not set, we'll grab it from the task's retry config + if (!maxAttempts) { + const retryConfig = result.task.retryConfig; - if (!parsedConfig.success) { - this.$.logger.error( - "RunEngine.dequeueFromMasterQueue(): Invalid retry config", - { - runId, - task: result.task.id, - rawRetryConfig: retryConfig, - } - ); + this.$.logger.debug( + "RunEngine.dequeueFromWorkerQueue(): maxAttempts not set, using task's retry config", + { + runId, + task: result.task.id, + rawRetryConfig: retryConfig, } + ); + + const parsedConfig = RetryOptions.nullable().safeParse(retryConfig); - maxAttempts = parsedConfig.data?.maxAttempts; + if (!parsedConfig.success) { + this.$.logger.error("RunEngine.dequeueFromWorkerQueue(): Invalid retry config", { + runId, + task: result.task.id, + rawRetryConfig: retryConfig, + }); } - //update the run - const lockedAt = new Date(); - const startedAt = result.run.startedAt ?? lockedAt; - const maxDurationInSeconds = getMaxDuration( - result.run.maxDurationInSeconds, - result.task.maxDurationInSeconds - ); - const lockedTaskRun = await prisma.taskRun.update({ - where: { - id: runId, - }, - data: { - lockedAt, - lockedById: result.task.id, - lockedToVersionId: result.worker.id, - lockedQueueId: result.queue.id, - startedAt, - baseCostInCents: this.options.machines.baseCostInCents, - machinePreset: machinePreset.name, - taskVersion: result.worker.version, - sdkVersion: result.worker.sdkVersion, - cliVersion: result.worker.cliVersion, - maxDurationInSeconds, - maxAttempts: maxAttempts ?? 
undefined, - }, - include: { - runtimeEnvironment: true, - tags: true, - }, - }); + maxAttempts = parsedConfig.data?.maxAttempts; + } + //update the run + const lockedAt = new Date(); + const startedAt = result.run.startedAt ?? lockedAt; + const maxDurationInSeconds = getMaxDuration( + result.run.maxDurationInSeconds, + result.task.maxDurationInSeconds + ); - this.$.eventBus.emit("runLocked", { - time: new Date(), - run: { - id: runId, - status: lockedTaskRun.status, - lockedAt, - lockedById: result.task.id, - lockedToVersionId: result.worker.id, - lockedQueueId: result.queue.id, - startedAt, - baseCostInCents: this.options.machines.baseCostInCents, - machinePreset: machinePreset.name, - taskVersion: result.worker.version, - sdkVersion: result.worker.sdkVersion, - cliVersion: result.worker.cliVersion, - maxDurationInSeconds: lockedTaskRun.maxDurationInSeconds ?? undefined, - maxAttempts: lockedTaskRun.maxAttempts ?? undefined, - updatedAt: lockedTaskRun.updatedAt, - createdAt: lockedTaskRun.createdAt, - }, - organization: { - id: orgId, - }, - project: { - id: lockedTaskRun.projectId, - }, - environment: { - id: lockedTaskRun.runtimeEnvironmentId, - }, + const lockedTaskRun = await prisma.taskRun.update({ + where: { + id: runId, + }, + data: { + lockedAt, + lockedById: result.task.id, + lockedToVersionId: result.worker.id, + lockedQueueId: result.queue.id, + startedAt, + baseCostInCents: this.options.machines.baseCostInCents, + machinePreset: machinePreset.name, + taskVersion: result.worker.version, + sdkVersion: result.worker.sdkVersion, + cliVersion: result.worker.cliVersion, + maxDurationInSeconds, + maxAttempts: maxAttempts ?? 
undefined, + }, + include: { + runtimeEnvironment: true, + tags: true, + }, + }); + + this.$.eventBus.emit("runLocked", { + time: new Date(), + run: { + id: runId, + status: lockedTaskRun.status, + lockedAt, + lockedById: result.task.id, + lockedToVersionId: result.worker.id, + lockedQueueId: result.queue.id, + startedAt, + baseCostInCents: this.options.machines.baseCostInCents, + machinePreset: machinePreset.name, + taskVersion: result.worker.version, + sdkVersion: result.worker.sdkVersion, + cliVersion: result.worker.cliVersion, + maxDurationInSeconds: lockedTaskRun.maxDurationInSeconds ?? undefined, + maxAttempts: lockedTaskRun.maxAttempts ?? undefined, + updatedAt: lockedTaskRun.updatedAt, + createdAt: lockedTaskRun.createdAt, + }, + organization: { + id: orgId, + }, + project: { + id: lockedTaskRun.projectId, + }, + environment: { + id: lockedTaskRun.runtimeEnvironmentId, + }, + }); + + if (!lockedTaskRun) { + this.$.logger.error("RunEngine.dequeueFromWorkerQueue(): Failed to lock task run", { + taskRun: result.run.id, + taskIdentifier: result.run.taskIdentifier, + deployment: result.deployment?.id, + worker: result.worker.id, + task: result.task.id, + runId, }); - if (!lockedTaskRun) { - this.$.logger.error( - "RunEngine.dequeueFromMasterQueue(): Failed to lock task run", - { - taskRun: result.run.id, - taskIdentifier: result.run.taskIdentifier, - deployment: result.deployment?.id, - worker: result.worker.id, - task: result.task.id, - runId, - } - ); - - await this.$.runQueue.acknowledgeMessage(orgId, runId); - return null; - } + await this.$.runQueue.acknowledgeMessage(orgId, runId); - const currentAttemptNumber = lockedTaskRun.attemptNumber ?? 
0; - const nextAttemptNumber = currentAttemptNumber + 1; + return; + } - const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( - prisma, - { - run: { - id: runId, - status: snapshot.runStatus, - attemptNumber: lockedTaskRun.attemptNumber, - }, - snapshot: { - executionStatus: "PENDING_EXECUTING", - description: "Run was dequeued for execution", - }, - previousSnapshotId: snapshot.id, - environmentId: snapshot.environmentId, - environmentType: snapshot.environmentType, - projectId: snapshot.projectId, - organizationId: snapshot.organizationId, - checkpointId: snapshot.checkpointId ?? undefined, - batchId: snapshot.batchId ?? undefined, - completedWaitpoints: snapshot.completedWaitpoints, - workerId, - runnerId, - } - ); + const currentAttemptNumber = lockedTaskRun.attemptNumber ?? 0; + const nextAttemptNumber = currentAttemptNumber + 1; - return { - version: "1" as const, - dequeuedAt: new Date(), - snapshot: { - id: newSnapshot.id, - friendlyId: newSnapshot.friendlyId, - executionStatus: newSnapshot.executionStatus, - description: newSnapshot.description, - createdAt: newSnapshot.createdAt, - }, - image: result.deployment?.imageReference ?? undefined, - checkpoint: newSnapshot.checkpoint ?? 
undefined, - completedWaitpoints: snapshot.completedWaitpoints, - backgroundWorker: { - id: result.worker.id, - friendlyId: result.worker.friendlyId, - version: result.worker.version, - }, - deployment: { - id: result.deployment?.id, - friendlyId: result.deployment?.friendlyId, - imagePlatform: result.deployment?.imagePlatform, - }, + const newSnapshot = await this.executionSnapshotSystem.createExecutionSnapshot( + prisma, + { run: { - id: lockedTaskRun.id, - friendlyId: lockedTaskRun.friendlyId, - isTest: lockedTaskRun.isTest, - machine: machinePreset, - attemptNumber: nextAttemptNumber, - masterQueue: lockedTaskRun.masterQueue, - traceContext: lockedTaskRun.traceContext as Record, - }, - environment: { - id: lockedTaskRun.runtimeEnvironment.id, - type: lockedTaskRun.runtimeEnvironment.type, - }, - organization: { - id: orgId, + id: runId, + status: snapshot.runStatus, + attemptNumber: lockedTaskRun.attemptNumber, }, - project: { - id: lockedTaskRun.projectId, + snapshot: { + executionStatus: "PENDING_EXECUTING", + description: "Run was dequeued for execution", }, - } satisfies DequeuedMessage; - } - ); + previousSnapshotId: snapshot.id, + environmentId: snapshot.environmentId, + environmentType: snapshot.environmentType, + projectId: snapshot.projectId, + organizationId: snapshot.organizationId, + checkpointId: snapshot.checkpointId ?? undefined, + batchId: snapshot.batchId ?? undefined, + completedWaitpoints: snapshot.completedWaitpoints, + workerId, + runnerId, + } + ); - if (dequeuedRun !== null) { - dequeuedRuns.push(dequeuedRun); + return { + version: "1" as const, + dequeuedAt: new Date(), + snapshot: { + id: newSnapshot.id, + friendlyId: newSnapshot.friendlyId, + executionStatus: newSnapshot.executionStatus, + description: newSnapshot.description, + createdAt: newSnapshot.createdAt, + }, + image: result.deployment?.imageReference ?? undefined, + checkpoint: newSnapshot.checkpoint ?? 
undefined, + completedWaitpoints: snapshot.completedWaitpoints, + backgroundWorker: { + id: result.worker.id, + friendlyId: result.worker.friendlyId, + version: result.worker.version, + }, + deployment: { + id: result.deployment?.id, + friendlyId: result.deployment?.friendlyId, + imagePlatform: result.deployment?.imagePlatform, + }, + run: { + id: lockedTaskRun.id, + friendlyId: lockedTaskRun.friendlyId, + isTest: lockedTaskRun.isTest, + machine: machinePreset, + attemptNumber: nextAttemptNumber, + // Keeping this for backwards compatibility, but really this should be called workerQueue + masterQueue: lockedTaskRun.workerQueue, + traceContext: lockedTaskRun.traceContext as Record, + }, + environment: { + id: lockedTaskRun.runtimeEnvironment.id, + type: lockedTaskRun.runtimeEnvironment.type, + }, + organization: { + id: orgId, + }, + project: { + id: lockedTaskRun.projectId, + }, + } satisfies DequeuedMessage; + } + ); + + return dequeuedRun; + } catch (error) { + this.$.logger.error( + "RunEngine.dequeueFromWorkerQueue(): Thrown error while preparing run to be run", + { + error, + runId, } - } catch (error) { + ); + + const run = await prisma.taskRun.findFirst({ + where: { id: runId }, + include: { + runtimeEnvironment: true, + }, + }); + + if (!run) { + //this isn't ideal because we're not creating a snapshot… but we can't do much else this.$.logger.error( - "RunEngine.dequeueFromMasterQueue(): Thrown error while preparing run to be run", + "RunEngine.dequeueFromWorkerQueue(): Thrown error, then run not found. Nacking.", { - error, runId, + orgId, } ); + await this.$.runQueue.nackMessage({ orgId, messageId: runId }); - const run = await prisma.taskRun.findFirst({ - where: { id: runId }, - include: { - runtimeEnvironment: true, - }, - }); + return; + } - if (!run) { - //this isn't ideal because we're not creating a snapshot… but we can't do much else - this.$.logger.error( - "RunEngine.dequeueFromMasterQueue(): Thrown error, then run not found. 
Nacking.", - { - runId, - orgId, - } - ); - await this.$.runQueue.nackMessage({ orgId, messageId: runId }); - continue; - } + //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ) + const gotRequeued = await this.runAttemptSystem.tryNackAndRequeue({ + run, + environment: run.runtimeEnvironment, + orgId, + projectId: run.runtimeEnvironment.projectId, + error: { + type: "INTERNAL_ERROR", + code: "TASK_RUN_DEQUEUED_MAX_RETRIES", + message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, + }, + tx: prisma, + }); - //this is an unknown error, we'll reattempt (with auto-backoff and eventually DLQ) - const gotRequeued = await this.runAttemptSystem.tryNackAndRequeue({ - run, - environment: run.runtimeEnvironment, + if (!gotRequeued) { + this.$.logger.error("RunEngine.dequeueFromWorkerQueue(): Failed to requeue run", { + runId, orgId, - projectId: run.runtimeEnvironment.projectId, - error: { - type: "INTERNAL_ERROR", - code: "TASK_RUN_DEQUEUED_MAX_RETRIES", - message: `We tried to dequeue the run the maximum number of times but it wouldn't start executing`, - }, - tx: prisma, }); - //we don't need this, but it makes it clear we're in a loop here - continue; } } - return dequeuedRuns; + return; }, { - attributes: { consumerId, masterQueue }, + attributes: { consumerId, workerQueue }, } ); } @@ -586,84 +542,84 @@ export class DequeueSystem { }) { const prisma = tx ?? 
this.$.prisma; - return startSpan( - this.$.tracer, - "#pendingVersion", - async (span) => { - return this.$.runLock.lock("pendingVersion", [runId], 5_000, async (signal) => { - //mark run as waiting for deploy - const run = await prisma.taskRun.update({ - where: { id: runId }, - data: { - status: "PENDING_VERSION", - statusReason, - }, + this.$.logger.debug("RunEngine.dequeueFromWorkerQueue(): Pending version", { + runId, + reason, + statusReason, + }); + + return this.$.runLock.lock("pendingVersion", [runId], 5_000, async (signal) => { + this.$.logger.debug("RunEngine.dequeueFromWorkerQueue(): Pending version lock acquired", { + runId, + reason, + statusReason, + }); + + //mark run as waiting for deploy + const run = await prisma.taskRun.update({ + where: { id: runId }, + data: { + status: "PENDING_VERSION", + statusReason, + }, + select: { + id: true, + status: true, + attemptNumber: true, + updatedAt: true, + createdAt: true, + runtimeEnvironment: { select: { id: true, - status: true, - attemptNumber: true, - updatedAt: true, - createdAt: true, - runtimeEnvironment: { - select: { - id: true, - type: true, - projectId: true, - project: { select: { id: true, organizationId: true } }, - }, - }, - }, - }); - - this.$.logger.debug("RunEngine.dequeueFromMasterQueue(): Pending version", { - runId, - run, - }); - - await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "RUN_CREATED", - description: - reason ?? 
- "The run doesn't have a background worker, so we're going to ack it for now.", - }, - environmentId: run.runtimeEnvironment.id, - environmentType: run.runtimeEnvironment.type, - projectId: run.runtimeEnvironment.projectId, - organizationId: run.runtimeEnvironment.project.organizationId, - workerId, - runnerId, - }); - - //we ack because when it's deployed it will be requeued - await this.$.runQueue.acknowledgeMessage(orgId, runId); - - this.$.eventBus.emit("runStatusChanged", { - time: new Date(), - run: { - id: runId, - status: run.status, - updatedAt: run.updatedAt, - createdAt: run.createdAt, - }, - organization: { - id: run.runtimeEnvironment.project.organizationId, - }, - project: { - id: run.runtimeEnvironment.projectId, + type: true, + projectId: true, + project: { select: { id: true, organizationId: true } }, }, - environment: { - id: run.runtimeEnvironment.id, - }, - }); - }); - }, - { - attributes: { - runId, + }, }, - } - ); + }); + + this.$.logger.debug("RunEngine.dequeueFromWorkerQueue(): Pending version", { + runId, + run, + }); + + await this.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "RUN_CREATED", + description: + reason ?? 
"The run doesn't have a background worker, so we're going to ack it for now.", + }, + environmentId: run.runtimeEnvironment.id, + environmentType: run.runtimeEnvironment.type, + projectId: run.runtimeEnvironment.projectId, + organizationId: run.runtimeEnvironment.project.organizationId, + workerId, + runnerId, + }); + + //we ack because when it's deployed it will be requeued + await this.$.runQueue.acknowledgeMessage(orgId, runId); + + this.$.eventBus.emit("runStatusChanged", { + time: new Date(), + run: { + id: runId, + status: run.status, + updatedAt: run.updatedAt, + createdAt: run.createdAt, + }, + organization: { + id: run.runtimeEnvironment.project.organizationId, + }, + project: { + id: run.runtimeEnvironment.projectId, + }, + environment: { + id: run.runtimeEnvironment.id, + }, + }); + }); } } diff --git a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts index 086431821a..1f5383fe0a 100644 --- a/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/enqueueSystem.ts @@ -74,16 +74,14 @@ export class EnqueueSystem { runnerId, }); - const masterQueues = [run.masterQueue]; - if (run.secondaryMasterQueue) { - masterQueues.push(run.secondaryMasterQueue); - } + // Force development runs to use the environment id as the worker queue. + const workerQueue = env.type === "DEVELOPMENT" ? env.id : run.workerQueue; const timestamp = (run.queueTimestamp ?? 
run.createdAt).getTime() - run.priorityMs; await this.$.runQueue.enqueueMessage({ env, - masterQueues, + workerQueue, message: { runId: run.id, taskIdentifier: run.taskIdentifier, diff --git a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts index 687b5aa028..e7aec0202d 100644 --- a/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/runAttemptSystem.ts @@ -1050,7 +1050,9 @@ export class RunAttemptSystem { } //remove it from the queue and release concurrency - await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); + await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId, { + removeFromWorkerQueue: true, + }); await this.releaseConcurrencySystem.refillTokensForSnapshot(latestSnapshot); @@ -1233,7 +1235,9 @@ export class RunAttemptSystem { throw new ServiceValidationError("No associated waitpoint found", 400); } - await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId); + await this.$.runQueue.acknowledgeMessage(run.runtimeEnvironment.organizationId, runId, { + removeFromWorkerQueue: true, + }); await this.waitpointSystem.completeWaitpoint({ id: run.associatedWaitpoint.id, diff --git a/internal-packages/run-engine/src/engine/systems/systems.ts b/internal-packages/run-engine/src/engine/systems/systems.ts index e20f347b93..35790259b4 100644 --- a/internal-packages/run-engine/src/engine/systems/systems.ts +++ b/internal-packages/run-engine/src/engine/systems/systems.ts @@ -1,4 +1,4 @@ -import { Tracer } from "@internal/tracing"; +import { Meter, Tracer } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { PrismaClient } from "@trigger.dev/database"; import { RunQueue } from "../../run-queue/index.js"; @@ -13,6 +13,7 @@ export type SystemResources = { eventBus: EventBus; logger: Logger; 
tracer: Tracer; + meter: Meter; runLock: RunLocker; runQueue: RunQueue; raceSimulationSystem: RaceSimulationSystem; diff --git a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts index e9dc5e143b..5ab957c989 100644 --- a/internal-packages/run-engine/src/engine/systems/ttlSystem.ts +++ b/internal-packages/run-engine/src/engine/systems/ttlSystem.ts @@ -106,7 +106,13 @@ export class TtlSystem { }, }); - await this.$.runQueue.acknowledgeMessage(updatedRun.runtimeEnvironment.organizationId, runId); + await this.$.runQueue.acknowledgeMessage( + updatedRun.runtimeEnvironment.organizationId, + runId, + { + removeFromWorkerQueue: true, + } + ); if (!updatedRun.associatedWaitpoint) { throw new ServiceValidationError("No associated waitpoint found", 400); diff --git a/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts b/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts index 7bc5122f7e..2e71d92815 100644 --- a/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts +++ b/internal-packages/run-engine/src/engine/tests/attemptFailures.test.ts @@ -20,6 +20,8 @@ describe("RunEngine attempt failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -62,7 +64,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -71,10 +73,10 @@ describe("RunEngine attempt failures", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -173,6 +175,8 @@ describe("RunEngine attempt 
failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -213,7 +217,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -222,10 +226,10 @@ describe("RunEngine attempt failures", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -284,6 +288,8 @@ describe("RunEngine attempt failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -324,7 +330,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -333,10 +339,10 @@ describe("RunEngine attempt failures", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -393,6 +399,8 @@ describe("RunEngine attempt failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -431,7 +439,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -440,10 +448,10 @@ describe("RunEngine attempt failures", 
() => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -500,6 +508,8 @@ describe("RunEngine attempt failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -548,7 +558,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -557,10 +567,10 @@ describe("RunEngine attempt failures", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -657,6 +667,8 @@ describe("RunEngine attempt failures", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -707,7 +719,7 @@ describe("RunEngine attempt failures", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -716,10 +728,10 @@ describe("RunEngine attempt failures", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create first attempt @@ -758,14 +770,11 @@ describe("RunEngine attempt failures", () => { expect(executionData.run.attemptNumber).toBe(1); 
expect(executionData.run.status).toBe("RETRYING_AFTER_FAILURE"); - //wait for 1s - await setTimeout(5_000); + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id); - //dequeue again - const dequeued2 = await engine.dequeueFromMasterQueue({ + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued2.length).toBe(1); diff --git a/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts b/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts index 7b6626bcf9..a2936c3665 100644 --- a/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts +++ b/internal-packages/run-engine/src/engine/tests/batchTrigger.test.ts @@ -24,6 +24,8 @@ describe("RunEngine batchTrigger", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -73,7 +75,7 @@ describe("RunEngine batchTrigger", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -94,7 +96,7 @@ describe("RunEngine batchTrigger", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -116,13 +118,13 @@ describe("RunEngine batchTrigger", () => { expect(queueLength).toBe(2); //dequeue + await setTimeout(500); const dequeued: DequeuedMessage[] = []; for (let i = 0; i < 2; i++) { dequeued.push( - ...(await engine.dequeueFromMasterQueue({ + ...(await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 1, + workerQueue: "main", })) ); } diff --git a/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts index 
58ea7244ab..5811b081df 100644 --- a/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts +++ b/internal-packages/run-engine/src/engine/tests/batchTriggerAndWait.test.ts @@ -23,6 +23,8 @@ describe("RunEngine batchTriggerAndWait", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -70,7 +72,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], @@ -79,10 +81,10 @@ describe("RunEngine batchTriggerAndWait", () => { ); //dequeue parent - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -118,7 +120,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], @@ -145,7 +147,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t123456", spanId: "s123456", - masterQueue: "main", + workerQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], @@ -192,10 +194,10 @@ describe("RunEngine batchTriggerAndWait", () => { expect(batchWaitpoint?.waitpoint.completedByBatchId).toBe(batch.id); //dequeue and start the 1st child - const dequeuedChild = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: child1.masterQueue, - maxRunCount: 1, + workerQueue: "main", }); expect(dequeuedChild.length).toBe(1); @@ -256,13 +258,11 @@ describe("RunEngine batchTriggerAndWait", () => { 
expect(parentExecutionDataAfterFirstChildComplete.batch?.id).toBe(batch.id); expect(parentExecutionDataAfterFirstChildComplete.completedWaitpoints.length).toBe(0); - expect(await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment)).toBe(1); - //dequeue and start the 2nd child - const dequeuedChild2 = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedChild2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: child2.masterQueue, - maxRunCount: 1, + workerQueue: "main", }); expect(dequeuedChild2.length).toBe(1); @@ -372,6 +372,8 @@ describe("RunEngine batchTriggerAndWait", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -424,7 +426,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], @@ -433,10 +435,10 @@ describe("RunEngine batchTriggerAndWait", () => { ); //dequeue parent - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -474,7 +476,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${batchChildTask}`, isTest: false, tags: [], @@ -491,10 +493,10 @@ describe("RunEngine batchTriggerAndWait", () => { expect(parentAfterBatchChild.batch?.id).toBe(batch.id); //dequeue and start the batch child - const dequeuedBatchChild = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedBatchChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: batchChild.masterQueue, - 
maxRunCount: 1, + workerQueue: "main", }); expect(dequeuedBatchChild.length).toBe(1); @@ -552,7 +554,7 @@ describe("RunEngine batchTriggerAndWait", () => { traceContext: {}, traceId: "t123456", spanId: "s123456", - masterQueue: "main", + workerQueue: "main", queue: `task/${triggerAndWaitChildTask}`, isTest: false, tags: [], diff --git a/internal-packages/run-engine/src/engine/tests/cancelling.test.ts b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts index 91702faba7..5fb4fc345e 100644 --- a/internal-packages/run-engine/src/engine/tests/cancelling.test.ts +++ b/internal-packages/run-engine/src/engine/tests/cancelling.test.ts @@ -25,6 +25,8 @@ describe("RunEngine cancelling", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -64,7 +66,7 @@ describe("RunEngine cancelling", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], @@ -73,10 +75,10 @@ describe("RunEngine cancelling", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -99,7 +101,7 @@ describe("RunEngine cancelling", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], @@ -110,10 +112,10 @@ describe("RunEngine cancelling", () => { ); //dequeue the child run - const dequeuedChild = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: childRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //start the child run 
@@ -239,6 +241,8 @@ describe("RunEngine cancelling", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -277,7 +281,7 @@ describe("RunEngine cancelling", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], @@ -286,10 +290,10 @@ describe("RunEngine cancelling", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); let cancelledEventData: EventBusEventArgs<"runCancelled">[0][] = []; diff --git a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts index d9fcd5da8c..edd56a2b34 100644 --- a/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts +++ b/internal-packages/run-engine/src/engine/tests/checkpoints.test.ts @@ -24,6 +24,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -66,7 +68,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -75,11 +77,13 @@ describe("RunEngine checkpoints", () => { ); // Dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); // Create an attempt const attemptResult = 
await engine.startRunAttempt({ @@ -146,13 +150,12 @@ describe("RunEngine checkpoints", () => { await setTimeout(500); // Dequeue the run again - const dequeuedAgain = await engine.dequeueFromMasterQueue({ + const dequeuedAgain = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); - expect(dequeuedAgain.length).toBe(1); + assertNonNullable(dequeuedAgain[0]); // Continue execution from checkpoint const continueResult = await engine.continueRunExecution({ @@ -197,6 +200,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -239,7 +244,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -285,6 +290,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -325,7 +332,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -333,12 +340,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + // First checkpoint sequence - const dequeued1 = await engine.dequeueFromMasterQueue({ + const dequeued1 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued1.length).toBe(1); + assertNonNullable(dequeued1[0]); const attemptResult1 = await engine.startRunAttempt({ runId: dequeued1[0].run.id, @@ -382,11 +392,12 @@ describe("RunEngine checkpoints", () => { await setTimeout(500); // Dequeue again after 
waitpoint completion - const dequeued2 = await engine.dequeueFromMasterQueue({ + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued2.length).toBe(1); + assertNonNullable(dequeued2[0]); // Continue execution from first checkpoint const continueResult1 = await engine.continueRunExecution({ @@ -429,18 +440,16 @@ describe("RunEngine checkpoints", () => { id: waitpoint2.waitpoint.id, }); - await setTimeout(500); + await setTimeout(1000); // Dequeue again after second waitpoint completion - const dequeued3 = await engine.dequeueFromMasterQueue({ + const dequeued3 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); - expect(dequeued3.length).toBe(1); + assertNonNullable(dequeued3[0]); - // Verify latest checkpoint expect(dequeued3[0].checkpoint?.reason).toBe("CHECKPOINT_2"); expect(dequeued3[0].checkpoint?.location).toBe("location-2"); @@ -484,6 +493,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -526,7 +537,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -534,12 +545,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + // Dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); // Create an attempt const attemptResult = await engine.startRunAttempt({ @@ -630,6 +644,8 @@ describe("RunEngine 
checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -672,7 +688,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -680,12 +696,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + // Dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); // Create an attempt const attemptResult = await engine.startRunAttempt({ @@ -772,6 +791,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -819,7 +840,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345-first", spanId: "s12345-first", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -827,12 +848,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + // Dequeue and start the first run - const dequeuedFirst = await engine.dequeueFromMasterQueue({ + const dequeuedFirst = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: firstRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeuedFirst.length).toBe(1); + assertNonNullable(dequeuedFirst[0]); const firstAttempt = await engine.startRunAttempt({ runId: dequeuedFirst[0].run.id, @@ -873,7 +897,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345-second", spanId: "s12345-second", - masterQueue: "main", + workerQueue: "main", 
queue: "task/test-task", isTest: false, tags: [], @@ -881,12 +905,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + // Dequeue and start the second run - const dequeuedSecond = await engine.dequeueFromMasterQueue({ + const dequeuedSecond = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: secondRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); + expect(dequeuedSecond.length).toBe(1); + assertNonNullable(dequeuedSecond[0]); const secondAttempt = await engine.startRunAttempt({ runId: dequeuedSecond[0].run.id, @@ -958,18 +985,15 @@ describe("RunEngine checkpoints", () => { }, }); - await setTimeout(500); + await setTimeout(1000); // Verify the first run is back in the queue - const queuedRun = await engine.dequeueFromMasterQueue({ + const queuedRun = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: firstRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); - expect(queuedRun.length).toBe(1); - expect(queuedRun[0].run.id).toBe(firstRun.id); - expect(queuedRun[0].snapshot.executionStatus).toBe("PENDING_EXECUTING"); + assertNonNullable(queuedRun[0]); // Now we can continue the run const continueResult = await engine.continueRunExecution({ @@ -998,6 +1022,8 @@ describe("RunEngine checkpoints", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -1045,7 +1071,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], @@ -1053,12 +1079,15 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + //dequeue parent - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - 
maxRunCount: 10, + workerQueue: "main", }); + expect(dequeued.length).toBe(1); + assertNonNullable(dequeued[0]); //create an attempt const initialExecutionData = await engine.getRunExecutionData({ runId: parentRun.id }); @@ -1093,7 +1122,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], @@ -1120,7 +1149,7 @@ describe("RunEngine checkpoints", () => { traceContext: {}, traceId: "t123456", spanId: "s123456", - masterQueue: "main", + workerQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], @@ -1131,6 +1160,8 @@ describe("RunEngine checkpoints", () => { prisma ); + await setTimeout(500); + const parentAfterChild2 = await engine.getRunExecutionData({ runId: parentRun.id }); assertNonNullable(parentAfterChild2); expect(parentAfterChild2.snapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); @@ -1200,13 +1231,12 @@ describe("RunEngine checkpoints", () => { expect(executionData.checkpoint?.reason).toBe("TEST_CHECKPOINT"); //dequeue and start the 1st child - const dequeuedChild = await engine.dequeueFromMasterQueue({ + const dequeuedChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: child1.masterQueue, - maxRunCount: 1, + workerQueue: "main", }); - expect(dequeuedChild.length).toBe(1); + assertNonNullable(dequeuedChild[0]); const childAttempt1 = await engine.startRunAttempt({ runId: dequeuedChild[0].run.id, @@ -1262,16 +1292,15 @@ describe("RunEngine checkpoints", () => { expect(parentExecutionDataAfterFirstChildComplete.batch?.id).toBe(batch.id); expect(parentExecutionDataAfterFirstChildComplete.completedWaitpoints.length).toBe(0); - expect(await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment)).toBe(1); + await setTimeout(1000); //dequeue and start the 2nd child - const dequeuedChild2 = await engine.dequeueFromMasterQueue({ + const dequeuedChild2 = await 
engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: child2.masterQueue, - maxRunCount: 1, + workerQueue: "main", }); - expect(dequeuedChild2.length).toBe(1); + assertNonNullable(dequeuedChild2[0]); const childAttempt2 = await engine.startRunAttempt({ runId: child2.id, @@ -1323,16 +1352,15 @@ describe("RunEngine checkpoints", () => { expect(parentExecutionDataAfterSecondChildComplete.batch?.id).toBe(batch.id); expect(parentExecutionDataAfterSecondChildComplete.completedWaitpoints.length).toBe(3); + await setTimeout(500); + // Dequeue the run - const dequeuedParentAfterCheckpoint = await engine.dequeueFromMasterQueue({ + const dequeuedParentAfterCheckpoint = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); - expect(dequeuedParentAfterCheckpoint.length).toBe(1); - expect(dequeuedParentAfterCheckpoint[0].run.id).toBe(parentRun.id); - expect(dequeuedParentAfterCheckpoint[0].snapshot.executionStatus).toBe("PENDING_EXECUTING"); + assertNonNullable(dequeuedParentAfterCheckpoint[0]); // Create an attempt const parentResumed = await engine.continueRunExecution({ diff --git a/internal-packages/run-engine/src/engine/tests/delays.test.ts b/internal-packages/run-engine/src/engine/tests/delays.test.ts index cf131f55ad..162967e9e9 100644 --- a/internal-packages/run-engine/src/engine/tests/delays.test.ts +++ b/internal-packages/run-engine/src/engine/tests/delays.test.ts @@ -64,7 +64,7 @@ describe("RunEngine delays", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -146,7 +146,7 @@ describe("RunEngine delays", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -243,7 +243,7 @@ describe("RunEngine delays", () => { traceContext: {}, traceId: 
"t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -347,7 +347,7 @@ describe("RunEngine delays", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -384,10 +384,10 @@ describe("RunEngine delays", () => { expect(executionData3.run.status).toBe("CANCELED"); //attempt to dequeue - should get nothing - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued.length).toBe(0); diff --git a/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts b/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts index c0d269017f..8ea142f630 100644 --- a/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts +++ b/internal-packages/run-engine/src/engine/tests/dequeuing.test.ts @@ -1,12 +1,12 @@ import { containerTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; +import { DequeuedMessage } from "@trigger.dev/core/v3"; import { generateFriendlyId } from "@trigger.dev/core/v3/isomorphic"; import { PrismaClientOrTransaction } from "@trigger.dev/database"; import { expect } from "vitest"; import { MinimalAuthenticatedEnvironment } from "../../shared/index.js"; import { RunEngine } from "../index.js"; import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; -import { DequeuedMessage } from "@trigger.dev/core/v3"; vi.setConfig({ testTimeout: 60_000 }); @@ -24,6 +24,8 @@ describe("RunEngine dequeuing", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -59,18 +61,15 @@ describe("RunEngine dequeuing", () => { }); 
expect(runs.length).toBe(10); - //check the queue length - const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength).toBe(10); - //dequeue + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id, 5); + const dequeued: DequeuedMessage[] = []; for (let i = 0; i < 5; i++) { dequeued.push( - ...(await engine.dequeueFromMasterQueue({ + ...(await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 1, + workerQueue: "main", })) ); } @@ -80,99 +79,6 @@ describe("RunEngine dequeuing", () => { await engine.quit(); } }); - - //This will fail until we support dequeuing multiple runs from a single environment - containerTest.fails( - "Dequeues runs within machine constraints", - async ({ prisma, redisOptions }) => { - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0005, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - - try { - const taskIdentifier = "test-task"; - - //create background worker - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier, { - preset: "small-1x", - }); - - //trigger the runs - const runs = await triggerRuns({ - engine, - environment: authenticatedEnvironment, - taskIdentifier, - prisma, - count: 20, - }); - expect(runs.length).toBe(20); - - //check the queue length - const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength).toBe(20); - - //dequeue - const dequeued = await engine.dequeueFromMasterQueue({ - 
consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 5, - maxResources: { - cpu: 1.1, - memory: 3.8, - }, - }); - expect(dequeued.length).toBe(2); - - //check the queue length - const queueLength2 = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength2).toBe(18); - - const dequeued2 = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 10, - maxResources: { - cpu: 4.7, - memory: 3.0, - }, - }); - expect(dequeued2.length).toBe(6); - - //check the queue length - const queueLength3 = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength3).toBe(12); - } finally { - await engine.quit(); - } - } - ); }); async function triggerRuns({ @@ -202,7 +108,7 @@ async function triggerRuns({ traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], diff --git a/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts index 5f2ef0325c..9983415f51 100644 --- a/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts +++ b/internal-packages/run-engine/src/engine/tests/heartbeats.test.ts @@ -26,6 +26,8 @@ describe("RunEngine heartbeats", () => { retryOptions: { maxTimeoutInMs: 50, }, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -71,7 +73,7 @@ describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -79,11 +81,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: 
run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //expect it to be pending with 0 consecutiveFailures @@ -101,10 +104,9 @@ describe("RunEngine heartbeats", () => { await setTimeout(1_000); //have to dequeue again - const dequeued2 = await engine.dequeueFromMasterQueue({ + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued2.length).toBe(1); @@ -142,6 +144,8 @@ describe("RunEngine heartbeats", () => { minTimeoutInMs: 50, maxTimeoutInMs: 50, }, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -188,7 +192,7 @@ describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -196,11 +200,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //expect it to be pending @@ -216,10 +221,9 @@ describe("RunEngine heartbeats", () => { expect(executionData2.snapshot.executionStatus).toBe("QUEUED"); //have to dequeue again - const dequeued2 = await engine.dequeueFromMasterQueue({ + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued2.length).toBe(1); @@ -263,6 +267,8 @@ describe("RunEngine heartbeats", () => { minTimeoutInMs: 50, maxTimeoutInMs: 50, }, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -308,7 +314,7 @@ describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: 
"s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -316,11 +322,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -343,11 +350,12 @@ describe("RunEngine heartbeats", () => { assertNonNullable(executionData2); expect(executionData2.snapshot.executionStatus).toBe("QUEUED"); + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id); + //have to dequeue again - const dequeued2 = await engine.dequeueFromMasterQueue({ + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued2.length).toBe(1); @@ -392,6 +400,8 @@ describe("RunEngine heartbeats", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -437,7 +447,7 @@ describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -445,11 +455,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -494,6 +505,8 @@ describe("RunEngine heartbeats", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -539,7 +552,7 @@ 
describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -547,11 +560,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -637,6 +651,8 @@ describe("RunEngine heartbeats", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -682,7 +698,7 @@ describe("RunEngine heartbeats", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -690,11 +706,12 @@ describe("RunEngine heartbeats", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt diff --git a/internal-packages/run-engine/src/engine/tests/pendingVersion.test.ts b/internal-packages/run-engine/src/engine/tests/pendingVersion.test.ts index c3fa33eba1..65498e32ff 100644 --- a/internal-packages/run-engine/src/engine/tests/pendingVersion.test.ts +++ b/internal-packages/run-engine/src/engine/tests/pendingVersion.test.ts @@ -24,6 +24,8 @@ describe("RunEngine pending version", () => { }, queue: { redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, }, runLock: { redis: redisOptions, @@ -61,7 +63,6 @@ describe("RunEngine pending version", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: 
"main", queue: "task/test-task", isTest: false, tags: [], @@ -82,7 +83,6 @@ describe("RunEngine pending version", () => { traceContext: {}, traceId: "t12346", spanId: "s12346", - masterQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -100,15 +100,14 @@ describe("RunEngine pending version", () => { await setupBackgroundWorker(engine, authenticatedEnvironment, ["test-task-other"]); - //dequeuing should fail + await setTimeout(500); const dequeued: DequeuedMessage[] = []; for (let i = 0; i < 2; i++) { dequeued.push( - ...(await engine.dequeueFromMasterQueue({ + ...(await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 1, + workerQueue: "main", })) ); } @@ -139,7 +138,7 @@ describe("RunEngine pending version", () => { ); //it's async so we wait - await setTimeout(500); + await setTimeout(1000); //should now be queued const executionData3R1 = await engine.getRunExecutionData({ runId: run.id }); @@ -150,13 +149,6 @@ describe("RunEngine pending version", () => { expect(executionData3R2.snapshot.executionStatus).toBe("QUEUED"); expect(executionData3R1.run.status).toBe("PENDING"); expect(executionData3R2.run.status).toBe("PENDING"); - - //queue should be empty - const queueLength2 = await engine.runQueue.lengthOfQueue( - authenticatedEnvironment, - run.queue - ); - expect(queueLength2).toBe(2); } finally { await engine.quit(); } @@ -179,6 +171,8 @@ describe("RunEngine pending version", () => { }, queue: { redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, }, runLock: { redis: redisOptions, @@ -198,6 +192,7 @@ describe("RunEngine pending version", () => { //set this so we have to requeue the runs in two batches queueRunsWaitingForWorkerBatchSize: 1, tracer: trace.getTracer("test", "0.0.0"), + logLevel: "debug", }); try { @@ -218,7 +213,6 @@ describe("RunEngine pending version", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", 
queue: "custom-queue", isTest: false, tags: [], @@ -239,7 +233,6 @@ describe("RunEngine pending version", () => { traceContext: {}, traceId: "t12346", spanId: "s12346", - masterQueue: "main", queue: "custom-queue-2", isTest: false, tags: [], @@ -255,14 +248,21 @@ describe("RunEngine pending version", () => { expect(executionDataR1.snapshot.executionStatus).toBe("QUEUED"); expect(executionDataR2.snapshot.executionStatus).toBe("QUEUED"); + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id); + //dequeuing should fail - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued.length).toBe(0); + const dequeued2 = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); + expect(dequeued2.length).toBe(0); + //queue should be empty const queueLength = await engine.runQueue.lengthOfQueue( authenticatedEnvironment, @@ -293,7 +293,7 @@ describe("RunEngine pending version", () => { ); //it's async so we wait - await setTimeout(500); + await setTimeout(1000); //should now be queued const executionData3R1 = await engine.getRunExecutionData({ runId: run.id }); @@ -304,20 +304,6 @@ describe("RunEngine pending version", () => { expect(executionData3R2.snapshot.executionStatus).toBe("QUEUED"); expect(executionData3R1.run.status).toBe("PENDING"); expect(executionData3R2.run.status).toBe("PENDING"); - - // custom-queue should have 1 run - const queueLength2 = await engine.runQueue.lengthOfQueue( - authenticatedEnvironment, - "custom-queue" - ); - expect(queueLength2).toBe(1); - - // custom-queue-2 should have 1 run - const queueLength3 = await engine.runQueue.lengthOfQueue( - authenticatedEnvironment, - "custom-queue-2" - ); - expect(queueLength3).toBe(1); } finally { await engine.quit(); } diff --git 
a/internal-packages/run-engine/src/engine/tests/priority.test.ts b/internal-packages/run-engine/src/engine/tests/priority.test.ts index 6f31f9df7d..24bcce7dc4 100644 --- a/internal-packages/run-engine/src/engine/tests/priority.test.ts +++ b/internal-packages/run-engine/src/engine/tests/priority.test.ts @@ -27,6 +27,8 @@ describe("RunEngine priority", () => { }, queue: { redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, }, runLock: { redis: redisOptions, @@ -71,23 +73,31 @@ describe("RunEngine priority", () => { priorityMs: priority, })), }); + expect(runs.length).toBe(priorities.length); - //check the queue length - const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength).toBe(priorities.length); + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id, 5); //dequeue 4 times, in order const dequeue: DequeuedMessage[] = []; for (let i = 0; i < 4; i++) { - const items = await engine.dequeueFromMasterQueue({ + const items = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 1, + workerQueue: "main", }); dequeue.push(...items); } expect(dequeue.length).toBe(4); + + console.log( + "runs", + runs.map((r) => r.friendlyId) + ); + console.log( + "dequeued run IDs", + dequeue.map((d) => d.run.friendlyId) + ); + expect(dequeue[0].run.friendlyId).toBe(runs[4].friendlyId); expect(dequeue[1].run.friendlyId).toBe(runs[3].friendlyId); expect(dequeue[2].run.friendlyId).toBe(runs[1].friendlyId); @@ -95,10 +105,12 @@ describe("RunEngine priority", () => { //wait 2 seconds (because of the negative priority) await setTimeout(2_000); - const dequeue2 = await engine.dequeueFromMasterQueue({ + + await engine.runQueue.processMasterQueueForEnvironment(authenticatedEnvironment.id, 1); + + const dequeue2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 20, + 
workerQueue: "main", }); expect(dequeue2.length).toBe(1); expect(dequeue2[0].run.friendlyId).toBe(runs[2].friendlyId); @@ -175,18 +187,15 @@ describe("RunEngine priority", () => { }); expect(runs.length).toBe(queueTimestamps.length); - //check the queue length - const queueLength = await engine.runQueue.lengthOfEnvQueue(authenticatedEnvironment); - expect(queueLength).toBe(queueTimestamps.length); + await setTimeout(500); //dequeue (expect 4 items because of the negative priority) const dequeue: DequeuedMessage[] = []; for (let i = 0; i < 5; i++) { dequeue.push( - ...(await engine.dequeueFromMasterQueue({ + ...(await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: "main", - maxRunCount: 1, + workerQueue: "main", })) ); } @@ -235,7 +244,7 @@ async function triggerRuns({ traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], diff --git a/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts b/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts index 41a4fb7abc..d22387bcec 100644 --- a/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts +++ b/internal-packages/run-engine/src/engine/tests/releaseConcurrency.test.ts @@ -1,52 +1,22 @@ -import { assertNonNullable, containerTest } from "@internal/testcontainers"; -import { trace } from "@internal/tracing"; -import { RunEngine } from "../index.js"; +import { assertNonNullable } from "@internal/testcontainers"; import { setTimeout } from "node:timers/promises"; -import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; import { EventBusEventArgs } from "../eventBus.js"; +import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; +import { engineTest } from "./utils/engineTest.js"; vi.setConfig({ testTimeout: 60_000 }); describe("RunEngine Releasing Concurrency", () => { - 
containerTest("defaults to releasing env concurrency only", async ({ prisma, redisOptions }) => { + engineTest.scoped({ + engineOptions: { + queue: { masterQueueConsumersDisabled: true, processWorkerQueueDebounceMs: 50 }, + }, + }); + + engineTest("defaults to releasing env concurrency only", async ({ engine, prisma }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 1, - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); const taskIdentifier = "test-task"; await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); @@ -63,7 +33,7 @@ describe("RunEngine Releasing Concurrency", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], @@ -71,10 +41,11 @@ describe("RunEngine Releasing Concurrency", () => { prisma ); - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( @@ -195,415 +166,314 @@ describe("RunEngine Releasing Concurrency", () => { ); expect(queueConcurrencyAfterWaitpoint3).toBe(1); + + await engine.quit(); }); - containerTest( - "releases all concurrency when configured on 
queue", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + engineTest("releases all concurrency when configured on queue", async ({ engine, prisma }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 1, - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - const taskIdentifier = "test-task"; + const taskIdentifier = "test-task"; - await setupBackgroundWorker( - engine, - authenticatedEnvironment, + await setupBackgroundWorker( + engine, + authenticatedEnvironment, + taskIdentifier, + undefined, + undefined, + { + releaseConcurrencyOnWaitpoint: true, + } + ); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, taskIdentifier, - undefined, - undefined, - { - releaseConcurrencyOnWaitpoint: true, - } - ); + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_p1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", 
- masterQueue: "main", - queue: `task/${taskIdentifier}`, - isTest: false, - tags: [], - }, - prisma - ); + await setTimeout(500); - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); - const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueConcurrency).toBe(1); + expect(queueConcurrency).toBe(1); - const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrency).toBe(1); + expect(envConcurrency).toBe(1); - // create an attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); - // create a manual waitpoint - const result = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - // Block the run, not specifying any release concurrency option - const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ - runId: run.id, - 
waitpoints: result.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - }); + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); - expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - // Now confirm the queue has the same concurrency as before - const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueConcurrencyAfter).toBe(0); + expect(queueConcurrencyAfter).toBe(0); - // Now confirm the environment has a concurrency of 0 - const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Now confirm the environment has a concurrency of 0 + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrencyAfter).toBe(0); + expect(envConcurrencyAfter).toBe(0); - // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency - await engine.completeWaitpoint({ - id: result.waitpoint.id, - }); + // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); - await setTimeout(500); + await setTimeout(500); - // Test that we've reacquired the queue concurrency - const 
queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueConcurrencyAfterWaitpoint).toBe(1); + expect(queueConcurrencyAfterWaitpoint).toBe(1); - // Test that we've reacquired the environment concurrency - const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Test that we've reacquired the environment concurrency + const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrencyAfterWaitpoint).toBe(1); + expect(envConcurrencyAfterWaitpoint).toBe(1); - // Now we are going to block with another waitpoint, this time specifiying we dont want to release the concurrency in the waitpoint - const result2 = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + // Now we are going to block with another waitpoint, this time specifiying we dont want to release the concurrency in the waitpoint + const result2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result2.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - releaseConcurrency: false, - }); + const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: 
authenticatedEnvironment.organizationId, + releaseConcurrency: false, + }); - expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - // Test that we've not released the queue concurrency - const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + // Test that we've not released the queue concurrency + const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueConcurrencyAfterWaitpoint2).toBe(1); + expect(queueConcurrencyAfterWaitpoint2).toBe(1); - // Test that we've still released the environment concurrency since we always release env concurrency - const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Test that we've still released the environment concurrency since we always release env concurrency + const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrencyAfterWaitpoint2).toBe(0); - } - ); + expect(envConcurrencyAfterWaitpoint2).toBe(0); + }); - containerTest( - "releases all concurrency for unlimited queues", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + engineTest("releases all concurrency for unlimited queues", async ({ engine, prisma }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: 
redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 1, - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - const taskIdentifier = "test-task"; + const taskIdentifier = "test-task"; - await setupBackgroundWorker( - engine, - authenticatedEnvironment, - taskIdentifier, - undefined, - undefined, - { - releaseConcurrencyOnWaitpoint: true, - concurrencyLimit: null, - } - ); + await setupBackgroundWorker( + engine, + authenticatedEnvironment, + taskIdentifier, + undefined, + undefined, + { + releaseConcurrencyOnWaitpoint: true, + concurrencyLimit: null, + } + ); - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_p1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queue: `task/${taskIdentifier}`, - isTest: false, - tags: [], - }, - prisma - ); + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); + await setTimeout(500); - const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: 
"main", + }); - expect(queueConcurrency).toBe(1); + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrency).toBe(1); - expect(envConcurrency).toBe(1); + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - // create an attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); + expect(envConcurrency).toBe(1); - expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - // create a manual waitpoint - const result = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); - // Block the run, not specifying any release concurrency option - const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - }); + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: 
authenticatedEnvironment.organizationId, + }); - // Now confirm the queue has the same concurrency as before - const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - expect(queueConcurrencyAfter).toBe(0); + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - // Now confirm the environment has a concurrency of 0 - const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrencyAfter).toBe(0); - expect(envConcurrencyAfter).toBe(0); + // Now confirm the environment has a concurrency of 0 + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency - await engine.completeWaitpoint({ - id: result.waitpoint.id, - }); + expect(envConcurrencyAfter).toBe(0); - await setTimeout(500); + // Complete the waitpoint and make sure the run reacquires the queue and environment concurrency + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); - // Test that we've reacquired the queue concurrency - const queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + await setTimeout(500); - expect(queueConcurrencyAfterWaitpoint).toBe(1); + // Test that we've reacquired the queue concurrency + const queueConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - // Test that we've reacquired the environment concurrency - const envConcurrencyAfterWaitpoint = await 
engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrencyAfterWaitpoint).toBe(1); - expect(envConcurrencyAfterWaitpoint).toBe(1); + // Test that we've reacquired the environment concurrency + const envConcurrencyAfterWaitpoint = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - // Now we are going to block with another waitpoint, this time specifiying we dont want to release the concurrency in the waitpoint - const result2 = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + expect(envConcurrencyAfterWaitpoint).toBe(1); - const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result2.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - releaseConcurrency: false, - }); + // Now we are going to block with another waitpoint, this time specifiying we dont want to release the concurrency in the waitpoint + const result2 = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + const executingWithWaitpointSnapshot2 = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result2.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: false, + }); - // Test that we've not released the queue concurrency - const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + expect(executingWithWaitpointSnapshot2.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - expect(queueConcurrencyAfterWaitpoint2).toBe(1); + // Test 
that we've not released the queue concurrency + const queueConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - // Test that we've still released the environment concurrency since we always release env concurrency - const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrencyAfterWaitpoint2).toBe(1); - expect(envConcurrencyAfterWaitpoint2).toBe(0); - } - ); + // Test that we've still released the environment concurrency since we always release env concurrency + const envConcurrencyAfterWaitpoint2 = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - containerTest( - "delays env concurrency release when token unavailable", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + expect(envConcurrencyAfterWaitpoint2).toBe(0); + }); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, + describe("when maxTokenRation is 0.1", () => { + engineTest.scoped({ + engineOptions: { releaseConcurrency: { maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token maxRetries: 3, @@ -611,170 +481,141 @@ describe("RunEngine Releasing Concurrency", () => { pollInterval: 500, batchSize: 1, }, - tracer: trace.getTracer("test", "0.0.0"), - }); - const taskIdentifier = "test-task"; - - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + }, + }); - const run = await engine.trigger( - { - 
number: 1, - friendlyId: "run_p1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queue: `task/${taskIdentifier}`, - isTest: false, - tags: [], - }, - prisma - ); + engineTest( + "delays env concurrency release when token unavailable", + async ({ engine, prisma }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); + await setTimeout(500); - const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); - expect(queueConcurrency).toBe(1); + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrency).toBe(1); - expect(envConcurrency).toBe(1); + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - // create an attempt - const attemptResult = await 
engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); + expect(envConcurrency).toBe(1); - expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - // create a manual waitpoint - const result = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); - await engine.releaseConcurrencySystem.consumeToken( - { - orgId: authenticatedEnvironment.organizationId, + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }, - "test_12345" - ); + }); - // Block the run, not specifying any release concurrency option - const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - }); + await engine.releaseConcurrencySystem.consumeToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); - expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); - // Now confirm the queue has the same concurrency as before - const queueConcurrencyAfter = await 
engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - expect(queueConcurrencyAfter).toBe(1); + // Now confirm the queue has the same concurrency as before + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - // Now confirm the environment is the same as before - const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + expect(queueConcurrencyAfter).toBe(1); - expect(envConcurrencyAfter).toBe(1); + // Now confirm the environment is the same as before + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - // Now we return the token so the concurrency can be released - await engine.releaseConcurrencySystem.returnToken( - { - orgId: authenticatedEnvironment.organizationId, - projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }, - "test_12345" - ); + expect(envConcurrencyAfter).toBe(1); - // Wait until the token is released - await setTimeout(1_000); + // Now we return the token so the concurrency can be released + await engine.releaseConcurrencySystem.returnToken( + { + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }, + "test_12345" + ); - // Now the environment should have a concurrency of 0 - const envConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Wait until the token is released + await setTimeout(1_000); - expect(envConcurrencyAfterReturn).toBe(0); + // Now the environment should have a concurrency of 0 + const envConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + 
); - // and the queue should have a concurrency of 1 - const queueConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + expect(envConcurrencyAfterReturn).toBe(0); - expect(queueConcurrencyAfterReturn).toBe(1); - } - ); + // and the queue should have a concurrency of 1 + const queueConcurrencyAfterReturn = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); + + expect(queueConcurrencyAfterReturn).toBe(1); + } + ); - containerTest( - "delays env concurrency release after checkpoint", - async ({ prisma, redisOptions }) => { + engineTest("delays env concurrency release after checkpoint", async ({ prisma, engine }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); const taskIdentifier = "test-task"; await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); @@ -791,7 +632,7 @@ describe("RunEngine Releasing Concurrency", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], @@ -799,10 +640,11 @@ describe("RunEngine Releasing Concurrency", () => { prisma ); - const dequeued = await engine.dequeueFromMasterQueue({ + 
await setTimeout(500); + + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( @@ -912,51 +754,13 @@ describe("RunEngine Releasing Concurrency", () => { `task/${taskIdentifier}` ); - expect(queueConcurrencyAfterReturn).toBe(1); - } - ); - - containerTest( - "maintains concurrency after waitpoint completion", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); + expect(queueConcurrencyAfterReturn).toBe(1); + }); + + engineTest("maintains concurrency after waitpoint completion", async ({ engine, prisma }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + const taskIdentifier = "test-task"; await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); @@ -973,7 +777,7 @@ describe("RunEngine Releasing Concurrency", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], @@ -981,10 +785,11 @@ describe("RunEngine Releasing Concurrency", () => { prisma ); - const 
dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( @@ -1086,195 +891,126 @@ describe("RunEngine Releasing Concurrency", () => { ); expect(queueConcurrencyAfterReturn).toBe(1); - } - ); - - containerTest( - "refills token bucket after waitpoint completion when snapshot not in release queue", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + }); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, + engineTest( + "refills token bucket after waitpoint completion when snapshot not in release queue", + async ({ prisma, engine }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); + + const taskIdentifier = "test-task"; + + await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 0.1, // 10% of the concurrency limit = 1 token - maxRetries: 3, - consumersCount: 1, - 
pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); - const taskIdentifier = "test-task"; - - await setupBackgroundWorker(engine, authenticatedEnvironment, taskIdentifier); + prisma + ); - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_p1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queue: `task/${taskIdentifier}`, - isTest: false, - tags: [], - }, - prisma - ); + await setTimeout(500); - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); - const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueConcurrency).toBe(1); + expect(queueConcurrency).toBe(1); - const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrency).toBe(1); + expect(envConcurrency).toBe(1); - // create an attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); - // create a manual waitpoint - const 
result = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - // Block the run, not specifying any release concurrency option - const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - }); + // Block the run, not specifying any release concurrency option + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + }); - expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - // Now confirm the environment concurrency has been released - const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Now confirm the environment concurrency has been released + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(envConcurrencyAfter).toBe(0); + expect(envConcurrencyAfter).toBe(0); - // And confirm the release concurrency system has consumed the token - const queueMetrics = - await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ - orgId: authenticatedEnvironment.organizationId, - projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }); + // And confirm the release concurrency system has consumed the token + const queueMetrics = + 
await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }); - expect(queueMetrics?.currentTokens).toBe(0); + expect(queueMetrics?.currentTokens).toBe(0); - await engine.completeWaitpoint({ - id: result.waitpoint.id, - }); + await engine.completeWaitpoint({ + id: result.waitpoint.id, + }); - await setTimeout(1_000); + await setTimeout(1_000); - const executionData2 = await engine.getRunExecutionData({ runId: run.id }); - expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING"); + const executionData2 = await engine.getRunExecutionData({ runId: run.id }); + expect(executionData2?.snapshot.executionStatus).toBe("EXECUTING"); - const queueMetricsAfter = - await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ - orgId: authenticatedEnvironment.organizationId, - projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }); + const queueMetricsAfter = + await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }); - expect(queueMetricsAfter?.currentTokens).toBe(1); - } - ); + expect(queueMetricsAfter?.currentTokens).toBe(1); + } + ); + }); - containerTest( + engineTest( "refills token bucket after waitpoint completion when unable to reacquire concurrency, after dequeuing the queued executing run", - async ({ prisma, redisOptions }) => { + async ({ prisma, engine }) => { //create environment const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - 
runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, - releaseConcurrency: { - maxTokensRatio: 1, - maxRetries: 3, - consumersCount: 1, - pollInterval: 500, - batchSize: 1, - }, - tracer: trace.getTracer("test", "0.0.0"), - }); const taskIdentifier = "test-task"; await setupBackgroundWorker( @@ -1300,7 +1036,7 @@ describe("RunEngine Releasing Concurrency", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: `task/${taskIdentifier}`, isTest: false, tags: [], @@ -1308,10 +1044,11 @@ describe("RunEngine Releasing Concurrency", () => { prisma ); - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( @@ -1389,7 +1126,7 @@ describe("RunEngine Releasing Concurrency", () => { traceContext: {}, traceId: "t12345-second", spanId: "s12345-second", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -1398,10 +1135,11 @@ describe("RunEngine Releasing Concurrency", () => { ); // Dequeue and start the second run - const dequeuedSecond = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + + const dequeuedSecond = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: secondRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); // Now confirm the environment concurrency has been released @@ -1460,29 +1198,17 @@ describe("RunEngine Releasing Concurrency", () => { await setTimeout(500); - // Check the current concurrency of the queue/environment - const queueConcurrencyAfterSecondFinished = await 
engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); - - expect(queueConcurrencyAfterSecondFinished).toBe(0); - - const envConcurrencyAfterSecondFinished = - await engine.runQueue.currentConcurrencyOfEnvironment(authenticatedEnvironment); - - expect(envConcurrencyAfterSecondFinished).toBe(0); - let event: EventBusEventArgs<"workerNotification">[0] | undefined = undefined; engine.eventBus.on("workerNotification", (result) => { event = result; }); + await setTimeout(500); + // Verify the first run is back in the queue - const queuedRun = await engine.dequeueFromMasterQueue({ + const queuedRun = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); // We don't actually return the run here from dequeuing, it's instead sent to the cluster as a workerNotification @@ -1505,38 +1231,9 @@ describe("RunEngine Releasing Concurrency", () => { } ); - containerTest( - "refills token bucket after the run has a new snapshot created by the release concurrency sweeper system", - async ({ prisma, redisOptions }) => { - //create environment - const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - - const engine = new RunEngine({ - prisma, - worker: { - redis: redisOptions, - workers: 1, - tasksPerWorker: 10, - pollIntervalMs: 100, - }, - queue: { - redis: redisOptions, - }, - runLock: { - redis: redisOptions, - }, - machines: { - defaultMachine: "small-1x", - machines: { - "small-1x": { - name: "small-1x" as const, - cpu: 0.5, - memory: 0.5, - centsPerMs: 0.0001, - }, - }, - baseCostInCents: 0.0001, - }, + describe("release concurrency sweeper system", () => { + engineTest.scoped({ + engineOptions: { releaseConcurrency: { maxTokensRatio: 1, maxRetries: 3, @@ -1546,143 +1243,152 @@ describe("RunEngine Releasing Concurrency", () => { batchSize: 1, releasingsMaxAge: 2_000, }, - tracer: trace.getTracer("test", 
"0.0.0"), - }); - const taskIdentifier = "test-task"; + }, + }); - await setupBackgroundWorker( - engine, - authenticatedEnvironment, - taskIdentifier, - undefined, - undefined, - { - concurrencyLimit: 1, - } - ); + engineTest( + "refills token bucket after the run has a new snapshot created by the release concurrency sweeper system", + async ({ prisma, engine }) => { + //create environment + const authenticatedEnvironment = await setupAuthenticatedEnvironment(prisma, "PRODUCTION"); - const run = await engine.trigger( - { - number: 1, - friendlyId: "run_p1234", - environment: authenticatedEnvironment, - taskIdentifier, - payload: "{}", - payloadType: "application/json", - context: {}, - traceContext: {}, - traceId: "t12345", - spanId: "s12345", - masterQueue: "main", - queue: `task/${taskIdentifier}`, - isTest: false, - tags: [], - }, - prisma - ); + const taskIdentifier = "test-task"; - const dequeued = await engine.dequeueFromMasterQueue({ - consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, - }); + await setupBackgroundWorker( + engine, + authenticatedEnvironment, + taskIdentifier, + undefined, + undefined, + { + concurrencyLimit: 1, + } + ); + + const run = await engine.trigger( + { + number: 1, + friendlyId: "run_p1234", + environment: authenticatedEnvironment, + taskIdentifier, + payload: "{}", + payloadType: "application/json", + context: {}, + traceContext: {}, + traceId: "t12345", + spanId: "s12345", + workerQueue: "main", + queue: `task/${taskIdentifier}`, + isTest: false, + tags: [], + }, + prisma + ); - const queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + await setTimeout(500); - expect(queueConcurrency).toBe(1); + const dequeued = await engine.dequeueFromWorkerQueue({ + consumerId: "test_12345", + workerQueue: "main", + }); - const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + const 
queueConcurrency = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(envConcurrency).toBe(1); + expect(queueConcurrency).toBe(1); - // create an attempt - const attemptResult = await engine.startRunAttempt({ - runId: dequeued[0].run.id, - snapshotId: dequeued[0].snapshot.id, - }); + const envConcurrency = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); + expect(envConcurrency).toBe(1); - // create a manual waitpoint - const result = await engine.createManualWaitpoint({ - environmentId: authenticatedEnvironment.id, - projectId: authenticatedEnvironment.projectId, - }); + // create an attempt + const attemptResult = await engine.startRunAttempt({ + runId: dequeued[0].run.id, + snapshotId: dequeued[0].snapshot.id, + }); - // Block the run, specifying the release concurrency option as true - const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ - runId: run.id, - waitpoints: result.waitpoint.id, - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - releaseConcurrency: true, - }); + expect(attemptResult.snapshot.executionStatus).toBe("EXECUTING"); - expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); + // create a manual waitpoint + const result = await engine.createManualWaitpoint({ + environmentId: authenticatedEnvironment.id, + projectId: authenticatedEnvironment.projectId, + }); - // Now confirm the environment concurrency has been released - const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( - authenticatedEnvironment - ); + // Block the run, specifying the release concurrency option as true + const executingWithWaitpointSnapshot = await engine.blockRunWithWaitpoint({ + runId: run.id, + waitpoints: result.waitpoint.id, + projectId: 
authenticatedEnvironment.projectId, + organizationId: authenticatedEnvironment.organizationId, + releaseConcurrency: true, + }); - expect(envConcurrencyAfter).toBe(0); + expect(executingWithWaitpointSnapshot.executionStatus).toBe("EXECUTING_WITH_WAITPOINTS"); - const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( - authenticatedEnvironment, - `task/${taskIdentifier}` - ); + // Now confirm the environment concurrency has been released + const envConcurrencyAfter = await engine.runQueue.currentConcurrencyOfEnvironment( + authenticatedEnvironment + ); - expect(queueConcurrencyAfter).toBe(0); + expect(envConcurrencyAfter).toBe(0); - // And confirm the release concurrency system has consumed the token - const queueMetrics = - await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ - orgId: authenticatedEnvironment.organizationId, - projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }); + const queueConcurrencyAfter = await engine.runQueue.currentConcurrencyOfQueue( + authenticatedEnvironment, + `task/${taskIdentifier}` + ); - expect(queueMetrics?.currentTokens).toBe(9); + expect(queueConcurrencyAfter).toBe(0); - await setTimeout(3_000); + // And confirm the release concurrency system has consumed the token + const queueMetrics = + await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }); - const queueMetricsAfter = - await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ - orgId: authenticatedEnvironment.organizationId, - projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, - }); + expect(queueMetrics?.currentTokens).toBe(9); - expect(queueMetricsAfter?.currentTokens).toBe(9); + await setTimeout(3_000); - // Now we create a new snapshot for the 
run, which will cause the sweeper system to refill the token bucket - await engine.executionSnapshotSystem.createExecutionSnapshot(prisma, { - run, - snapshot: { - executionStatus: "PENDING_CANCEL", - description: "Pending cancel", - }, - environmentId: authenticatedEnvironment.id, - environmentType: "PRODUCTION", - projectId: authenticatedEnvironment.projectId, - organizationId: authenticatedEnvironment.organizationId, - }); + const queueMetricsAfter = + await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }); - await setTimeout(3_000); + expect(queueMetricsAfter?.currentTokens).toBe(9); - const queueMetricsAfterRefill = - await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ - orgId: authenticatedEnvironment.organizationId, + // Now we create a new snapshot for the run, which will cause the sweeper system to refill the token bucket + await engine.executionSnapshotSystem.createExecutionSnapshot(prisma, { + run, + snapshot: { + executionStatus: "PENDING_CANCEL", + description: "Pending cancel", + }, + environmentId: authenticatedEnvironment.id, + environmentType: "PRODUCTION", projectId: authenticatedEnvironment.projectId, - envId: authenticatedEnvironment.id, + organizationId: authenticatedEnvironment.organizationId, }); - expect(queueMetricsAfterRefill?.currentTokens).toBe(10); - } - ); + await setTimeout(3_000); + + const queueMetricsAfterRefill = + await engine.releaseConcurrencySystem.releaseConcurrencyQueue?.getReleaseQueueMetrics({ + orgId: authenticatedEnvironment.organizationId, + projectId: authenticatedEnvironment.projectId, + envId: authenticatedEnvironment.id, + }); + + expect(queueMetricsAfterRefill?.currentTokens).toBe(10); + } + ); + }); }); diff --git a/internal-packages/run-engine/src/engine/tests/trigger.test.ts 
b/internal-packages/run-engine/src/engine/tests/trigger.test.ts index 2716cf3df1..0fd5921f10 100644 --- a/internal-packages/run-engine/src/engine/tests/trigger.test.ts +++ b/internal-packages/run-engine/src/engine/tests/trigger.test.ts @@ -4,6 +4,7 @@ import { expect } from "vitest"; import { EventBusEventArgs } from "../eventBus.js"; import { RunEngine } from "../index.js"; import { setupAuthenticatedEnvironment, setupBackgroundWorker } from "./setup.js"; +import { setTimeout } from "node:timers/promises"; vi.setConfig({ testTimeout: 60_000 }); @@ -22,6 +23,8 @@ describe("RunEngine trigger()", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -64,7 +67,7 @@ describe("RunEngine trigger()", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -106,11 +109,12 @@ describe("RunEngine trigger()", () => { ); expect(envConcurrencyBefore).toBe(0); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(dequeued.length).toBe(1); expect(dequeued[0].run.id).toBe(run.id); @@ -215,6 +219,8 @@ describe("RunEngine trigger()", () => { pollIntervalMs: 100, }, queue: { + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, redis: redisOptions, }, runLock: { @@ -258,7 +264,7 @@ describe("RunEngine trigger()", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -266,11 +272,12 @@ describe("RunEngine trigger()", () => { prisma ); + await setTimeout(500); + //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + const 
dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt diff --git a/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts index fe806168bb..101314e86d 100644 --- a/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts +++ b/internal-packages/run-engine/src/engine/tests/triggerAndWait.test.ts @@ -22,6 +22,8 @@ describe("RunEngine triggerAndWait", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -61,19 +63,19 @@ describe("RunEngine triggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], + workerQueue: "main", }, prisma ); //dequeue parent - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -96,12 +98,12 @@ describe("RunEngine triggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], resumeParentOnCompletion: true, parentTaskRunId: parentRun.id, + workerQueue: "main", }, prisma ); @@ -128,10 +130,10 @@ describe("RunEngine triggerAndWait", () => { expect(runWaitpoint.waitpoint.completedByTaskRunId).toBe(childRun.id); //dequeue the child run - const dequeuedChild = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: childRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //start the child run @@ -210,6 +212,8 @@ 
describe("RunEngine triggerAndWait", () => { }, queue: { redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, }, runLock: { redis: redisOptions, @@ -249,19 +253,19 @@ describe("RunEngine triggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], + workerQueue: "main", }, prisma ); //dequeue parent and create the attempt - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun1.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const attemptResult = await engine.startRunAttempt({ runId: parentRun1.id, @@ -281,12 +285,12 @@ describe("RunEngine triggerAndWait", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", queue: `task/${childTask}`, isTest: false, tags: [], resumeParentOnCompletion: true, parentTaskRunId: parentRun1.id, + workerQueue: "main", }, prisma ); @@ -313,10 +317,10 @@ describe("RunEngine triggerAndWait", () => { expect(runWaitpoint.waitpoint.completedByTaskRunId).toBe(childRun.id); //dequeue the child run - const dequeuedChild = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedChild = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: childRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //start the child run @@ -338,18 +342,18 @@ describe("RunEngine triggerAndWait", () => { traceContext: {}, traceId: "t12346", spanId: "s12346", - masterQueue: "main", queue: `task/${parentTask}`, isTest: false, tags: [], + workerQueue: "main", }, prisma ); //dequeue 2nd parent - const dequeued2 = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued2 = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: parentRun2.masterQueue, - 
maxRunCount: 10, + workerQueue: "main", }); //create the 2nd parent attempt diff --git a/internal-packages/run-engine/src/engine/tests/ttl.test.ts b/internal-packages/run-engine/src/engine/tests/ttl.test.ts index 0ede60fbfd..737fd6fbad 100644 --- a/internal-packages/run-engine/src/engine/tests/ttl.test.ts +++ b/internal-packages/run-engine/src/engine/tests/ttl.test.ts @@ -23,6 +23,8 @@ describe("RunEngine ttl", () => { }, queue: { redis: redisOptions, + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, }, runLock: { redis: redisOptions, @@ -65,7 +67,7 @@ describe("RunEngine ttl", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], diff --git a/internal-packages/run-engine/src/engine/tests/utils/engineTest.ts b/internal-packages/run-engine/src/engine/tests/utils/engineTest.ts new file mode 100644 index 0000000000..4c3563e1e6 --- /dev/null +++ b/internal-packages/run-engine/src/engine/tests/utils/engineTest.ts @@ -0,0 +1,159 @@ +import { TaskContext, test, TestAPI } from "vitest"; +import { + logCleanup, + network, + postgresContainer, + prisma, + redisContainer, + redisOptions, + StartedNetwork, + StartedPostgreSqlContainer, + StartedRedisContainer, + type PostgresAndRedisContext, +} from "@internal/testcontainers"; +import { RunEngine } from "../../index.js"; +import { PrismaClient } from "@trigger.dev/database"; +import { RedisOptions } from "@internal/redis"; +import { trace } from "@internal/tracing"; +import { RunEngineOptions } from "../../types.js"; + +type Use = (value: T) => Promise; + +type EngineOptions = { + worker?: { + workers?: number; + tasksPerWorker?: number; + pollIntervalMs?: number; + }; + queue?: { + processWorkerQueueDebounceMs?: number; + masterQueueConsumersDisabled?: boolean; + }; + machines?: { + defaultMachine?: RunEngineOptions["machines"]["defaultMachine"]; + machines?: 
RunEngineOptions["machines"]["machines"]; + baseCostInCents?: number; + }; + releaseConcurrency?: { + maxTokensRatio?: number; + maxRetries?: number; + consumersCount?: number; + pollInterval?: number; + batchSize?: number; + releasingsPollInterval?: number; + releasingsMaxAge?: number; + }; +}; + +const engineOptions = async ({}: TaskContext, use: Use) => { + const options: EngineOptions = { + worker: { + workers: 1, + tasksPerWorker: 10, + pollIntervalMs: 100, + }, + queue: { + processWorkerQueueDebounceMs: 50, + masterQueueConsumersDisabled: true, + }, + machines: { + defaultMachine: "small-1x", + machines: { + "small-1x": { + name: "small-1x" as const, + cpu: 0.5, + memory: 0.5, + centsPerMs: 0.0001, + }, + }, + baseCostInCents: 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: 1, + maxRetries: 3, + consumersCount: 1, + pollInterval: 500, + batchSize: 1, + }, + }; + + await use(options); +}; + +const engine = async ( + { + engineOptions, + task, + redisOptions, + prisma, + }: { + engineOptions: EngineOptions; + redisOptions: RedisOptions; + prisma: PrismaClient; + } & TaskContext, + use: Use +) => { + const engine = new RunEngine({ + prisma, + worker: { + redis: redisOptions, + workers: engineOptions.worker?.workers ?? 1, + tasksPerWorker: engineOptions.worker?.tasksPerWorker ?? 10, + pollIntervalMs: engineOptions.worker?.pollIntervalMs ?? 100, + }, + queue: { + redis: redisOptions, + processWorkerQueueDebounceMs: engineOptions.queue?.processWorkerQueueDebounceMs ?? 50, + masterQueueConsumersDisabled: engineOptions.queue?.masterQueueConsumersDisabled ?? true, + }, + runLock: { + redis: redisOptions, + }, + machines: { + defaultMachine: engineOptions.machines?.defaultMachine ?? ("small-1x" as const), + machines: engineOptions.machines?.machines ?? {}, + baseCostInCents: engineOptions.machines?.baseCostInCents ?? 0.0001, + }, + releaseConcurrency: { + maxTokensRatio: engineOptions.releaseConcurrency?.maxTokensRatio ?? 
1, + maxRetries: engineOptions.releaseConcurrency?.maxRetries ?? 3, + consumersCount: engineOptions.releaseConcurrency?.consumersCount ?? 1, + pollInterval: engineOptions.releaseConcurrency?.pollInterval ?? 500, + batchSize: engineOptions.releaseConcurrency?.batchSize ?? 1, + releasingsPollInterval: engineOptions.releaseConcurrency?.releasingsPollInterval, + releasingsMaxAge: engineOptions.releaseConcurrency?.releasingsMaxAge, + }, + tracer: trace.getTracer("test", "0.0.0"), + }); + + const testName = task.name; + + try { + await use(engine); + } finally { + await logCleanup("engine", engine.quit(), { testName }); + } +}; + +export type EngineContext = PostgresAndRedisContext & { + engineOptions: EngineOptions; + engine: RunEngine; +}; + +export const engineTest: TestAPI<{ + redisOptions: RedisOptions; + prisma: PrismaClient; + engineOptions: EngineOptions; + engine: RunEngine; + network: StartedNetwork; + postgresContainer: StartedPostgreSqlContainer; + redisContainer: StartedRedisContainer; +}> = test.extend({ + network, + postgresContainer, + prisma, + redisContainer, + redisOptions, + engineOptions, + engine, +}); diff --git a/internal-packages/run-engine/src/engine/tests/waitpointRace.test.ts b/internal-packages/run-engine/src/engine/tests/waitpointRace.test.ts index 17df3f724a..f1e8c58006 100644 --- a/internal-packages/run-engine/src/engine/tests/waitpointRace.test.ts +++ b/internal-packages/run-engine/src/engine/tests/waitpointRace.test.ts @@ -15,7 +15,11 @@ describe("RunEngine Waitpoints – race condition", () => { const engine = new RunEngine({ prisma, worker: { redis: redisOptions, workers: 1, tasksPerWorker: 10, pollIntervalMs: 100 }, - queue: { redis: redisOptions }, + queue: { + redis: redisOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + }, runLock: { redis: redisOptions }, machines: { defaultMachine: "small-1x", @@ -43,7 +47,7 @@ describe("RunEngine Waitpoints – race condition", () => { traceContext: {}, traceId: 
"race-trace", spanId: "race-span", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -51,10 +55,10 @@ describe("RunEngine Waitpoints – race condition", () => { prisma ); - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); await engine.startRunAttempt({ runId: dequeued[0].run.id, diff --git a/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts index 4beadd6a74..25414a91e8 100644 --- a/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts +++ b/internal-packages/run-engine/src/engine/tests/waitpoints.test.ts @@ -62,7 +62,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -71,10 +71,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -177,7 +177,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -186,10 +186,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -319,7 +319,7 @@ describe("RunEngine 
Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -328,10 +328,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -457,7 +457,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -466,10 +466,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -574,7 +574,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -583,10 +583,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -723,7 +723,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -732,10 +732,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await 
setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -871,7 +871,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -880,10 +880,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -1022,7 +1022,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345", spanId: "s12345", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -1031,10 +1031,10 @@ describe("RunEngine Waitpoints", () => { ); //dequeue the run - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); //create an attempt @@ -1192,7 +1192,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345-first", spanId: "s12345-first", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -1201,10 +1201,10 @@ describe("RunEngine Waitpoints", () => { ); // Dequeue and start the first run - const dequeuedFirst = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedFirst = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: firstRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const firstAttempt = await engine.startRunAttempt({ @@ -1246,7 +1246,7 @@ 
describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t12345-second", spanId: "s12345-second", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -1255,10 +1255,10 @@ describe("RunEngine Waitpoints", () => { ); // Dequeue and start the second run - const dequeuedSecond = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeuedSecond = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: secondRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const secondAttempt = await engine.startRunAttempt({ @@ -1322,10 +1322,9 @@ describe("RunEngine Waitpoints", () => { }); // Verify the first run is back in the queue - const queuedRun = await engine.dequeueFromMasterQueue({ + const queuedRun = await engine.dequeueFromWorkerQueue({ consumerId: "test_12345", - masterQueue: firstRun.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); expect(queuedRun.length).toBe(0); @@ -1395,7 +1394,7 @@ describe("RunEngine Waitpoints", () => { traceContext: {}, traceId: "t_snapshotsince", spanId: "s_snapshotsince", - masterQueue: "main", + workerQueue: "main", queue: "task/test-task", isTest: false, tags: [], @@ -1404,10 +1403,10 @@ describe("RunEngine Waitpoints", () => { ); // Dequeue and start the run (snapshot 1) - const dequeued = await engine.dequeueFromMasterQueue({ + await setTimeout(500); + const dequeued = await engine.dequeueFromWorkerQueue({ consumerId: "test_snapshotsince", - masterQueue: run.masterQueue, - maxRunCount: 10, + workerQueue: "main", }); const attemptResult = await engine.startRunAttempt({ runId: dequeued[0].run.id, diff --git a/internal-packages/run-engine/src/engine/types.ts b/internal-packages/run-engine/src/engine/types.ts index 84281bc505..f07dd703ab 100644 --- a/internal-packages/run-engine/src/engine/types.ts +++ b/internal-packages/run-engine/src/engine/types.ts @@ -1,6 +1,6 @@ import { type RedisOptions } from "@internal/redis"; 
import { Worker, type WorkerConcurrencyOptions } from "@trigger.dev/redis-worker"; -import { Tracer } from "@internal/tracing"; +import { Meter, Tracer } from "@internal/tracing"; import { MachinePreset, MachinePresetName, @@ -12,6 +12,7 @@ import { PrismaClient } from "@trigger.dev/database"; import { FairQueueSelectionStrategyOptions } from "../run-queue/fairQueueSelectionStrategy.js"; import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import { workerCatalog } from "./workerCatalog.js"; +import { Logger, LogLevel } from "@trigger.dev/core/logger"; export type RunEngineOptions = { prisma: PrismaClient; @@ -29,12 +30,18 @@ export type RunEngineOptions = { }; queue: { redis: RedisOptions; + shardCount?: number; + masterQueueConsumersDisabled?: boolean; + processWorkerQueueDebounceMs?: number; + masterQueueConsumersIntervalMs?: number; + workerOptions?: WorkerConcurrencyOptions; retryOptions?: RetryOptions; defaultEnvConcurrency?: number; queueSelectionStrategyOptions?: Pick< FairQueueSelectionStrategyOptions, "parentQueueLimit" | "tracer" | "biases" | "reuseSnapshotCount" | "maximumEnvCount" >; + dequeueBlockingTimeoutSeconds?: number; }; runLock: { redis: RedisOptions; @@ -44,6 +51,9 @@ export type RunEngineOptions = { heartbeatTimeoutsMs?: Partial; queueRunsWaitingForWorkerBatchSize?: number; tracer: Tracer; + meter?: Meter; + logger?: Logger; + logLevel?: LogLevel; releaseConcurrency?: { disabled?: boolean; maxTokensRatio?: number; @@ -90,7 +100,7 @@ export type TriggerParams = { sdkVersion?: string; cliVersion?: string; concurrencyKey?: string; - masterQueue?: string; + workerQueue?: string; queue: string; lockedQueueId?: string; isTest: boolean; diff --git a/internal-packages/run-engine/src/run-queue/index.test.ts b/internal-packages/run-engine/src/run-queue/index.test.ts index 302e7b6c68..a587a0a06d 100644 --- a/internal-packages/run-engine/src/run-queue/index.test.ts +++ b/internal-packages/run-engine/src/run-queue/index.test.ts @@ -1,4 
+1,4 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { describe } from "node:test"; @@ -15,7 +15,7 @@ const testOptions = { workers: 1, defaultEnvConcurrency: 25, enableRebalancing: false, - logger: new Logger("RunQueue", "warn"), + logger: new Logger("RunQueue", "debug"), retryOptions: { maxAttempts: 5, factor: 1.1, @@ -67,96 +67,8 @@ const messageDev: InputPayload = { }; describe("RunQueue", () => { - redisTest("Get/set Queue concurrency limit", { timeout: 15_000 }, async ({ redisContainer }) => { - const queue = new RunQueue({ - ...testOptions, - queueSelectionStrategy: new FairQueueSelectionStrategy({ - redis: { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }, - keys: testOptions.keys, - }), - redis: { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }, - }); - - try { - //initial value - const initial = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task"); - expect(initial).toBe(undefined); - - //set 20 - const result = await queue.updateQueueConcurrencyLimits( - authenticatedEnvProd, - "task/my-task", - 20 - ); - expect(result).toBe("OK"); - - //get 20 - const updated = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task"); - expect(updated).toBe(20); - - //remove - const result2 = await queue.removeQueueConcurrencyLimits( - authenticatedEnvProd, - "task/my-task" - ); - expect(result2).toBe(1); - - //get undefined - const removed = await queue.getQueueConcurrencyLimit(authenticatedEnvProd, "task/my-task"); - expect(removed).toBe(undefined); - } finally { - await queue.quit(); - } - }); - - redisTest("Update env concurrency limits", { timeout: 5_000 }, async ({ redisContainer }) => { - const queue = new RunQueue({ - ...testOptions, 
- queueSelectionStrategy: new FairQueueSelectionStrategy({ - redis: { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }, - keys: testOptions.keys, - }), - redis: { - keyPrefix: "runqueue:test:", - host: redisContainer.getHost(), - port: redisContainer.getPort(), - }, - }); - - try { - //initial value - const initial = await queue.getEnvConcurrencyLimit(authenticatedEnvProd); - expect(initial).toBe(25); - - //set 20 - await queue.updateEnvConcurrencyLimits({ - ...authenticatedEnvProd, - maximumConcurrencyLimit: 20, - }); - - //get 20 - const updated = await queue.getEnvConcurrencyLimit(authenticatedEnvProd); - expect(updated).toBe(20); - } finally { - await queue.quit(); - } - }); - redisTest( "Enqueue/Dequeue a message in env (DEV run, no concurrency key)", - { timeout: 5_000 }, async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, @@ -186,13 +98,11 @@ describe("RunQueue", () => { const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); expect(oldestScore).toBe(undefined); - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - //enqueue message await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); //queue length @@ -213,20 +123,27 @@ describe("RunQueue", () => { authenticatedEnvDev, messageDev.queue ); + expect(queueConcurrency).toBe(0); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency).toBe(0); - const dequeued = await queue.dequeueMessageFromMasterQueue( + await setTimeout(1000); + + const dequeued = await queue.dequeueMessageFromWorkerQueue( "test_12345", - envMasterQueue, - 10 + authenticatedEnvDev.id ); - expect(dequeued.length).toBe(1); - expect(dequeued[0].messageId).toEqual(messageDev.runId); - expect(dequeued[0].message.orgId).toEqual(messageDev.orgId); - 
expect(dequeued[0].message.version).toEqual("1"); - expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]); + + assertNonNullable(dequeued); + expect(dequeued.messageId).toEqual(messageDev.runId); + expect(dequeued.message.orgId).toEqual(messageDev.orgId); + expect(dequeued.message.version).toEqual("2"); + + const workerQueue = + dequeued.message.version == "2" ? dequeued.message.workerQueue : undefined; + + expect(workerQueue).toEqual(authenticatedEnvDev.id); //concurrencies const queueConcurrency2 = await queue.currentConcurrencyOfQueue( @@ -243,15 +160,11 @@ describe("RunQueue", () => { const envQueueLength3 = await queue.lengthOfEnvQueue(authenticatedEnvDev); expect(envQueueLength3).toBe(0); - const dequeued2 = await queue.dequeueMessageFromMasterQueue( + const dequeued2 = await queue.dequeueMessageFromWorkerQueue( "test_12345", - envMasterQueue, - 10 + authenticatedEnvDev.id ); - expect(dequeued2.length).toBe(0); - - const dequeued3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued3.length).toBe(0); + expect(dequeued2).toBe(undefined); } finally { await queue.quit(); } @@ -260,7 +173,6 @@ describe("RunQueue", () => { redisTest( "Enqueue/Dequeue a message from the main queue (PROD run, no concurrency key)", - { timeout: 5_000 }, async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, @@ -293,13 +205,114 @@ describe("RunQueue", () => { ); expect(oldestScore).toBe(undefined); - const envMasterQueue = `env:${authenticatedEnvDev.id}`; + //enqueue message + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + workerQueue: "main", + }); + + //queue length + const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(queueLength).toBe(1); + const envLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envLength).toBe(1); + + //oldest message + const oldestScore2 = await queue.oldestMessageInQueue( + 
authenticatedEnvProd, + messageProd.queue + ); + expect(oldestScore2).toBe(messageProd.timestamp); + + //concurrencies + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(queueConcurrency).toBe(0); + const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); + expect(envConcurrency).toBe(0); + + await setTimeout(1000); + + //dequeue + const dequeued = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + + assertNonNullable(dequeued); + expect(dequeued).toBeDefined(); + expect(dequeued!.messageId).toEqual(messageProd.runId); + expect(dequeued!.message.orgId).toEqual(messageProd.orgId); + expect(dequeued!.message.version).toEqual("2"); + + const workerQueue = + dequeued.message.version == "2" ? dequeued.message.workerQueue : undefined; + expect(workerQueue).toEqual("main"); + + //concurrencies + const queueConcurrency2 = await queue.currentConcurrencyOfQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(queueConcurrency2).toBe(1); + const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); + expect(envConcurrency2).toBe(1); + + //queue length + const length2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(length2).toBe(0); + const envLength2 = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envLength2).toBe(0); + + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued2).toBe(undefined); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "Enqueue/Dequeue a message with dequeue consumers disabled", + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: 
redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + //initial queue length + const result = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(result).toBe(0); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envQueueLength).toBe(0); + + //initial oldest message + const oldestScore = await queue.oldestMessageInQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(oldestScore).toBe(undefined); //enqueue message await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: ["main", envMasterQueue], + workerQueue: "main", }); //queue length @@ -324,13 +337,20 @@ describe("RunQueue", () => { const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); expect(envConcurrency).toBe(0); + await setTimeout(1000); + //dequeue - const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued.length).toBe(1); - expect(dequeued[0].messageId).toEqual(messageProd.runId); - expect(dequeued[0].message.orgId).toEqual(messageProd.orgId); - expect(dequeued[0].message.version).toEqual("1"); - expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]); + const dequeued = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + + assertNonNullable(dequeued); + expect(dequeued).toBeDefined(); + expect(dequeued!.messageId).toEqual(messageProd.runId); + expect(dequeued!.message.orgId).toEqual(messageProd.orgId); + expect(dequeued!.message.version).toEqual("2"); + + const workerQueue = + dequeued.message.version == "2" ? 
dequeued.message.workerQueue : undefined; + expect(workerQueue).toEqual("main"); //concurrencies const queueConcurrency2 = await queue.currentConcurrencyOfQueue( @@ -347,21 +367,21 @@ describe("RunQueue", () => { const envLength2 = await queue.lengthOfEnvQueue(authenticatedEnvProd); expect(envLength2).toBe(0); - const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued2.length).toBe(0); + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued2).toBe(undefined); } finally { await queue.quit(); } } ); - // This test fails now because we only return a single run per env. We will change this in the future. - redisTest.fails( - "Dequeue multiple messages from the queue", - { timeout: 5_000 }, + redisTest( + "Dequeue a message when another message on the same queue is acked", async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, + masterQueueConsumersDisabled: true, + processWorkerQueueDebounceMs: 50, queueSelectionStrategy: new FairQueueSelectionStrategy({ redis: { keyPrefix: "runqueue:test:", @@ -378,63 +398,186 @@ describe("RunQueue", () => { }); try { - // Create 20 messages with different runIds and some with different queues - const messages = Array.from({ length: 20 }, (_, i) => ({ - ...messageProd, - taskIdentifier: i < 15 ? "task/my-task" : "task/other-task", // Mix up the queues - runId: `r${i + 1}`, - queue: i < 15 ? 
"task/my-task" : "task/other-task", // Mix up the queues - })); - - // Enqueue all messages - for (const message of messages) { - await queue.enqueueMessage({ - env: authenticatedEnvProd, - message, - masterQueues: "main", - }); - } - - // Check initial queue lengths - const initialLength1 = await queue.lengthOfQueue(authenticatedEnvProd, "task/my-task"); - const initialLength2 = await queue.lengthOfQueue(authenticatedEnvProd, "task/other-task"); - expect(initialLength1).toBe(15); - expect(initialLength2).toBe(5); + // Set queue concurrency limit to 1 + await queue.updateQueueConcurrencyLimits(authenticatedEnvProd, messageProd.queue, 1); + + //initial queue length + const result = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(result).toBe(0); const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); - expect(envQueueLength).toBe(20); - - // Dequeue first batch of 10 messages - const dequeued1 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued1.length).toBe(10); - - // Dequeue second batch of 10 messages - const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued2.length).toBe(10); - - // Combine all dequeued message IDs - const dequeuedIds = [...dequeued1, ...dequeued2].map((m) => m.messageId); - - // Check that all original messages were dequeued - const allOriginalIds = messages.map((m) => m.runId); - expect(dequeuedIds.sort()).toEqual(allOriginalIds.sort()); - - // Try to dequeue more - should get none - const dequeued3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(dequeued3.length).toBe(0); - - // Check final queue lengths - const finalLength1 = await queue.lengthOfQueue(authenticatedEnvProd, "task/my-task"); - const finalLength2 = await queue.lengthOfQueue(authenticatedEnvProd, "task/other-task"); - expect(finalLength1).toBe(0); - expect(finalLength2).toBe(0); - const finalEnvQueueLength = await 
queue.lengthOfEnvQueue(authenticatedEnvProd); - expect(finalEnvQueueLength).toBe(0); + expect(envQueueLength).toBe(0); + + //initial oldest message + const oldestScore = await queue.oldestMessageInQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(oldestScore).toBe(undefined); + + //enqueue message + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + workerQueue: "main", + skipDequeueProcessing: true, + }); + + // Enqueue another message + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: { ...messageProd, runId: "r4322" }, + workerQueue: "main", + skipDequeueProcessing: true, + }); + + //queue length + const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(queueLength).toBe(2); + const envLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envLength).toBe(2); + + //oldest message + const oldestScore2 = await queue.oldestMessageInQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(oldestScore2).toBe(messageProd.timestamp); + + //concurrencies + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(queueConcurrency).toBe(0); + const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); + expect(envConcurrency).toBe(0); + + // Process the message so it can be dequeued + await queue.processMasterQueueForEnvironment(authenticatedEnvProd.id, 1); + + //dequeue + const dequeued = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + + assertNonNullable(dequeued); + expect(dequeued).toBeDefined(); + expect(dequeued!.messageId).toEqual(messageProd.runId); + expect(dequeued!.message.orgId).toEqual(messageProd.orgId); + expect(dequeued!.message.version).toEqual("2"); + + // Now lets ack the message + await queue.acknowledgeMessage(messageProd.orgId, messageProd.runId); + + await setTimeout(1000); + + // Now we can dequeue the 
other message + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + assertNonNullable(dequeued2); + expect(dequeued2).toBeDefined(); + expect(dequeued2!.messageId).toEqual("r4322"); + expect(dequeued2!.message.orgId).toEqual(messageProd.orgId); + expect(dequeued2!.message.version).toEqual("2"); } finally { await queue.quit(); } } ); + redisTest("Enqueue/Dequeue with 8 shards", async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + shardCount: 8, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + //initial queue length + const result = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(result).toBe(0); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envQueueLength).toBe(0); + + //initial oldest message + const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvProd, messageProd.queue); + expect(oldestScore).toBe(undefined); + + //enqueue message + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + workerQueue: "main", + }); + + //queue length + const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(queueLength).toBe(1); + const envLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envLength).toBe(1); + + //oldest message + const oldestScore2 = await queue.oldestMessageInQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(oldestScore2).toBe(messageProd.timestamp); + + //concurrencies + const queueConcurrency = await queue.currentConcurrencyOfQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(queueConcurrency).toBe(0); + const 
envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); + expect(envConcurrency).toBe(0); + + await setTimeout(1000); + + //dequeue + const dequeued = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + + assertNonNullable(dequeued); + expect(dequeued).toBeDefined(); + expect(dequeued!.messageId).toEqual(messageProd.runId); + expect(dequeued!.message.orgId).toEqual(messageProd.orgId); + expect(dequeued!.message.version).toEqual("2"); + + const workerQueue = + dequeued.message.version == "2" ? dequeued.message.workerQueue : undefined; + expect(workerQueue).toEqual("main"); + + //concurrencies + const queueConcurrency2 = await queue.currentConcurrencyOfQueue( + authenticatedEnvProd, + messageProd.queue + ); + expect(queueConcurrency2).toBe(1); + const envConcurrency2 = await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd); + expect(envConcurrency2).toBe(1); + + //queue length + const length2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(length2).toBe(0); + const envLength2 = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envLength2).toBe(0); + + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued2).toBe(undefined); + } finally { + await queue.quit(); + } + }); + redisTest("Acking", { timeout: 5_000 }, async ({ redisContainer, redisOptions }) => { const queue = new RunQueue({ ...testOptions, @@ -459,7 +602,7 @@ describe("RunQueue", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: "main", }); const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); @@ -467,8 +610,12 @@ describe("RunQueue", () => { const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); expect(envQueueLength).toBe(1); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - 
expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message).toBeDefined(); + + assertNonNullable(message); const queueLength2 = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); expect(queueLength2).toBe(0); @@ -476,11 +623,11 @@ describe("RunQueue", () => { expect(envQueueLength2).toBe(0); //check the message is gone - const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); + const key = queue.keys.messageKey(message.message.orgId, message.messageId); const exists = await redis.exists(key); expect(exists).toBe(1); - await queue.acknowledgeMessage(messages[0].message.orgId, messages[0].messageId); + await queue.acknowledgeMessage(message.message.orgId, message.messageId); //concurrencies const queueConcurrency = await queue.currentConcurrencyOfQueue( @@ -502,8 +649,8 @@ describe("RunQueue", () => { expect(exists2).toBe(0); //dequeue - const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages2.length).toBe(0); + const message2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message2).toBe(undefined); } finally { try { await queue.quit(); @@ -534,7 +681,7 @@ describe("RunQueue", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: "main", }); const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); @@ -542,6 +689,8 @@ describe("RunQueue", () => { const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); expect(envQueueLength).toBe(1); + await setTimeout(1000); + await queue.acknowledgeMessage(messageProd.orgId, messageProd.runId); //concurrencies @@ -560,13 +709,105 @@ describe("RunQueue", () => { expect(envQueueLength3).toBe(0); //dequeue - const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - 
expect(messages2.length).toBe(0); + const message2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message2).toBe(undefined); } finally { await queue.quit(); } }); + redisTest( + "Ack after moving to workerQueue with removeFromWorkerQueue = undefined", + { timeout: 5_000 }, + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + workerQueue: "main", + }); + + const queueLength = await queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(queueLength).toBe(1); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envQueueLength).toBe(1); + + await setTimeout(1000); + + await queue.acknowledgeMessage(messageProd.orgId, messageProd.runId); + + const messages = await queue.peekAllOnWorkerQueue("main"); + expect(messages.length).toEqual(1); + } finally { + await queue.quit(); + } + } + ); + + redisTest( + "Ack after moving to workerQueue with removeFromWorkerQueue = true", + { timeout: 5_000 }, + async ({ redisContainer }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + try { + await queue.enqueueMessage({ + env: authenticatedEnvProd, + message: messageProd, + workerQueue: "main", + }); + + const queueLength = await 
queue.lengthOfQueue(authenticatedEnvProd, messageProd.queue); + expect(queueLength).toBe(1); + const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvProd); + expect(envQueueLength).toBe(1); + + await setTimeout(1000); + + await queue.acknowledgeMessage(messageProd.orgId, messageProd.runId, { + removeFromWorkerQueue: true, + }); + + const messages = await queue.peekAllOnWorkerQueue("main"); + expect(messages.length).toEqual(0); + } finally { + await queue.quit(); + } + } + ); + redisTest("Nacking", { timeout: 15_000 }, async ({ redisContainer, redisOptions }) => { const queue = new RunQueue({ ...testOptions, @@ -591,14 +832,18 @@ describe("RunQueue", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main2", + workerQueue: "main", }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main2", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message).toBeDefined(); + + assertNonNullable(message); //check the message is there - const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); + const key = queue.keys.messageKey(message.message.orgId, message.messageId); const exists = await redis.exists(key); expect(exists).toBe(1); @@ -612,8 +857,8 @@ describe("RunQueue", () => { expect(envConcurrency).toBe(1); await queue.nackMessage({ - orgId: messages[0].message.orgId, - messageId: messages[0].messageId, + orgId: message.message.orgId, + messageId: message.messageId, }); //we need to wait because the default wait is 1 second @@ -638,9 +883,13 @@ describe("RunQueue", () => { const exists2 = await redis.exists(key); expect(exists2).toBe(1); + await setTimeout(1000); + //dequeue - const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main2", 10); - expect(messages2[0].messageId).toBe(messageProd.runId); + const messages2 = await 
queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(messages2).toBeDefined(); + assertNonNullable(messages2); + expect(messages2.messageId).toBe(messageProd.runId); } finally { try { await queue.quit(); @@ -673,14 +922,18 @@ describe("RunQueue", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: "main", }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message).toBeDefined(); + + assertNonNullable(message); //check the message is gone - const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); + const key = queue.keys.messageKey(message.message.orgId, message.messageId); const exists = await redis.exists(key); expect(exists).toBe(1); @@ -691,10 +944,7 @@ describe("RunQueue", () => { expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); //release the concurrency - await queue.releaseAllConcurrency( - authenticatedEnvProd.organization.id, - messages[0].messageId - ); + await queue.releaseAllConcurrency(authenticatedEnvProd.organization.id, message.messageId); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -703,7 +953,7 @@ describe("RunQueue", () => { expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); //reacquire the concurrency - await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, messages[0].messageId); + await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, message.messageId); //check concurrencies are back to what they were before expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -712,10 +962,7 @@ describe("RunQueue", () => { expect(await 
queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(1); //release the concurrency (with the queue this time) - await queue.releaseAllConcurrency( - authenticatedEnvProd.organization.id, - messages[0].messageId - ); + await queue.releaseAllConcurrency(authenticatedEnvProd.organization.id, message.messageId); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -724,7 +971,7 @@ describe("RunQueue", () => { expect(await queue.currentConcurrencyOfEnvironment(authenticatedEnvProd)).toBe(0); //reacquire the concurrency - await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, messages[0].messageId); + await queue.reacquireConcurrency(authenticatedEnvProd.organization.id, message.messageId); //check concurrencies are back to what they were before expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -766,26 +1013,30 @@ describe("RunQueue", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: "main", }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message).toBeDefined(); + + assertNonNullable(message); //check the message is there - const key = queue.keys.messageKey(messages[0].message.orgId, messages[0].messageId); + const key = queue.keys.messageKey(message.message.orgId, message.messageId); const exists = await redis.exists(key); expect(exists).toBe(1); //nack (we only have attempts set to 1) await queue.nackMessage({ - orgId: messages[0].message.orgId, - messageId: messages[0].messageId, + orgId: message.message.orgId, + messageId: message.messageId, }); //dequeue - const messages2 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - 
expect(messages2.length).toBe(0); + const message2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message2).toBe(undefined); //concurrencies const queueConcurrency2 = await queue.currentConcurrencyOfQueue( @@ -797,8 +1048,8 @@ describe("RunQueue", () => { expect(envConcurrency2).toBe(0); //check the message is still there - const message = await queue.readMessage(messages[0].message.orgId, messages[0].messageId); - expect(message).toBeDefined(); + const messageRead = await queue.readMessage(message.message.orgId, message.messageId); + expect(messageRead).toBeDefined(); const deadLetterQueueLengthBefore = await queue.lengthOfDeadLetterQueue(authenticatedEnvProd); expect(deadLetterQueueLengthBefore).toBe(1); @@ -823,8 +1074,10 @@ describe("RunQueue", () => { expect(existsInDlqAfter).toBe(false); //dequeue - const messages3 = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages3[0].messageId).toBe(messageProd.runId); + const message3 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(message3).toBeDefined(); + assertNonNullable(message3); + expect(message3.messageId).toBe(messageProd.runId); } finally { try { await queue.quit(); diff --git a/internal-packages/run-engine/src/run-queue/index.ts b/internal-packages/run-engine/src/run-queue/index.ts index b792dda793..fbcfe291bd 100644 --- a/internal-packages/run-engine/src/run-queue/index.ts +++ b/internal-packages/run-engine/src/run-queue/index.ts @@ -8,9 +8,14 @@ import { SEMATTRS_MESSAGE_ID, SEMATTRS_MESSAGING_OPERATION, SEMATTRS_MESSAGING_SYSTEM, + Meter, + getMeter, + ValueType, + ObservableResult, + Attributes, } from "@internal/tracing"; -import { Logger } from "@trigger.dev/core/logger"; -import { calculateNextRetryDelay } from "@trigger.dev/core/v3"; +import { Logger, LogLevel } from "@trigger.dev/core/logger"; +import { calculateNextRetryDelay, flattenAttributes } from "@trigger.dev/core/v3"; import { type RetryOptions } from 
"@trigger.dev/core/v3/schemas"; import { attributesFromAuthenticatedEnv, @@ -19,6 +24,7 @@ import { import { InputPayload, OutputPayload, + OutputPayloadV2, RunQueueKeyProducer, RunQueueSelectionStrategy, } from "./types.js"; @@ -31,10 +37,16 @@ import { } from "@internal/redis"; import { MessageNotFoundError } from "./errors.js"; import { tryCatch } from "@trigger.dev/core"; +import { setInterval } from "node:timers/promises"; +import { nanoid } from "nanoid"; +import { Worker, type WorkerConcurrencyOptions } from "@trigger.dev/redis-worker"; +import { z } from "zod"; const SemanticAttributes = { QUEUE: "runqueue.queue", - MASTER_QUEUES: "runqueue.masterQueues", + WORKER_QUEUE: "runqueue.workerQueue", + MASTER_QUEUE_SHARD: "runqueue.masterQueueShard", + CONSUMER_ID: "runqueue.consumerId", RUN_ID: "runqueue.runId", RESULT_COUNT: "runqueue.resultCount", CONCURRENCY_KEY: "runqueue.concurrencyKey", @@ -51,7 +63,21 @@ export type RunQueueOptions = { queueSelectionStrategy: RunQueueSelectionStrategy; verbose?: boolean; logger?: Logger; + logLevel?: LogLevel; retryOptions?: RetryOptions; + shardCount?: number; + masterQueueConsumersDisabled?: boolean; + masterQueueConsumersIntervalMs?: number; + processWorkerQueueDebounceMs?: number; + workerOptions?: { + pollIntervalMs?: number; + immediatePollIntervalMs?: number; + shutdownTimeoutMs?: number; + concurrency?: WorkerConcurrencyOptions; + disabled?: boolean; + }; + meter?: Meter; + dequeueBlockingTimeoutSeconds?: number; }; type DequeuedMessage = { @@ -68,18 +94,35 @@ const defaultRetrySettings = { randomize: true, }; +const workerCatalog = { + processQueueForWorkerQueue: { + schema: z.object({ + queueKey: z.string(), + environmentId: z.string(), + }), + visibilityTimeoutMs: 30_000, + }, +}; + /** * RunQueue – the queue that's used to process runs */ export class RunQueue { private retryOptions: RetryOptions; private subscriber: Redis; + private luaDebugSubscriber: Redis; private logger: Logger; private redis: Redis; 
public keys: RunQueueKeyProducer; private queueSelectionStrategy: RunQueueSelectionStrategy; + private shardCount: number; + private abortController: AbortController; + private worker: Worker; + private _observableWorkerQueues: Set = new Set(); + private _meter: Meter; constructor(private readonly options: RunQueueOptions) { + this.shardCount = options.shardCount ?? 2; this.retryOptions = options.retryOptions ?? defaultRetrySettings; this.redis = createRedisClient(options.redis, { onError: (error) => { @@ -89,21 +132,64 @@ export class RunQueue { }); }, }); - this.logger = options.logger ?? new Logger("RunQueue", "warn"); + this.logger = options.logger ?? new Logger("RunQueue", options.logLevel ?? "info"); + this._meter = options.meter ?? getMeter("run-queue"); + + const workerQueueObservableGauge = this._meter.createObservableGauge( + "runqueue.workerQueue.length", + { + description: "The number of messages in the worker queue", + unit: "messages", + valueType: ValueType.INT, + } + ); + + const masterQueueObservableGauge = this._meter.createObservableGauge( + "runqueue.masterQueue.length", + { + description: "The number of queues in the master queue shard", + unit: "queues", + valueType: ValueType.INT, + } + ); + + workerQueueObservableGauge.addCallback(this.#updateWorkerQueueLength.bind(this)); + masterQueueObservableGauge.addCallback(this.#updateMasterQueueLength.bind(this)); + + this.abortController = new AbortController(); this.keys = options.keys; this.queueSelectionStrategy = options.queueSelectionStrategy; - this.subscriber = createRedisClient(options.redis, { - onError: (error) => { - this.logger.error(`RunQueue subscriber redis client error:`, { - error, - keyPrefix: options.redis.keyPrefix, - }); + this.subscriber = this.redis.duplicate(); + this.luaDebugSubscriber = this.redis.duplicate(); + + this.worker = new Worker({ + name: "run-queue-worker", + redisOptions: { + ...options.redis, + keyPrefix: `${options.redis.keyPrefix}:worker`, + }, + catalog: 
workerCatalog, + concurrency: options.workerOptions?.concurrency, + pollIntervalMs: options.workerOptions?.pollIntervalMs ?? 1000, + immediatePollIntervalMs: options.workerOptions?.immediatePollIntervalMs ?? 100, + shutdownTimeoutMs: options.workerOptions?.shutdownTimeoutMs ?? 10_000, + logger: new Logger("RunQueueWorker", options.logLevel ?? "log"), + jobs: { + processQueueForWorkerQueue: async (job) => { + await this.#processQueueForWorkerQueue(job.payload.queueKey, job.payload.environmentId); + }, }, }); - this.#setupSubscriber(); + if (!options.workerOptions?.disabled) { + this.worker.start(); + } + + this.#setupSubscriber(); + this.#setupLuaLogSubscriber(); + this.#startMasterQueueConsumers(); this.#registerCommands(); } @@ -115,6 +201,35 @@ export class RunQueue { return this.options.tracer; } + get meter() { + return this._meter; + } + + public async registerObservableWorkerQueue(workerQueue: string) { + this._observableWorkerQueues.add(workerQueue); + } + + async #updateWorkerQueueLength(observableResult: ObservableResult) { + for (const workerQueue of this._observableWorkerQueues) { + const workerQueueLength = await this.redis.llen(this.keys.workerQueueKey(workerQueue)); + + observableResult.observe(workerQueueLength, { + [SemanticAttributes.WORKER_QUEUE]: workerQueue, + }); + } + } + + async #updateMasterQueueLength(observableResult: ObservableResult) { + for (let shard = 0; shard < this.shardCount; shard++) { + const masterQueueKey = this.keys.masterQueueKeyForShard(shard); + const masterQueueLength = await this.redis.zcard(masterQueueKey); + + observableResult.observe(masterQueueLength, { + [SemanticAttributes.MASTER_QUEUE_SHARD]: shard.toString(), + }); + } + } + public async updateQueueConcurrencyLimits( env: MinimalAuthenticatedEnvironment, queue: string, @@ -197,7 +312,9 @@ export class RunQueue { return; } - return Number(result[1]); + const score = Number(result[1]); + + return score; } public async currentConcurrencyOfQueue( @@ -324,39 +441,54 @@ 
export class RunQueue { public async enqueueMessage({ env, message, - masterQueues, + workerQueue, + skipDequeueProcessing = false, }: { env: MinimalAuthenticatedEnvironment; message: InputPayload; - masterQueues: string | string[]; + workerQueue: string; + skipDequeueProcessing?: boolean; }) { return await this.#trace( "enqueueMessage", async (span) => { const { runId, concurrencyKey } = message; - const queue = this.keys.queueKey(env, message.queue, concurrencyKey); + const queueKey = this.keys.queueKey(env, message.queue, concurrencyKey); propagation.inject(context.active(), message); - const parentQueues = typeof masterQueues === "string" ? [masterQueues] : masterQueues; - span.setAttributes({ - [SemanticAttributes.QUEUE]: queue, + [SemanticAttributes.QUEUE]: queueKey, [SemanticAttributes.RUN_ID]: runId, [SemanticAttributes.CONCURRENCY_KEY]: concurrencyKey, - [SemanticAttributes.MASTER_QUEUES]: parentQueues.join(","), + [SemanticAttributes.WORKER_QUEUE]: workerQueue, }); - const messagePayload: OutputPayload = { + const messagePayload: OutputPayloadV2 = { ...message, - version: "1", - queue, - masterQueues: parentQueues, + version: "2", + queue: queueKey, + workerQueue, attempt: 0, }; - return await this.#callEnqueueMessage(messagePayload, parentQueues); + if (!skipDequeueProcessing) { + // This will move the message to the worker queue so it can be dequeued + await this.worker.enqueueOnce({ + id: queueKey, // dedupe by environment, queue, and concurrency key + job: "processQueueForWorkerQueue", + payload: { + queueKey, + environmentId: env.id, + }, + // Add a small delay to dedupe messages so at most one of these will processed, + // every 500ms per queue, concurrency key, and environment + availableAt: new Date(Date.now() + (this.options.processWorkerQueueDebounceMs ?? 
500)), // 500ms from now + }); + } + + return await this.#callEnqueueMessage(messagePayload); }, { kind: SpanKind.PRODUCER, @@ -371,84 +503,39 @@ export class RunQueue { } /** - * Dequeue messages from the master queue + * Dequeue messages from the worker queue */ - public async dequeueMessageFromMasterQueue( + public async dequeueMessageFromWorkerQueue( consumerId: string, - masterQueue: string, - maxCount: number - ): Promise { + workerQueue: string + ): Promise { return this.#trace( - "dequeueMessageInSharedQueue", + "dequeueMessageFromWorkerQueue", async (span) => { - const envQueues = await this.queueSelectionStrategy.distributeFairQueuesFromParentQueue( - masterQueue, - consumerId - ); - - span.setAttribute("environment_count", envQueues.length); - - if (envQueues.length === 0) { - return []; - } - - let attemptedEnvs = 0; - let attemptedQueues = 0; - - const messages: DequeuedMessage[] = []; - - for (const env of envQueues) { - attemptedEnvs++; - - for (const queue of env.queues) { - attemptedQueues++; - - // Attempt to dequeue from this queue - const [error, message] = await tryCatch( - this.#callDequeueMessage({ - messageQueue: queue, - }) - ); - - if (error) { - this.logger.error( - `[dequeueMessageInSharedQueue][${this.name}] Failed to dequeue from queue ${queue}`, - { - error, - } - ); - } - - if (message) { - messages.push(message); - } - - // If we've reached maxCount, we don't want to look at this env anymore - if (messages.length >= maxCount) { - break; - } - } + const dequeuedMessage = await this.#callDequeueMessageFromWorkerQueue({ + workerQueue, + }); - // If we've reached maxCount, we're completely done - if (messages.length >= maxCount) { - break; - } + if (!dequeuedMessage) { + return; } span.setAttributes({ - [SemanticAttributes.RESULT_COUNT]: messages.length, - [SemanticAttributes.MASTER_QUEUES]: masterQueue, - attempted_environments: attemptedEnvs, - attempted_queues: attemptedQueues, + [SemanticAttributes.QUEUE]: 
dequeuedMessage.message.queue, + [SemanticAttributes.RUN_ID]: dequeuedMessage.messageId, + [SemanticAttributes.CONCURRENCY_KEY]: dequeuedMessage.message.concurrencyKey, + ...flattenAttributes(dequeuedMessage.message, "message"), }); - return messages; + return dequeuedMessage; }, { kind: SpanKind.CONSUMER, attributes: { [SEMATTRS_MESSAGING_OPERATION]: "receive", [SEMATTRS_MESSAGING_SYSTEM]: "runqueue", + [SemanticAttributes.WORKER_QUEUE]: workerQueue, + [SemanticAttributes.CONSUMER_ID]: consumerId, }, } ); @@ -461,7 +548,11 @@ export class RunQueue { * This is done when the run is in a final state. * @param messageId */ - public async acknowledgeMessage(orgId: string, messageId: string) { + public async acknowledgeMessage( + orgId: string, + messageId: string, + options?: { skipDequeueProcessing?: boolean; removeFromWorkerQueue?: boolean } + ) { return this.#trace( "acknowledgeMessage", async (span) => { @@ -479,8 +570,24 @@ export class RunQueue { [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey, }); + if (!options?.skipDequeueProcessing) { + // This will move the message to the worker queue so it can be dequeued + await this.worker.enqueueOnce({ + id: message.queue, // dedupe by environment, queue, and concurrency key + job: "processQueueForWorkerQueue", + payload: { + queueKey: message.queue, + environmentId: message.environmentId, + }, + // Add a small delay to dedupe messages so at most one of these will processed, + // every 500ms per queue, concurrency key, and environment + availableAt: new Date(Date.now() + (this.options.processWorkerQueueDebounceMs ?? 
500)), // 500ms from now + }); + } + await this.#callAcknowledgeMessage({ message, + removeFromWorkerQueue: options?.removeFromWorkerQueue, }); }, { @@ -503,11 +610,13 @@ export class RunQueue { messageId, retryAt, incrementAttemptCount = true, + skipDequeueProcessing = false, }: { orgId: string; messageId: string; retryAt?: number; incrementAttemptCount?: boolean; + skipDequeueProcessing?: boolean; }) { return this.#trace( "nackMessage", @@ -530,7 +639,7 @@ export class RunQueue { [SemanticAttributes.QUEUE]: message.queue, [SemanticAttributes.RUN_ID]: messageId, [SemanticAttributes.CONCURRENCY_KEY]: message.concurrencyKey, - [SemanticAttributes.MASTER_QUEUES]: message.masterQueues.join(","), + [SemanticAttributes.WORKER_QUEUE]: this.#getWorkerQueueFromMessage(message), }); if (incrementAttemptCount) { @@ -541,6 +650,21 @@ export class RunQueue { } } + if (!skipDequeueProcessing) { + // This will move the message to the worker queue so it can be dequeued + await this.worker.enqueueOnce({ + id: message.queue, // dedupe by environment, queue, and concurrency key + job: "processQueueForWorkerQueue", + payload: { + queueKey: message.queue, + environmentId: message.environmentId, + }, + // Add a small delay to dedupe messages so at most one of these will processed, + // every 500ms per queue, concurrency key, and environment + availableAt: new Date(Date.now() + (this.options.processWorkerQueueDebounceMs ?? 
500)), // 500ms from now + }); + } + await this.#callNackMessage({ message, retryAt }); return true; @@ -679,10 +803,16 @@ export class RunQueue { } public async removeEnvironmentQueuesFromMasterQueue( - masterQueue: string, + runtimeEnvironmentId: string, organizationId: string, projectId: string ) { + // Calculate the master queue shard for this environment + const masterQueue = this.keys.masterQueueKeyForEnvironment( + runtimeEnvironmentId, + this.shardCount + ); + // Use scanStream to find all matching members const stream = this.redis.zscanStream(masterQueue, { match: this.keys.queueKey(organizationId, projectId, "*", "*"), @@ -712,11 +842,27 @@ export class RunQueue { } async quit() { - await this.subscriber.unsubscribe(); - await this.subscriber.quit(); + this.abortController.abort(); + + await Promise.all([ + this.subscriber.unsubscribe(), + this.luaDebugSubscriber.unsubscribe(), + this.subscriber.quit(), + this.luaDebugSubscriber.quit(), + this.worker.stop(), + ]); + await this.redis.quit(); } + /** + * Peek all messages on a worker queue (useful for tests or debugging) + */ + async peekAllOnWorkerQueue(workerQueue: string) { + const workerQueueKey = this.keys.workerQueueKey(workerQueue); + return await this.redis.lrange(workerQueueKey, 0, -1); + } + private async handleRedriveMessage(channel: string, message: string) { try { const { runId, envId, projectId, orgId } = JSON.parse(message) as any; @@ -757,7 +903,7 @@ export class RunQueue { ...data, attempt: 0, }, - masterQueues: data.masterQueues, + workerQueue: this.#getWorkerQueueFromMessage(data), }); //remove from the dlq @@ -825,19 +971,299 @@ export class RunQueue { this.subscriber.on("message", this.handleRedriveMessage.bind(this)); } - async #callEnqueueMessage(message: OutputPayload, masterQueues: string[]) { + /** + * Debug lua scripts by publishing to this channel + * + * @example + * + * ```lua + * redis.call("PUBLISH", "runqueue:lua:debug", "workerQueueKey: " .. workerQueueKey .. 
" messageKeyValue -> " .. tostring(messageKeyValue)) + * ``` + */ + async #setupLuaLogSubscriber() { + this.luaDebugSubscriber.subscribe("runqueue:lua:debug", (err) => { + if (err) { + this.logger.error(`Failed to subscribe to runqueue:lua:debug`, { error: err }); + } else { + this.logger.log(`Subscribed to runqueue:lua:debug`); + } + }); + + this.luaDebugSubscriber.on("message", (_channel, msg) => { + this.logger.debug("runqueue lua debug", { msg }); + }); + } + + #startMasterQueueConsumers() { + if (this.options.masterQueueConsumersDisabled) { + this.logger.debug("Master queue consumers disabled"); + + return; + } + + for (let i = 0; i < this.shardCount; i++) { + this.logger.debug(`Starting master queue consumer ${i}`); + // We will start a consumer for each shard + this.#startMasterQueueConsumer(i).catch((err) => { + this.logger.error(`Failed to start master queue consumer ${i}`, { error: err }); + }); + } + + this.logger.debug(`Started ${this.shardCount} master queue consumers`); + } + + async #startMasterQueueConsumer(shard: number) { + let lastProcessedAt = Date.now(); + let processedCount = 0; + + const consumerId = nanoid(); + + try { + for await (const _ of setInterval(this.options.masterQueueConsumersIntervalMs ?? 
500, null, { + signal: this.abortController.signal, + })) { + this.logger.verbose(`Processing master queue shard ${shard}`, { + processedCount, + lastProcessedAt, + service: this.name, + shard, + consumerId, + }); + + const now = performance.now(); + + const [error, results] = await tryCatch(this.#processMasterQueueShard(shard, consumerId)); + + if (error) { + this.logger.error(`Failed to process master queue shard ${shard}`, { + error, + service: this.name, + shard, + consumerId, + }); + + continue; + } + + const duration = performance.now() - now; + + this.logger.verbose(`Processed master queue shard ${shard} in ${duration}ms`, { + processedCount, + lastProcessedAt, + service: this.name, + shard, + duration, + results, + consumerId, + }); + + processedCount++; + lastProcessedAt = Date.now(); + } + } catch (error) { + if (error instanceof Error && error.name !== "AbortError") { + throw error; + } + + this.logger.debug(`Master queue consumer ${shard} stopped`, { + service: this.name, + shard, + processedCount, + lastProcessedAt, + }); + } + } + + async migrateLegacyMasterQueue(legacyMasterQueue: string) { + const legacyMasterQueueKey = this.keys.legacyMasterQueueKey(legacyMasterQueue); + + this.logger.debug("Migrating legacy master queue", { + legacyMasterQueueKey, + service: this.name, + }); + + // Get all items from the legacy master queue + const queueNames = await this.redis.zrange(legacyMasterQueueKey, 0, -1); + + this.logger.debug("Found items in legacy master queue", { + queueNames, + service: this.name, + }); + + // We need to group the items by the new masterQueueKey, so we need to extract out the environmentId from the queue name and calculate the shard + const queuesByMasterQueueKey = new Map(); + + for (const queueName of queueNames) { + const environmentId = this.keys.envIdFromQueue(queueName); + const shard = this.keys.masterQueueShardForEnvironment(environmentId, this.shardCount); + const masterQueueKey = this.keys.masterQueueKeyForShard(shard); + 
queuesByMasterQueueKey.set(masterQueueKey, [ + ...(queuesByMasterQueueKey.get(masterQueueKey) ?? []), + queueName, + ]); + } + + this.logger.debug("Grouping items by new master queue key", { + queuesByMasterQueueKey: Object.fromEntries(queuesByMasterQueueKey.entries()), + service: this.name, + }); + + const pipeline = this.redis.pipeline(); + + for (const [masterQueueKey, queueNames] of queuesByMasterQueueKey) { + pipeline.migrateLegacyMasterQueues( + masterQueueKey, + this.options.redis.keyPrefix ?? "", + ...queueNames + ); + } + + await pipeline.exec(); + + this.logger.debug("Migrated legacy master queue", { + legacyMasterQueueKey, + service: this.name, + }); + } + + // This is used for test purposes only + async processMasterQueueForEnvironment(environmentId: string, maxCount: number = 10) { + const shard = this.keys.masterQueueShardForEnvironment(environmentId, this.shardCount); + + return this.#processMasterQueueShard(shard, environmentId, maxCount); + } + + async #processMasterQueueShard(shard: number, consumerId: string, maxCount: number = 10) { + return this.#trace( + "processMasterQueueShard", + async (span) => { + const masterQueueKey = this.keys.masterQueueKeyForShard(shard); + + const envQueues = await this.queueSelectionStrategy.distributeFairQueuesFromParentQueue( + masterQueueKey, + consumerId + ); + + span.setAttribute("environment_count", envQueues.length); + + if (envQueues.length === 0) { + return []; + } + + let attemptedEnvs = 0; + let attemptedQueues = 0; + + for (const env of envQueues) { + attemptedEnvs++; + + for (const queue of env.queues) { + attemptedQueues++; + + // Attempt to dequeue from this queue + const [error, messages] = await tryCatch( + this.#callDequeueMessagesFromQueue({ + messageQueue: queue, + shard, + // TODO: make this configurable + maxCount, + }) + ); + + if (error) { + this.logger.error( + `[processMasterQueueShard][${this.name}] Failed to dequeue from queue ${queue}`, + { + error, + } + ); + + continue; + } + + if 
(messages.length === 0) { + continue; + } + + await this.#enqueueMessagesToWorkerQueues(messages); + } + } + }, + { + kind: SpanKind.CONSUMER, + attributes: { + [SEMATTRS_MESSAGING_OPERATION]: "receive", + [SEMATTRS_MESSAGING_SYSTEM]: "runqueue", + }, + } + ); + } + + async #processQueueForWorkerQueue(queueKey: string, environmentId: string) { + const shard = this.keys.masterQueueShardForEnvironment(environmentId, this.shardCount); + + this.logger.debug("processQueueForWorkerQueue", { + queueKey, + shard, + service: this.name, + }); + + const messages = await this.#callDequeueMessagesFromQueue({ + messageQueue: queueKey, + shard, + maxCount: 10, + }); + + await this.#enqueueMessagesToWorkerQueues(messages); + } + + async #enqueueMessagesToWorkerQueues(messages: DequeuedMessage[]) { + await this.#trace("enqueueMessagesToWorkerQueues", async (span) => { + span.setAttribute("message_count", messages.length); + + const pipeline = this.redis.pipeline(); + + const workerQueueKeys = new Set(); + + for (const message of messages) { + const workerQueueKey = this.keys.workerQueueKey( + this.#getWorkerQueueFromMessage(message.message) + ); + + workerQueueKeys.add(workerQueueKey); + + const messageKeyValue = this.keys.messageKey(message.message.orgId, message.messageId); + + pipeline.rpush(workerQueueKey, messageKeyValue); + } + + span.setAttribute("worker_queue_count", workerQueueKeys.size); + span.setAttribute("worker_queue_keys", Array.from(workerQueueKeys)); + + this.logger.debug("enqueueMessagesToWorkerQueues pipeline", { + service: this.name, + messages, + workerQueueKeys: Array.from(workerQueueKeys), + }); + + await pipeline.exec(); + }); + } + + async #callEnqueueMessage(message: OutputPayloadV2) { const queueKey = message.queue; const messageKey = this.keys.messageKey(message.orgId, message.runId); const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); const envCurrentConcurrencyKey = 
this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); + const masterQueueKey = this.keys.masterQueueKeyForEnvironment( + message.environmentId, + this.shardCount + ); const queueName = message.queue; const messageId = message.runId; const messageData = JSON.stringify(message); const messageScore = String(message.timestamp); - const $masterQueues = JSON.stringify(masterQueues); - const keyPrefix = this.options.redis.keyPrefix ?? ""; this.logger.debug("Calling enqueueMessage", { queueKey, @@ -849,11 +1275,12 @@ export class RunQueue { messageId, messageData, messageScore, - masterQueues: $masterQueues, + masterQueueKey, service: this.name, }); await this.redis.enqueueMessage( + masterQueueKey, queueKey, messageKey, queueCurrentConcurrencyKey, @@ -862,25 +1289,28 @@ export class RunQueue { queueName, messageId, messageData, - messageScore, - $masterQueues, - keyPrefix + messageScore ); } - async #callDequeueMessage({ + async #callDequeueMessagesFromQueue({ messageQueue, + shard, + maxCount, }: { messageQueue: string; - }): Promise { + shard: number; + maxCount: number; + }): Promise { const queueConcurrencyLimitKey = this.keys.concurrencyLimitKeyFromQueue(messageQueue); const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(messageQueue); const envConcurrencyLimitKey = this.keys.envConcurrencyLimitKeyFromQueue(messageQueue); const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(messageQueue); const messageKeyPrefix = this.keys.messageKeyPrefixFromQueue(messageQueue); const envQueueKey = this.keys.envQueueKeyFromQueue(messageQueue); + const masterQueueKey = this.keys.masterQueueKeyForShard(shard); - this.logger.debug("#callDequeueMessage", { + this.logger.debug("#callDequeueMessagesFromQueue", { messageQueue, queueConcurrencyLimitKey, envConcurrencyLimitKey, @@ -888,9 +1318,12 @@ export class RunQueue { envCurrentConcurrencyKey, messageKeyPrefix, 
envQueueKey, + masterQueueKey, + shard, + maxCount, }); - const result = await this.redis.dequeueMessage( + const result = await this.redis.dequeueMessagesFromQueue( //keys messageQueue, queueConcurrencyLimitKey, @@ -899,31 +1332,122 @@ export class RunQueue { envCurrentConcurrencyKey, messageKeyPrefix, envQueueKey, + masterQueueKey, //args messageQueue, String(Date.now()), String(this.options.defaultEnvConcurrency), - this.options.redis.keyPrefix ?? "" + this.options.redis.keyPrefix ?? "", + String(maxCount) + ); + + if (!result) { + return []; + } + + this.logger.debug("dequeueMessagesFromQueue raw result", { + result, + service: this.name, + }); + + const messages = []; + for (let i = 0; i < result.length; i += 3) { + const messageId = result[i]; + const messageScore = result[i + 1]; + const rawMessage = result[i + 2]; + + //read message + const parsedMessage = OutputPayload.safeParse(JSON.parse(rawMessage)); + if (!parsedMessage.success) { + this.logger.error(`[${this.name}] Failed to parse message`, { + messageId, + error: parsedMessage.error, + service: this.name, + }); + + continue; + } + + const message = parsedMessage.data; + + messages.push({ + messageId, + messageScore, + message, + }); + } + + this.logger.debug("dequeueMessagesFromQueue parsed result", { + messages, + service: this.name, + }); + + return messages.filter(Boolean) as DequeuedMessage[]; + } + + async #callDequeueMessageFromWorkerQueue({ + workerQueue, + }: { + workerQueue: string; + }): Promise { + const workerQueueKey = this.keys.workerQueueKey(workerQueue); + + this.logger.debug("#callDequeueMessageFromWorkerQueue", { + workerQueue, + workerQueueKey, + }); + + if (this.abortController.signal.aborted) { + return; + } + + const blockingClient = this.#createBlockingDequeueClient(); + + async function cleanup() { + await blockingClient.quit(); + } + + this.abortController.signal.addEventListener("abort", cleanup); + + const result = await blockingClient.dequeueMessageFromWorkerQueue( + 
//keys + workerQueueKey, + //args + this.options.redis.keyPrefix ?? "", + String(this.options.dequeueBlockingTimeoutSeconds ?? 10) ); + this.abortController.signal.removeEventListener("abort", cleanup); + + await cleanup(); + if (!result) { return; } - this.logger.debug("Dequeue message result", { + this.logger.debug("dequeueMessageFromWorkerQueue raw result", { result, service: this.name, }); - if (result.length !== 3) { - this.logger.error("Invalid dequeue message result", { + if (result.length !== 2) { + this.logger.error("Invalid dequeue message from worker queue result", { result, service: this.name, }); return; } - const [messageId, messageScore, rawMessage] = result; + // Make sure they are both strings + if (typeof result[0] !== "string" || typeof result[1] !== "string") { + this.logger.error("Invalid dequeue message from worker queue result", { + result, + service: this.name, + }); + return; + } + + const [messageId, rawMessage] = result; //read message const parsedMessage = OutputPayload.safeParse(JSON.parse(rawMessage)); @@ -941,19 +1465,31 @@ export class RunQueue { return { messageId, - messageScore, + messageScore: String(message.timestamp), message, }; } - async #callAcknowledgeMessage({ message }: { message: OutputPayload }) { + async #callAcknowledgeMessage({ + message, + removeFromWorkerQueue, + }: { + message: OutputPayload; + removeFromWorkerQueue?: boolean; + }) { const messageId = message.runId; const messageKey = this.keys.messageKey(message.orgId, messageId); const messageQueue = message.queue; const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); - const masterQueues = message.masterQueues; + const masterQueueKey = this.keys.masterQueueKeyForEnvironment( + message.environmentId, + this.shardCount + ); + const workerQueue = 
this.#getWorkerQueueFromMessage(message); + const workerQueueKey = this.keys.workerQueueKey(workerQueue); + const messageKeyValue = this.keys.messageKey(message.orgId, messageId); this.logger.debug("Calling acknowledgeMessage", { messageKey, @@ -962,20 +1498,26 @@ export class RunQueue { envCurrentConcurrencyKey, envQueueKey, messageId, - masterQueues, + masterQueueKey, + workerQueue, + workerQueueKey, + removeFromWorkerQueue, + messageKeyValue, service: this.name, }); return this.redis.acknowledgeMessage( + masterQueueKey, messageKey, messageQueue, queueCurrentConcurrencyKey, envCurrentConcurrencyKey, envQueueKey, + workerQueueKey, messageId, messageQueue, - JSON.stringify(masterQueues), - this.options.redis.keyPrefix ?? "" + messageKeyValue, + removeFromWorkerQueue ? "1" : "0" ); } @@ -986,6 +1528,10 @@ export class RunQueue { const queueCurrentConcurrencyKey = this.keys.currentConcurrencyKeyFromQueue(message.queue); const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); + const masterQueueKey = this.keys.masterQueueKeyForEnvironment( + message.environmentId, + this.shardCount + ); const nextRetryDelay = calculateNextRetryDelay(this.retryOptions, message.attempt); const messageScore = retryAt ?? (nextRetryDelay ? Date.now() + nextRetryDelay : Date.now()); @@ -993,7 +1539,7 @@ export class RunQueue { this.logger.debug("Calling nackMessage", { messageKey, messageQueue, - masterQueues: message.masterQueues, + masterQueueKey, queueCurrentConcurrencyKey, envCurrentConcurrencyKey, envQueueKey, @@ -1005,6 +1551,7 @@ export class RunQueue { await this.redis.nackMessage( //keys + masterQueueKey, messageKey, messageQueue, queueCurrentConcurrencyKey, @@ -1014,9 +1561,7 @@ export class RunQueue { messageId, messageQueue, JSON.stringify(message), - String(messageScore), - JSON.stringify(message.masterQueues), - this.options.redis.keyPrefix ?? 
"" + String(messageScore) ); } @@ -1028,8 +1573,13 @@ export class RunQueue { const envCurrentConcurrencyKey = this.keys.envCurrentConcurrencyKeyFromQueue(message.queue); const envQueueKey = this.keys.envQueueKeyFromQueue(message.queue); const deadLetterQueueKey = this.keys.deadLetterQueueKeyFromQueue(message.queue); + const masterQueueKey = this.keys.masterQueueKeyForEnvironment( + message.environmentId, + this.shardCount + ); await this.redis.moveToDeadLetterQueue( + masterQueueKey, messageKey, messageQueue, queueCurrentConcurrencyKey, @@ -1037,9 +1587,7 @@ export class RunQueue { envQueueKey, deadLetterQueueKey, messageId, - messageQueue, - JSON.stringify(message.masterQueues), - this.options.redis.keyPrefix ?? "" + messageQueue ); } @@ -1056,22 +1604,104 @@ export class RunQueue { ); } + #getWorkerQueueFromMessage(message: OutputPayload) { + if (message.version === "2") { + return message.workerQueue; + } + + // In v2, if the environment is development, the worker queue is the environment id. + if (message.environmentType === "DEVELOPMENT") { + return message.environmentId; + } + + // In v1, the master queue is something like us-nyc-3, + // which in v2 is the worker queue. + return message.masterQueues[0]; + } + + #createBlockingDequeueClient() { + const blockingClient = this.redis.duplicate(); + + blockingClient.defineCommand("dequeueMessageFromWorkerQueue", { + numberOfKeys: 1, + lua: ` +local workerQueueKey = KEYS[1] + +local keyPrefix = ARGV[1] +local timeoutInSeconds = tonumber(ARGV[2]) + +-- Attempt to dequeue using BLPOP +-- result is either nil or [queueName, messageId] +local result = redis.call('BLPOP', workerQueueKey, timeoutInSeconds) + +if not result or type(result) ~= "table" then + return nil +end + +local messageKeyValue = result[2] + +-- Get the message payload +local messageKey = keyPrefix .. 
messageKeyValue + +local messagePayload = redis.call('GET', messageKey) + +-- if the messagePayload is nil, then the message is not in the queue +if not messagePayload then + return nil +end + +-- messageKeyValue is {org:}:message: and we want to extract the messageId +local messageId = messageKeyValue:match("([^:]+)$") + +if not messageId then + return nil +end + +return {messageId, messagePayload} -- Return message details + `, + }); + + return blockingClient; + } + #registerCommands() { + this.redis.defineCommand("migrateLegacyMasterQueues", { + numberOfKeys: 1, + lua: ` +local masterQueueKey = KEYS[1] + +local keyPrefix = ARGV[1] + +for i = 2, #ARGV do + local queueName = ARGV[i] + local queueKey = keyPrefix .. queueName + + -- Rebalance the parent queues + local earliestMessage = redis.call('ZRANGE', queueKey, 0, 0, 'WITHSCORES') + + if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, queueName) + else + redis.call('ZADD', masterQueueKey, earliestMessage[2], queueName) + end +end + `, + }); + this.redis.defineCommand("enqueueMessage", { - numberOfKeys: 5, + numberOfKeys: 6, lua: ` -local queueKey = KEYS[1] -local messageKey = KEYS[2] -local queueCurrentConcurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local envQueueKey = KEYS[5] +local masterQueueKey = KEYS[1] +local queueKey = KEYS[2] +local messageKey = KEYS[3] +local queueCurrentConcurrencyKey = KEYS[4] +local envCurrentConcurrencyKey = KEYS[5] +local envQueueKey = KEYS[6] local queueName = ARGV[1] local messageId = ARGV[2] local messageData = ARGV[3] local messageScore = ARGV[4] -local parentQueues = cjson.decode(ARGV[5]) -local keyPrefix = ARGV[6] -- Write the message to the message key redis.call('SET', messageKey, messageData) @@ -1085,13 +1715,10 @@ redis.call('ZADD', envQueueKey, messageScore, messageId) -- Rebalance the parent queues local earliestMessage = redis.call('ZRANGE', queueKey, 0, 0, 'WITHSCORES') -for _, parentQueue in ipairs(parentQueues) do - local 
prefixedParentQueue = keyPrefix .. parentQueue - if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, queueName) - else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], queueName) - end +if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, queueName) +else + redis.call('ZADD', masterQueueKey, earliestMessage[2], queueName) end -- Update the concurrency keys @@ -1100,8 +1727,8 @@ redis.call('SREM', envCurrentConcurrencyKey, messageId) `, }); - this.redis.defineCommand("dequeueMessage", { - numberOfKeys: 7, + this.redis.defineCommand("dequeueMessagesFromQueue", { + numberOfKeys: 8, lua: ` local queueKey = KEYS[1] local queueConcurrencyLimitKey = KEYS[2] @@ -1110,11 +1737,13 @@ local queueCurrentConcurrencyKey = KEYS[4] local envCurrentConcurrencyKey = KEYS[5] local messageKeyPrefix = KEYS[6] local envQueueKey = KEYS[7] +local masterQueueKey = KEYS[8] local queueName = ARGV[1] local currentTime = tonumber(ARGV[2]) local defaultEnvConcurrencyLimit = ARGV[3] local keyPrefix = ARGV[4] +local maxCount = tonumber(ARGV[5] or '1') -- Check current env concurrency against the limit local envCurrentConcurrency = tonumber(redis.call('SCARD', envCurrentConcurrencyKey) or '0') @@ -1135,57 +1764,81 @@ if queueCurrentConcurrency >= totalQueueConcurrencyLimit then return nil end --- Attempt to dequeue the next message -local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, 1) +-- Calculate how many messages we can actually dequeue based on concurrency limits +local envAvailableCapacity = totalEnvConcurrencyLimit - envCurrentConcurrency +local queueAvailableCapacity = totalQueueConcurrencyLimit - queueCurrentConcurrency +local actualMaxCount = math.min(maxCount, envAvailableCapacity, queueAvailableCapacity) -if #messages == 0 then +if actualMaxCount <= 0 then return nil end -local messageId = messages[1] -local messageScore = tonumber(messages[2]) +-- Attempt to dequeue messages up to 
actualMaxCount +local messages = redis.call('ZRANGEBYSCORE', queueKey, '-inf', currentTime, 'WITHSCORES', 'LIMIT', 0, actualMaxCount) --- Get the message payload -local messageKey = messageKeyPrefix .. messageId -local messagePayload = redis.call('GET', messageKey) -local decodedPayload = cjson.decode(messagePayload); +if #messages == 0 then + return nil +end --- Update concurrency -redis.call('ZREM', queueKey, messageId) -redis.call('ZREM', envQueueKey, messageId) -redis.call('SADD', queueCurrentConcurrencyKey, messageId) -redis.call('SADD', envCurrentConcurrencyKey, messageId) +local results = {} +local dequeuedCount = 0 + +-- Process messages in pairs (messageId, score) +for i = 1, #messages, 2 do + local messageId = messages[i] + local messageScore = tonumber(messages[i + 1]) + + -- Get the message payload + local messageKey = messageKeyPrefix .. messageId + local messagePayload = redis.call('GET', messageKey) + + if messagePayload then + -- Update concurrency + redis.call('ZREM', queueKey, messageId) + redis.call('ZREM', envQueueKey, messageId) + redis.call('SADD', queueCurrentConcurrencyKey, messageId) + redis.call('SADD', envCurrentConcurrencyKey, messageId) + + -- Add to results + table.insert(results, messageId) + table.insert(results, messageScore) + table.insert(results, messagePayload) + + dequeuedCount = dequeuedCount + 1 + end +end --- Rebalance the parent queues +-- Rebalance the parent queues only once after all dequeues local earliestMessage = redis.call('ZRANGE', queueKey, 0, 0, 'WITHSCORES') -for _, parentQueue in ipairs(decodedPayload.masterQueues) do - local prefixedParentQueue = keyPrefix .. 
parentQueue - if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, queueName) - else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], queueName) - end + +if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, queueName) +else + redis.call('ZADD', masterQueueKey, earliestMessage[2], queueName) end -return {messageId, messageScore, messagePayload} -- Return message details +-- Return results as a flat array: [messageId1, messageScore1, messagePayload1, messageId2, messageScore2, messagePayload2, ...] +return results `, }); this.redis.defineCommand("acknowledgeMessage", { - numberOfKeys: 5, + numberOfKeys: 7, lua: ` -- Keys: -local messageKey = KEYS[1] -local messageQueueKey = KEYS[2] -local queueCurrentConcurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local envQueueKey = KEYS[5] +local masterQueueKey = KEYS[1] +local messageKey = KEYS[2] +local messageQueueKey = KEYS[3] +local queueCurrentConcurrencyKey = KEYS[4] +local envCurrentConcurrencyKey = KEYS[5] +local envQueueKey = KEYS[6] +local workerQueueKey = KEYS[7] -- Args: local messageId = ARGV[1] local messageQueueName = ARGV[2] -local parentQueues = cjson.decode(ARGV[3]) -local keyPrefix = ARGV[4] +local messageKeyValue = ARGV[3] +local removeFromWorkerQueue = ARGV[4] -- Remove the message from the message key redis.call('DEL', messageKey) @@ -1196,38 +1849,39 @@ redis.call('ZREM', envQueueKey, messageId) -- Rebalance the parent queues local earliestMessage = redis.call('ZRANGE', messageQueueKey, 0, 0, 'WITHSCORES') -for _, parentQueue in ipairs(parentQueues) do - local prefixedParentQueue = keyPrefix .. 
parentQueue - if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, messageQueueName) - else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueueName) - end +if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, messageQueueName) +else + redis.call('ZADD', masterQueueKey, earliestMessage[2], messageQueueName) end -- Update the concurrency keys redis.call('SREM', queueCurrentConcurrencyKey, messageId) redis.call('SREM', envCurrentConcurrencyKey, messageId) + +-- Remove the message from the worker queue +if removeFromWorkerQueue == '1' then + redis.call('LREM', workerQueueKey, 0, messageKeyValue) +end `, }); this.redis.defineCommand("nackMessage", { - numberOfKeys: 5, + numberOfKeys: 6, lua: ` -- Keys: -local messageKey = KEYS[1] -local messageQueueKey = KEYS[2] -local queueCurrentConcurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local envQueueKey = KEYS[5] +local masterQueueKey = KEYS[1] +local messageKey = KEYS[2] +local messageQueueKey = KEYS[3] +local queueCurrentConcurrencyKey = KEYS[4] +local envCurrentConcurrencyKey = KEYS[5] +local envQueueKey = KEYS[6] -- Args: local messageId = ARGV[1] local messageQueueName = ARGV[2] local messageData = ARGV[3] local messageScore = tonumber(ARGV[4]) -local parentQueues = cjson.decode(ARGV[5]) -local keyPrefix = ARGV[6] -- Update the message data redis.call('SET', messageKey, messageData) @@ -1242,33 +1896,29 @@ redis.call('ZADD', envQueueKey, messageScore, messageId) -- Rebalance the parent queues local earliestMessage = redis.call('ZRANGE', messageQueueKey, 0, 0, 'WITHSCORES') -for _, parentQueue in ipairs(parentQueues) do - local prefixedParentQueue = keyPrefix .. 
parentQueue - if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, messageQueueName) - else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueueName) - end +if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, messageQueueName) +else + redis.call('ZADD', masterQueueKey, earliestMessage[2], messageQueueName) end `, }); this.redis.defineCommand("moveToDeadLetterQueue", { - numberOfKeys: 6, + numberOfKeys: 7, lua: ` -- Keys: -local messageKey = KEYS[1] -local messageQueue = KEYS[2] -local queueCurrentConcurrencyKey = KEYS[3] -local envCurrentConcurrencyKey = KEYS[4] -local envQueueKey = KEYS[5] -local deadLetterQueueKey = KEYS[6] +local masterQueueKey = KEYS[1] +local messageKey = KEYS[2] +local messageQueue = KEYS[3] +local queueCurrentConcurrencyKey = KEYS[4] +local envCurrentConcurrencyKey = KEYS[5] +local envQueueKey = KEYS[6] +local deadLetterQueueKey = KEYS[7] -- Args: local messageId = ARGV[1] local messageQueueName = ARGV[2] -local parentQueues = cjson.decode(ARGV[3]) -local keyPrefix = ARGV[4] -- Remove the message from the queue redis.call('ZREM', messageQueue, messageId) @@ -1276,13 +1926,10 @@ redis.call('ZREM', envQueueKey, messageId) -- Rebalance the parent queues local earliestMessage = redis.call('ZRANGE', messageQueue, 0, 0, 'WITHSCORES') -for _, parentQueue in ipairs(parentQueues) do - local prefixedParentQueue = keyPrefix .. 
parentQueue - if #earliestMessage == 0 then - redis.call('ZREM', prefixedParentQueue, messageQueueName) - else - redis.call('ZADD', prefixedParentQueue, earliestMessage[2], messageQueueName) - end +if #earliestMessage == 0 then + redis.call('ZREM', masterQueueKey, messageQueueName) +else + redis.call('ZADD', masterQueueKey, earliestMessage[2], messageQueueName) end -- Add the message to the dead letter queue @@ -1393,6 +2040,7 @@ declare module "@internal/redis" { interface RedisCommander { enqueueMessage( //keys + masterQueueKey: string, queue: string, messageKey: string, queueCurrentConcurrencyKey: string, @@ -1403,12 +2051,10 @@ declare module "@internal/redis" { messageId: string, messageData: string, messageScore: string, - parentQueues: string, - keyPrefix: string, callback?: Callback ): Result; - dequeueMessage( + dequeueMessagesFromQueue( //keys childQueue: string, queueConcurrencyLimitKey: string, @@ -1417,28 +2063,42 @@ declare module "@internal/redis" { envCurrentConcurrencyKey: string, messageKeyPrefix: string, envQueueKey: string, + masterQueueKey: string, //args childQueueName: string, currentTime: string, defaultEnvConcurrencyLimit: string, keyPrefix: string, + maxCount: string, + callback?: Callback + ): Result; + + dequeueMessageFromWorkerQueue( + // keys + workerQueueKey: string, + // args + keyPrefix: string, + timeoutInSeconds: string, callback?: Callback<[string, string]> - ): Result<[string, string, string] | null, Context>; + ): Result<[string, string] | null, Context>; acknowledgeMessage( + masterQueueKey: string, messageKey: string, messageQueue: string, concurrencyKey: string, envConcurrencyKey: string, envQueueKey: string, + workerQueueKey: string, messageId: string, messageQueueName: string, - masterQueues: string, - keyPrefix: string, + messageKeyValue: string, + removeFromWorkerQueue: string, callback?: Callback ): Result; nackMessage( + masterQueueKey: string, messageKey: string, messageQueue: string, queueCurrentConcurrencyKey: 
string, @@ -1448,12 +2108,11 @@ declare module "@internal/redis" { messageQueueName: string, messageData: string, messageScore: string, - masterQueues: string, - keyPrefix: string, callback?: Callback ): Result; moveToDeadLetterQueue( + masterQueueKey: string, messageKey: string, messageQueue: string, queueCurrentConcurrencyKey: string, @@ -1462,8 +2121,6 @@ declare module "@internal/redis" { deadLetterQueueKey: string, messageId: string, messageQueueName: string, - masterQueues: string, - keyPrefix: string, callback?: Callback ): Result; @@ -1495,5 +2152,11 @@ declare module "@internal/redis" { envConcurrencyLimit: string, callback?: Callback ): Result; + + migrateLegacyMasterQueues( + masterQueueKey: string, + keyPrefix: string, + ...queueNames: string[] + ): Result; } } diff --git a/internal-packages/run-engine/src/run-queue/keyProducer.ts b/internal-packages/run-engine/src/run-queue/keyProducer.ts index 6c840bd212..49e165ad90 100644 --- a/internal-packages/run-engine/src/run-queue/keyProducer.ts +++ b/internal-packages/run-engine/src/run-queue/keyProducer.ts @@ -1,5 +1,6 @@ import { MinimalAuthenticatedEnvironment } from "../shared/index.js"; import { EnvDescriptor, QueueDescriptor, RunQueueKeyProducer } from "./types.js"; +import { jumpHash } from "@trigger.dev/core/v3/serverOnly"; const constants = { CURRENT_CONCURRENCY_PART: "currentConcurrency", @@ -13,9 +14,33 @@ const constants = { TASK_PART: "task", MESSAGE_PART: "message", DEAD_LETTER_QUEUE_PART: "deadLetter", + MASTER_QUEUE_PART: "masterQueue", + WORKER_QUEUE_PART: "workerQueue", } as const; export class RunQueueFullKeyProducer implements RunQueueKeyProducer { + legacyMasterQueueKey(masterQueueName: string): string { + return masterQueueName; + } + + masterQueueKeyForEnvironment(envId: string, shardCount: number): string { + const shard = this.masterQueueShardForEnvironment(envId, shardCount); + + return this.masterQueueKeyForShard(shard); + } + + masterQueueKeyForShard(shard: number): string { + 
return [constants.MASTER_QUEUE_PART, "shard", shard.toString()].join(":"); + } + + masterQueueShardForEnvironment(envId: string, shardCount: number): number { + return jumpHash(envId, shardCount); + } + + workerQueueKey(workerQueue: string): string { + return [constants.WORKER_QUEUE_PART, workerQueue].join(":"); + } + queueConcurrencyLimitKey(env: MinimalAuthenticatedEnvironment, queue: string) { return [this.queueKey(env, queue), constants.CONCURRENCY_LIMIT_PART].join(":"); } diff --git a/internal-packages/run-engine/src/run-queue/tests/ack.test.ts b/internal-packages/run-engine/src/run-queue/tests/ack.test.ts index 8fc6da7dd7..f04358ecb6 100644 --- a/internal-packages/run-engine/src/run-queue/tests/ack.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/ack.test.ts @@ -1,7 +1,8 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { describe } from "node:test"; +import { setTimeout } from "node:timers/promises"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; import { RunQueue } from "../index.js"; import { RunQueueFullKeyProducer } from "../keyProducer.js"; @@ -65,17 +66,20 @@ describe("RunQueue.acknowledgeMessage", () => { }); try { - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue and dequeue a message to get it into processing await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); - const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); - expect(dequeued.length).toBe(1); + await setTimeout(1000); + + const dequeued = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvDev.id + ); + assertNonNullable(dequeued); // Verify concurrency is set const 
queueConcurrency = await queue.currentConcurrencyOfQueue( @@ -123,13 +127,11 @@ describe("RunQueue.acknowledgeMessage", () => { }); try { - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue message await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); // Verify queue lengths @@ -139,9 +141,14 @@ describe("RunQueue.acknowledgeMessage", () => { const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); expect(envQueueLength).toBe(1); + await setTimeout(1000); + // Dequeue the message - const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); - expect(dequeued.length).toBe(1); + const dequeued = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvDev.id + ); + assertNonNullable(dequeued); // Verify queue is empty after dequeue const queueLengthAfterDequeue = await queue.lengthOfQueue( diff --git a/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts b/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromWorkerQueue.test.ts similarity index 78% rename from internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts rename to internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromWorkerQueue.test.ts index 846ad5f308..4d1a038a78 100644 --- a/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromMasterQueue.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/dequeueMessageFromWorkerQueue.test.ts @@ -1,10 +1,11 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { describe } from "node:test"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; import { RunQueue } from "../index.js"; import { 
RunQueueFullKeyProducer } from "../keyProducer.js"; import { InputPayload } from "../types.js"; +import { setTimeout } from "node:timers/promises"; const testOptions = { name: "rq", @@ -43,8 +44,8 @@ const messageDev: InputPayload = { vi.setConfig({ testTimeout: 60_000 }); -describe("RunQueue.dequeueMessageFromMasterQueue", () => { - redisTest("dequeuing a message from a master queue", async ({ redisContainer }) => { +describe("RunQueue.dequeueMessageFromWorkerQueue", () => { + redisTest("dequeuing a message from a worker queue", async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, queueSelectionStrategy: new FairQueueSelectionStrategy({ @@ -73,13 +74,11 @@ describe("RunQueue.dequeueMessageFromMasterQueue", () => { const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); expect(oldestScore).toBe(undefined); - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - //enqueue message await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: "main", }); //queue length @@ -103,12 +102,18 @@ describe("RunQueue.dequeueMessageFromMasterQueue", () => { const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency).toBe(0); - const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); - expect(dequeued.length).toBe(1); - expect(dequeued[0].messageId).toEqual(messageDev.runId); - expect(dequeued[0].message.orgId).toEqual(messageDev.orgId); - expect(dequeued[0].message.version).toEqual("1"); - expect(dequeued[0].message.masterQueues).toEqual(["main", envMasterQueue]); + await setTimeout(1000); + + const dequeued = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued).toBeDefined(); + assertNonNullable(dequeued); + expect(dequeued.messageId).toEqual(messageDev.runId); + expect(dequeued.message.orgId).toEqual(messageDev.orgId); + 
expect(dequeued.message.version).toEqual("2"); + + const workerQueue = + dequeued.message.version === "2" ? dequeued.message.workerQueue : undefined; + expect(workerQueue).toEqual("main"); //concurrencies const queueConcurrencyAfter = await queue.currentConcurrencyOfQueue( @@ -157,37 +162,31 @@ describe("RunQueue.dequeueMessageFromMasterQueue", () => { maximumConcurrencyLimit: 1, }); - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue first message await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: "main", }); // Dequeue first message to occupy the concurrency - const dequeued1 = await queue.dequeueMessageFromMasterQueue( - "test_12345", - envMasterQueue, - 10 - ); - expect(dequeued1.length).toBe(1); + await setTimeout(1000); + + const dequeued1 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + assertNonNullable(dequeued1); // Enqueue second message await queue.enqueueMessage({ env: authenticatedEnvDev, message: { ...messageDev, runId: "r4322" }, - masterQueues: ["main", envMasterQueue], + workerQueue: "main", }); + await setTimeout(1000); + // Try to dequeue second message - const dequeued2 = await queue.dequeueMessageFromMasterQueue( - "test_12345", - envMasterQueue, - 10 - ); - expect(dequeued2.length).toBe(0); + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued2).toBeUndefined(); const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency).toBe(1); @@ -221,36 +220,29 @@ describe("RunQueue.dequeueMessageFromMasterQueue", () => { // Set queue concurrency limit to 1 await queue.updateQueueConcurrencyLimits(authenticatedEnvDev, messageDev.queue, 1); - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue two messages await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], 
+ workerQueue: "main", }); await queue.enqueueMessage({ env: authenticatedEnvDev, message: { ...messageDev, runId: "r4322" }, - masterQueues: ["main", envMasterQueue], + workerQueue: "main", }); + await setTimeout(1000); + // Dequeue first message - const dequeued1 = await queue.dequeueMessageFromMasterQueue( - "test_12345", - envMasterQueue, - 10 - ); - expect(dequeued1.length).toBe(1); + const dequeued1 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued1).toBeDefined(); + assertNonNullable(dequeued1); // Try to dequeue second message - const dequeued2 = await queue.dequeueMessageFromMasterQueue( - "test_12345", - envMasterQueue, - 10 - ); - expect(dequeued2.length).toBe(0); + const dequeued2 = await queue.dequeueMessageFromWorkerQueue("test_12345", "main"); + expect(dequeued2).toBeUndefined(); const queueConcurrency = await queue.currentConcurrencyOfQueue( authenticatedEnvDev, diff --git a/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts b/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts index 573ac1485b..907f99fe4f 100644 --- a/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/enqueueMessage.test.ts @@ -1,7 +1,8 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { describe } from "node:test"; +import { setTimeout } from "node:timers/promises"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; import { RunQueue } from "../index.js"; import { RunQueueFullKeyProducer } from "../keyProducer.js"; @@ -75,13 +76,11 @@ describe("RunQueue.enqueueMessage", () => { const oldestScore = await queue.oldestMessageInQueue(authenticatedEnvDev, messageDev.queue); expect(oldestScore).toBe(undefined); - const 
envMasterQueue = `env:${authenticatedEnvDev.id}`; - //enqueue message const enqueueResult = await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); expect(enqueueResult).toBe(undefined); @@ -106,6 +105,21 @@ describe("RunQueue.enqueueMessage", () => { const envConcurrency = await queue.currentConcurrencyOfEnvironment(authenticatedEnvDev); expect(envConcurrency).toBe(0); + + await setTimeout(1000); + + //dequeue message + const dequeued = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvDev.id + ); + assertNonNullable(dequeued); + expect(dequeued.messageId).toEqual(messageDev.runId); + expect(dequeued.message.orgId).toEqual(messageDev.orgId); + expect(dequeued.message.version).toEqual("2"); + const workerQueue = + dequeued.message.version == "2" ? dequeued.message.workerQueue : undefined; + expect(workerQueue).toEqual(authenticatedEnvDev.id); } finally { await queue.quit(); } diff --git a/internal-packages/run-engine/src/run-queue/tests/migrateLegacyMasterQueue.test.ts b/internal-packages/run-engine/src/run-queue/tests/migrateLegacyMasterQueue.test.ts new file mode 100644 index 0000000000..b16ad9e70c --- /dev/null +++ b/internal-packages/run-engine/src/run-queue/tests/migrateLegacyMasterQueue.test.ts @@ -0,0 +1,131 @@ +import { createRedisClient } from "@internal/redis"; +import { redisTest } from "@internal/testcontainers"; +import { trace } from "@internal/tracing"; +import { Logger } from "@trigger.dev/core/logger"; +import { describe } from "node:test"; +import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; +import { RunQueue } from "../index.js"; +import { RunQueueFullKeyProducer } from "../keyProducer.js"; + +const testOptions = { + name: "rq", + tracer: trace.getTracer("rq"), + workers: 1, + defaultEnvConcurrency: 25, + logger: new Logger("RunQueue", "debug"), + retryOptions: { + maxAttempts: 5, + 
factor: 1.1, + minTimeoutInMs: 100, + maxTimeoutInMs: 1_000, + randomize: true, + }, + keys: new RunQueueFullKeyProducer(), + shardCount: 2, +}; + +vi.setConfig({ testTimeout: 60_000 }); + +describe("RunQueue.migrateLegacyMasterQueue", () => { + redisTest( + "should migrate the legacy master queue to the new master queues", + async ({ redisContainer, redisOptions }) => { + const queue = new RunQueue({ + ...testOptions, + queueSelectionStrategy: new FairQueueSelectionStrategy({ + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + keys: testOptions.keys, + }), + redis: { + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }, + }); + + // We need to create a legacy master queue and fill it with some queues that have their own sorted sets with some messages + const legacyMasterQueue = "legacy-master-queue"; + const legacyMasterQueueKey = testOptions.keys.legacyMasterQueueKey(legacyMasterQueue); + + const redis = createRedisClient({ + keyPrefix: "runqueue:test:", + host: redisContainer.getHost(), + port: redisContainer.getPort(), + }); + + const queue1 = testOptions.keys.queueKey("org1", "project1", "env1", "queue1"); + const queue2 = testOptions.keys.queueKey("org1", "project1", "env1", "queue2"); + const queue3 = testOptions.keys.queueKey("org1", "project1", "env2", "queue3"); + const queue4 = testOptions.keys.queueKey("org1", "project1", "env2", "queue4"); + const queue5 = testOptions.keys.queueKey("org1", "project1", "env3", "queue5"); + const queue6 = testOptions.keys.queueKey("org1", "project1", "env3", "queue6"); + const queue7 = testOptions.keys.queueKey("org1", "project1", "env4", "queue7"); + const queue8 = testOptions.keys.queueKey("org1", "project1", "env4", "queue8"); + + await redis.zadd(legacyMasterQueueKey, 0, queue1); + await redis.zadd(legacyMasterQueueKey, 0, queue2); + await redis.zadd(legacyMasterQueueKey, 0, queue3); + await 
redis.zadd(legacyMasterQueueKey, 0, queue4); + await redis.zadd(legacyMasterQueueKey, 0, queue5); + await redis.zadd(legacyMasterQueueKey, 0, queue6); + await redis.zadd(legacyMasterQueueKey, 0, queue7); + await redis.zadd(legacyMasterQueueKey, 0, queue8); + + // Add messages to the queue with various hardcoded unix epoch timestamps + await redis.zadd(queue1, 1717334000, "message1"); + await redis.zadd(queue1, 1717334001, "message2"); + + await redis.zadd(queue2, 1717334002, "message3"); + + await redis.zadd(queue3, 1717334003, "message4"); + + await redis.zadd(queue4, 1717334004, "message5"); + + await redis.zadd(queue5, 1717334005, "message6"); + + await redis.zadd(queue6, 1717334006, "message7"); + + await redis.zadd(queue7, 1717334400, "message7"); + + // queue8 has no messages, even though it's in the legacy master queue + + await queue.migrateLegacyMasterQueue(legacyMasterQueue); + + // Inspect the new master queues + const shard1MasterQueueKey = testOptions.keys.masterQueueKeyForShard(0); + const shard2MasterQueueKey = testOptions.keys.masterQueueKeyForShard(1); + + // The legacy master queue should be empty + const shard1Queues = await redis.zrange(shard1MasterQueueKey, 0, -1, "WITHSCORES"); + + expect(shard1Queues).toEqual([ + "{org:org1}:proj:project1:env:env1:queue:queue1", + "1717334000", + "{org:org1}:proj:project1:env:env1:queue:queue2", + "1717334002", + "{org:org1}:proj:project1:env:env2:queue:queue3", + "1717334003", + "{org:org1}:proj:project1:env:env2:queue:queue4", + "1717334004", + ]); + + const shard2Queues = await redis.zrange(shard2MasterQueueKey, 0, -1, "WITHSCORES"); + + expect(shard2Queues).toEqual([ + "{org:org1}:proj:project1:env:env3:queue:queue5", + "1717334005", + "{org:org1}:proj:project1:env:env3:queue:queue6", + "1717334006", + "{org:org1}:proj:project1:env:env4:queue:queue7", + "1717334400", + ]); + + await queue.quit(); + await redis.quit(); + } + ); +}); diff --git a/internal-packages/run-engine/src/run-queue/tests/nack.test.ts 
b/internal-packages/run-engine/src/run-queue/tests/nack.test.ts index 4b1e832023..fe1623e376 100644 --- a/internal-packages/run-engine/src/run-queue/tests/nack.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/nack.test.ts @@ -1,6 +1,5 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; -import { Logger } from "@trigger.dev/core/logger"; import { describe } from "node:test"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; import { RunQueue } from "../index.js"; @@ -13,7 +12,6 @@ const testOptions = { tracer: trace.getTracer("rq"), workers: 1, defaultEnvConcurrency: 25, - logger: new Logger("RunQueue", "warn"), retryOptions: { maxAttempts: 5, factor: 1.1, @@ -66,18 +64,21 @@ describe("RunQueue.nackMessage", () => { }); try { - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue message with reserve concurrency await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); + await setTimeout(1000); + // Dequeue message - const dequeued = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); - expect(dequeued.length).toBe(1); + const dequeued = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvDev.id + ); + assertNonNullable(dequeued); // Verify current concurrency is set and reserve is cleared const queueCurrentConcurrency = await queue.currentConcurrencyOfQueue( @@ -116,11 +117,14 @@ describe("RunQueue.nackMessage", () => { expect(message?.attempt).toBe(1); //we need to wait because the default wait is 1 second - await setTimeout(300); + await setTimeout(1000); // Now we should be able to dequeue it again - const dequeued2 = await queue.dequeueMessageFromMasterQueue("test_12345", envMasterQueue, 10); - expect(dequeued2.length).toBe(1); + 
const dequeued2 = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvDev.id + ); + assertNonNullable(dequeued2); } finally { await queue.quit(); } @@ -131,6 +135,7 @@ describe("RunQueue.nackMessage", () => { async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, + logLevel: "debug", retryOptions: { ...testOptions.retryOptions, maxAttempts: 2, // Set lower for testing @@ -151,29 +156,25 @@ describe("RunQueue.nackMessage", () => { }); try { - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); - const dequeued = await queue.dequeueMessageFromMasterQueue( + await setTimeout(1000); + + const dequeued = await queue.dequeueMessageFromWorkerQueue( "test_12345", - envMasterQueue, - 10 + authenticatedEnvDev.id ); - expect(dequeued.length).toBe(1); + assertNonNullable(dequeued); await queue.nackMessage({ orgId: messageDev.orgId, messageId: messageDev.runId, }); - // Wait for any requeue delay - await setTimeout(300); - // Message should not be requeued as max attempts reached const envQueueLength = await queue.lengthOfEnvQueue(authenticatedEnvDev); expect(envQueueLength).toBe(1); @@ -181,13 +182,14 @@ describe("RunQueue.nackMessage", () => { const message = await queue.readMessage(messageDev.orgId, messageDev.runId); expect(message?.attempt).toBe(1); + await setTimeout(1000); + // Now we dequeue and nack again, and it should be moved to dead letter queue - const dequeued3 = await queue.dequeueMessageFromMasterQueue( + const dequeued3 = await queue.dequeueMessageFromWorkerQueue( "test_12345", - envMasterQueue, - 10 + authenticatedEnvDev.id ); - expect(dequeued3.length).toBe(1); + assertNonNullable(dequeued3); const envQueueLengthDequeue = await queue.lengthOfEnvQueue(authenticatedEnvDev); expect(envQueueLengthDequeue).toBe(0); @@ -236,22 +238,21 @@ 
describe("RunQueue.nackMessage", () => { }); try { - const envMasterQueue = `env:${authenticatedEnvDev.id}`; - // Enqueue message await queue.enqueueMessage({ env: authenticatedEnvDev, message: messageDev, - masterQueues: ["main", envMasterQueue], + workerQueue: authenticatedEnvDev.id, }); + await setTimeout(1000); + // Dequeue message - const dequeued = await queue.dequeueMessageFromMasterQueue( + const dequeued = await queue.dequeueMessageFromWorkerQueue( "test_12345", - envMasterQueue, - 10 + authenticatedEnvDev.id ); - expect(dequeued.length).toBe(1); + assertNonNullable(dequeued); // Set retryAt to 5 seconds in the future const retryAt = Date.now() + 5000; diff --git a/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts b/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts index a9c0386ca5..81cf549636 100644 --- a/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/reacquireConcurrency.test.ts @@ -1,4 +1,4 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; @@ -6,6 +6,7 @@ import { RunQueue } from "../index.js"; import { RunQueueFullKeyProducer } from "../keyProducer.js"; import { InputPayload } from "../types.js"; import { MessageNotFoundError } from "../errors.js"; +import { setTimeout } from "node:timers/promises"; const testOptions = { name: "rq", @@ -74,11 +75,16 @@ describe("RunQueue.reacquireConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - 
expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -136,11 +142,16 @@ describe("RunQueue.reacquireConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -195,11 +206,16 @@ describe("RunQueue.reacquireConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -254,12 +270,17 @@ describe("RunQueue.reacquireConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 1); - expect(messages.length).toBe(1); - expect(messages[0].message.runId).toBe(messageProd.runId); + await setTimeout(1000); + + const message = await 
queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); + expect(message.message.runId).toBe(messageProd.runId); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( @@ -275,7 +296,7 @@ describe("RunQueue.reacquireConcurrency", () => { runId: "r1235", queue: "task/my-task-2", }, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); //reacquire the concurrency diff --git a/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts b/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts index 63873a54b3..bb436b8cb4 100644 --- a/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts +++ b/internal-packages/run-engine/src/run-queue/tests/releaseConcurrency.test.ts @@ -1,10 +1,11 @@ -import { redisTest } from "@internal/testcontainers"; +import { assertNonNullable, redisTest } from "@internal/testcontainers"; import { trace } from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { FairQueueSelectionStrategy } from "../fairQueueSelectionStrategy.js"; import { RunQueue } from "../index.js"; import { RunQueueFullKeyProducer } from "../keyProducer.js"; import { InputPayload } from "../types.js"; +import { setTimeout } from "node:timers/promises"; const testOptions = { name: "rq", @@ -68,11 +69,16 @@ describe("RunQueue.releaseConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 10); - expect(messages.length).toBe(1); + await setTimeout(1000); + + const message = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, 
messageProd.queue)).toBe( @@ -99,6 +105,7 @@ describe("RunQueue.releaseConcurrency", () => { async ({ redisContainer }) => { const queue = new RunQueue({ ...testOptions, + masterQueueConsumersDisabled: true, queueSelectionStrategy: new FairQueueSelectionStrategy({ redis: { keyPrefix: "runqueue:test:", @@ -118,17 +125,25 @@ describe("RunQueue.releaseConcurrency", () => { await queue.enqueueMessage({ env: authenticatedEnvProd, message: messageProd, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, + skipDequeueProcessing: true, }); await queue.enqueueMessage({ env: authenticatedEnvProd, message: { ...messageProd, runId: "r1235" }, - masterQueues: "main", + workerQueue: authenticatedEnvProd.id, + skipDequeueProcessing: true, }); - const messages = await queue.dequeueMessageFromMasterQueue("test_12345", "main", 1); - expect(messages.length).toBe(1); + // Only process one message + await queue.processMasterQueueForEnvironment(authenticatedEnvProd.id, 1); + + const message = await queue.dequeueMessageFromWorkerQueue( + "test_12345", + authenticatedEnvProd.id + ); + assertNonNullable(message); //concurrencies expect(await queue.currentConcurrencyOfQueue(authenticatedEnvProd, messageProd.queue)).toBe( diff --git a/internal-packages/run-engine/src/run-queue/types.ts b/internal-packages/run-engine/src/run-queue/types.ts index 12627f375e..68431f4ede 100644 --- a/internal-packages/run-engine/src/run-queue/types.ts +++ b/internal-packages/run-engine/src/run-queue/types.ts @@ -16,10 +16,20 @@ export const InputPayload = z.object({ }); export type InputPayload = z.infer; -export const OutputPayload = InputPayload.extend({ +export const OutputPayloadV1 = InputPayload.extend({ version: z.literal("1"), masterQueues: z.string().array(), }); +export type OutputPayloadV1 = z.infer; + +export const OutputPayloadV2 = InputPayload.extend({ + version: z.literal("2"), + workerQueue: z.string(), +}); +export type OutputPayloadV2 = z.infer; + +export const OutputPayload = 
z.discriminatedUnion("version", [OutputPayloadV1, OutputPayloadV2]); + export type OutputPayload = z.infer; export type QueueDescriptor = { @@ -47,6 +57,13 @@ export interface RunQueueKeyProducer { ): string; queueKey(env: MinimalAuthenticatedEnvironment, queue: string, concurrencyKey?: string): string; + legacyMasterQueueKey(masterQueueName: string): string; + + masterQueueKeyForEnvironment(envId: string, shardCount: number): string; + masterQueueKeyForShard(shard: number): string; + masterQueueShardForEnvironment(envId: string, shardCount: number): number; + workerQueueKey(workerQueue: string): string; + envQueueKey(env: MinimalAuthenticatedEnvironment): string; envQueueKeyFromQueue(queue: string): string; queueConcurrencyLimitKey(env: MinimalAuthenticatedEnvironment, queue: string): string; diff --git a/internal-packages/run-engine/tsconfig.build.json b/internal-packages/run-engine/tsconfig.build.json index 619461da80..e5327e934a 100644 --- a/internal-packages/run-engine/tsconfig.build.json +++ b/internal-packages/run-engine/tsconfig.build.json @@ -1,10 +1,10 @@ { "include": ["src/**/*.ts"], - "exclude": ["src/**/*.test.ts"], + "exclude": ["src/**/*.test.ts", "src/engine/tests/utils/*.ts"], "compilerOptions": { "composite": true, - "target": "ES2019", - "lib": ["ES2019", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], "outDir": "dist", "module": "Node16", "moduleResolution": "Node16", diff --git a/internal-packages/run-engine/tsconfig.src.json b/internal-packages/run-engine/tsconfig.src.json index 6043e02ad2..2ddae2a0dd 100644 --- a/internal-packages/run-engine/tsconfig.src.json +++ b/internal-packages/run-engine/tsconfig.src.json @@ -1,10 +1,10 @@ { "include": ["src/**/*.ts"], - "exclude": ["node_modules", "src/**/*.test.ts"], + "exclude": ["node_modules", "src/**/*.test.ts", "src/engine/tests/utils/*.ts"], "compilerOptions": { "composite": true, - "target": "ES2019", - "lib": 
["ES2019", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], "module": "Node16", "moduleResolution": "Node16", "moduleDetection": "force", diff --git a/internal-packages/run-engine/tsconfig.test.json b/internal-packages/run-engine/tsconfig.test.json index b4f627aff1..d33ed305b4 100644 --- a/internal-packages/run-engine/tsconfig.test.json +++ b/internal-packages/run-engine/tsconfig.test.json @@ -1,10 +1,10 @@ { - "include": ["src/**/*.test.ts"], + "include": ["src/**/*.test.ts", "src/engine/tests/utils/*.ts"], "references": [{ "path": "./tsconfig.src.json" }], "compilerOptions": { "composite": true, - "target": "ES2019", - "lib": ["ES2019", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], + "target": "ES2020", + "lib": ["ES2020", "DOM", "DOM.Iterable", "DOM.AsyncIterable"], "module": "Node16", "moduleResolution": "Node16", "moduleDetection": "force", diff --git a/internal-packages/testcontainers/src/index.ts b/internal-packages/testcontainers/src/index.ts index 4816d3ba9d..cae66dcbd8 100644 --- a/internal-packages/testcontainers/src/index.ts +++ b/internal-packages/testcontainers/src/index.ts @@ -17,7 +17,7 @@ import { StartedClickHouseContainer } from "./clickhouse"; import { ClickHouseClient, createClient } from "@clickhouse/client"; export { assertNonNullable } from "./utils"; -export { StartedRedisContainer }; +export { logCleanup }; type NetworkContext = { network: StartedNetwork }; @@ -35,14 +35,21 @@ type ElectricContext = { electricOrigin: string; }; -type ContainerContext = NetworkContext & PostgresContext & RedisContext & ClickhouseContext; -type PostgresAndRedisContext = NetworkContext & PostgresContext & RedisContext; -type ContainerWithElectricAndRedisContext = ContainerContext & ElectricContext; -type ContainerWithElectricContext = NetworkContext & PostgresContext & ElectricContext; +export type ContainerContext = NetworkContext & PostgresContext & RedisContext & 
ClickhouseContext; +export type PostgresAndRedisContext = NetworkContext & PostgresContext & RedisContext; +export type ContainerWithElectricAndRedisContext = ContainerContext & ElectricContext; +export type ContainerWithElectricContext = NetworkContext & PostgresContext & ElectricContext; + +export type { + StartedNetwork, + StartedPostgreSqlContainer, + StartedRedisContainer, + StartedClickHouseContainer, +}; type Use = (value: T) => Promise; -const network = async ({ task }: TaskContext, use: Use) => { +export const network = async ({ task }: TaskContext, use: Use) => { const testName = task.name; logSetup("network: starting", { testName }); @@ -68,7 +75,7 @@ const network = async ({ task }: TaskContext, use: Use) => { } }; -const postgresContainer = async ( +export const postgresContainer = async ( { network, task }: { network: StartedNetwork } & TaskContext, use: Use ) => { @@ -81,7 +88,7 @@ const postgresContainer = async ( await useContainer("postgresContainer", { container, task, use: () => use(container) }); }; -const prisma = async ( +export const prisma = async ( { postgresContainer, task }: { postgresContainer: StartedPostgreSqlContainer } & TaskContext, use: Use ) => { @@ -106,7 +113,7 @@ const prisma = async ( export const postgresTest = test.extend({ network, postgresContainer, prisma }); -const redisContainer = async ( +export const redisContainer = async ( { network, task }: { network: StartedNetwork } & TaskContext, use: Use ) => { @@ -122,7 +129,7 @@ const redisContainer = async ( await useContainer("redisContainer", { container, task, use: () => use(container) }); }; -const redisOptions = async ( +export const redisOptions = async ( { redisContainer }: { redisContainer: StartedRedisContainer }, use: Use ) => { @@ -152,6 +159,8 @@ const redisOptions = async ( showFriendlyErrorStack: true, }; + console.log("Redis options", options); + await use(options); }; diff --git a/internal-packages/tracing/src/index.ts 
b/internal-packages/tracing/src/index.ts index 04d11f159a..8d01bdaed5 100644 --- a/internal-packages/tracing/src/index.ts +++ b/internal-packages/tracing/src/index.ts @@ -1,18 +1,51 @@ -import { type Span, type SpanOptions, SpanStatusCode, type Tracer } from "@opentelemetry/api"; +import { + Meter, + type Span, + type SpanOptions, + SpanStatusCode, + type Tracer, +} from "@opentelemetry/api"; import { type Logger, SeverityNumber } from "@opentelemetry/api-logs"; import { flattenAttributes } from "@trigger.dev/core/v3/utils/flattenAttributes"; export * from "@opentelemetry/semantic-conventions"; -export type { Tracer, Attributes } from "@opentelemetry/api"; +export type { + Tracer, + Attributes, + Meter, + Counter, + UpDownCounter, + ObservableGauge, + ObservableCounter, + ObservableUpDownCounter, + MetricOptions, + Gauge, + Histogram, + ObservableResult, +} from "@opentelemetry/api"; -import { trace, context, propagation, SpanKind } from "@opentelemetry/api"; -export { trace, context, propagation, type Span, SpanKind, type SpanOptions, SpanStatusCode }; +import { trace, context, propagation, SpanKind, metrics, ValueType } from "@opentelemetry/api"; +export { + trace, + context, + propagation, + type Span, + SpanKind, + type SpanOptions, + SpanStatusCode, + metrics, + ValueType, +}; export function getTracer(name: string): Tracer { return trace.getTracer(name); } +export function getMeter(name: string): Meter { + return metrics.getMeter(name); +} + export async function startSpan( tracer: Tracer | undefined, name: string, diff --git a/packages/core/src/logger.ts b/packages/core/src/logger.ts index d46ee06012..e63987405b 100644 --- a/packages/core/src/logger.ts +++ b/packages/core/src/logger.ts @@ -12,9 +12,9 @@ import { env } from "node:process"; import { Buffer } from "node:buffer"; import { trace, context } from "@opentelemetry/api"; -export type LogLevel = "log" | "error" | "warn" | "info" | "debug"; +export type LogLevel = "log" | "error" | "warn" | "info" | 
"debug" | "verbose"; -const logLevels: Array = ["log", "error", "warn", "info", "debug"]; +const logLevels: Array = ["log", "error", "warn", "info", "debug", "verbose"]; export class Logger { #name: string; @@ -87,6 +87,12 @@ export class Logger { this.#structuredLog(console.debug, message, "debug", ...args); } + verbose(message: string, ...args: Array | undefined>) { + if (this.#level < 5) return; + + this.#structuredLog(console.log, message, "verbose", ...args); + } + #structuredLog( loggerFunction: (message: string, ...args: any[]) => void, message: string, diff --git a/packages/core/src/v3/serverOnly/index.ts b/packages/core/src/v3/serverOnly/index.ts index 05e7b08a24..6111d04de9 100644 --- a/packages/core/src/v3/serverOnly/index.ts +++ b/packages/core/src/v3/serverOnly/index.ts @@ -4,3 +4,4 @@ export * from "./httpServer.js"; export * from "./singleton.js"; export * from "./shutdownManager.js"; export * from "./k8s.js"; +export * from "./jumpHash.js"; diff --git a/packages/core/src/v3/serverOnly/jumpHash.ts b/packages/core/src/v3/serverOnly/jumpHash.ts new file mode 100644 index 0000000000..ac13deb5f7 --- /dev/null +++ b/packages/core/src/v3/serverOnly/jumpHash.ts @@ -0,0 +1,49 @@ +import { createHash } from "node:crypto"; + +/** + * Returns the binary hash of a binary value. Hash function and output length are configurable. + */ +function hash(data: Uint8Array, outputLength = 32, type = "sha256"): Uint8Array { + const hasher = createHash(type); + hasher.update(data); + let hash = new Uint8Array(hasher.digest()); + hash = hash.subarray(0, outputLength); + return hash; +} + +export function jumpHash(key: string, buckets: number): number { + return jumpConsistentHash(hash(Buffer.from(key)), buckets); +} + +/** + * Based on Node.js's Buffer implementation: https://github.com/nodejs/node/blob/ed8fc7e11d688cbcdf33d0d149830064758bdcd2/lib/internal/buffer.js#L98 + * @param bytes + */ +function bytesToBigInt(bytes: Uint8Array): bigint { + const hi = bytes[0]! 
* 2 ** 24 + bytes[1]! * 2 ** 16 + bytes[2]! * 2 ** 8 + bytes[3]!; + const lo = bytes[4]! * 2 ** 24 + bytes[5]! * 2 ** 16 + bytes[6]! * 2 ** 8 + bytes[7]!; + + // tslint:disable-next-line:no-bitwise + return (BigInt(hi) << 32n) + BigInt(lo); +} + +/** + * @param {Uint8Array} key 8 bytes (represents uint64 number) + * @param {number} numBuckets Up to 32-bit number + * + * @return {number} Bucket from `[0, numBuckets)` range + */ +function jumpConsistentHash(key: Uint8Array, numBuckets: number): number { + let keyBigInt = bytesToBigInt(key); + let b = -1n; + let j = 0n; + while (j < numBuckets) { + b = j; + // We fit the number after multiplication within 64-bit range, just like in C++ implementation from paper + keyBigInt = ((keyBigInt * 2862933555777941757n) % 2n ** 64n) + 1n; + // Here we need to divide numbers as double (like in C++ implementation from paper), hence converting back to numbers for that + // tslint:disable-next-line:no-bitwise + j = BigInt(Math.floor(((Number(b) + 1) * Number(1n << 31n)) / Number((keyBigInt >> 33n) + 1n))); + } + return Number(b); +} diff --git a/packages/core/test/jumpHash.test.ts b/packages/core/test/jumpHash.test.ts new file mode 100644 index 0000000000..8f9a3d97ed --- /dev/null +++ b/packages/core/test/jumpHash.test.ts @@ -0,0 +1,81 @@ +import { jumpHash } from "../src/v3/serverOnly/index.js"; + +describe("jumpHash", () => { + it("should hash a string to a number", () => { + expect(jumpHash("test", 10)).toBe(5); + }); + + it("should hash different strings to numbers in range", () => { + for (const key of ["a", "b", "c", "test", "trigger", "dev", "123", "!@#"]) { + for (const buckets of [1, 2, 5, 10, 100, 1000]) { + const result = jumpHash(key, buckets); + expect(result).toBeGreaterThanOrEqual(0); + expect(result).toBeLessThan(buckets); + } + } + }); + + it("should return 0 for any key if buckets is 1", () => { + expect(jumpHash("anything", 1)).toBe(0); + expect(jumpHash("", 1)).toBe(0); + }); + + it("should handle empty string 
key", () => { + expect(jumpHash("", 10)).toBeGreaterThanOrEqual(0); + expect(jumpHash("", 10)).toBeLessThan(10); + }); + + it("should distribute keys evenly across buckets", () => { + const buckets = 10; + const numKeys = 10000; + const counts = Array(buckets).fill(0); + for (let i = 0; i < numKeys; i++) { + const key = `key_${i}`; + const bucket = jumpHash(key, buckets); + counts[bucket]++; + } + const avg = numKeys / buckets; + // No bucket should have less than half or more than double the average + for (const count of counts) { + expect(count).toBeGreaterThanOrEqual(avg * 0.5); + expect(count).toBeLessThanOrEqual(avg * 2); + } + }); + + it("should have minimal movement when increasing buckets by 1", () => { + const numKeys = 1000; + const buckets = 50; + let moved = 0; + for (let i = 0; i < numKeys; i++) { + const key = `key_${i}`; + const bucket1 = jumpHash(key, buckets); + const bucket2 = jumpHash(key, buckets + 1); + if (bucket1 !== bucket2) moved++; + } + // For jump consistent hash, about 1/(buckets+1) of keys should move + const expectedMoved = numKeys / (buckets + 1); + expect(moved).toBeGreaterThanOrEqual(expectedMoved * 0.5); + expect(moved).toBeLessThanOrEqual(expectedMoved * 2); + }); + + it("should be deterministic for the same key and bucket count", () => { + for (let i = 0; i < 100; i++) { + const key = `key_${i}`; + const buckets = 20; + const result1 = jumpHash(key, buckets); + const result2 = jumpHash(key, buckets); + expect(result1).toBe(result2); + } + }); + + it("should always return a value in [0, buckets-1]", () => { + for (let i = 0; i < 100; i++) { + const key = `key_${i}`; + for (let buckets = 1; buckets < 50; buckets++) { + const result = jumpHash(key, buckets); + expect(result).toBeGreaterThanOrEqual(0); + expect(result).toBeLessThan(buckets); + } + } + }); +}); diff --git a/packages/redis-worker/package.json b/packages/redis-worker/package.json index 779708d5c6..c74addcfb0 100644 --- a/packages/redis-worker/package.json +++ 
b/packages/redis-worker/package.json @@ -27,7 +27,6 @@ "lodash.omit": "^4.5.0", "nanoid": "^5.0.7", "p-limit": "^6.2.0", - "prom-client": "^15.1.0", "zod": "3.23.8" }, "devDependencies": { @@ -52,4 +51,4 @@ "require": "./dist/index.cjs" } } -} +} \ No newline at end of file diff --git a/packages/redis-worker/src/queue.test.ts b/packages/redis-worker/src/queue.test.ts index 032a100852..ffea46b6b3 100644 --- a/packages/redis-worker/src/queue.test.ts +++ b/packages/redis-worker/src/queue.test.ts @@ -423,4 +423,64 @@ describe("SimpleQueue", () => { await queue.close(); } }); + + redisTest( + "enqueueOnce only enqueues the first message with a given ID", + { timeout: 20_000 }, + async ({ redisContainer }) => { + const queue = new SimpleQueue({ + name: "test-once", + schema: { + test: z.object({ + value: z.number(), + }), + }, + redisOptions: { + host: redisContainer.getHost(), + port: redisContainer.getPort(), + password: redisContainer.getPassword(), + }, + logger: new Logger("test", "log"), + }); + + try { + const now = Date.now(); + const availableAt1 = new Date(now + 1000); + const availableAt2 = new Date(now + 5000); + + // First enqueueOnce should succeed + const first = await queue.enqueueOnce({ + id: "unique-id", + job: "test", + item: { value: 1 }, + visibilityTimeoutMs: 2000, + availableAt: availableAt1, + }); + expect(first).toBe(true); + expect(await queue.size({ includeFuture: true })).toBe(1); + + // Second enqueueOnce with same ID but different value and availableAt should do nothing + const second = await queue.enqueueOnce({ + id: "unique-id", + job: "test", + item: { value: 999 }, + visibilityTimeoutMs: 2000, + availableAt: availableAt2, + }); + expect(second).toBe(false); + expect(await queue.size({ includeFuture: true })).toBe(1); + + // Dequeue after 1s should get the original item, not the second + await new Promise((resolve) => setTimeout(resolve, 1100)); + const [item] = await queue.dequeue(1); + expect(item).toBeDefined(); + 
expect(item?.id).toBe("unique-id"); + expect(item?.item).toEqual({ value: 1 }); + // Should not be the second value + expect(item?.item).not.toEqual({ value: 999 }); + } finally { + await queue.close(); + } + } + ); }); diff --git a/packages/redis-worker/src/queue.ts b/packages/redis-worker/src/queue.ts index 4b73171acc..c07257fdd6 100644 --- a/packages/redis-worker/src/queue.ts +++ b/packages/redis-worker/src/queue.ts @@ -131,6 +131,48 @@ export class SimpleQueue { } } + async enqueueOnce({ + id, + job, + item, + attempt, + availableAt, + visibilityTimeoutMs, + }: { + id: string; + job: MessageCatalogKey; + item: MessageCatalogValue>; + attempt?: number; + availableAt?: Date; + visibilityTimeoutMs: number; + }): Promise { + if (!id) { + throw new Error("enqueueOnce requires an id"); + } + try { + const score = availableAt ? availableAt.getTime() : Date.now(); + const deduplicationKey = nanoid(); + const serializedItem = JSON.stringify({ + job, + item, + visibilityTimeoutMs, + attempt, + deduplicationKey, + }); + const result = await this.redis.enqueueItemOnce(`queue`, `items`, id, score, serializedItem); + // 1 if inserted, 0 if already exists + return result === 1; + } catch (e) { + this.logger.error(`SimpleQueue ${this.name}.enqueueOnce(): error enqueuing`, { + queue: this.name, + error: e, + id, + item, + }); + throw e; + } + } + async dequeue(count: number = 1): Promise>> { const now = Date.now(); @@ -473,6 +515,26 @@ export class SimpleQueue { return 1 `, }); + + this.redis.defineCommand("enqueueItemOnce", { + numberOfKeys: 2, + lua: ` + local queue = KEYS[1] + local items = KEYS[2] + local id = ARGV[1] + local score = ARGV[2] + local serializedItem = ARGV[3] + + -- Only add if not exists + local added = redis.call('HSETNX', items, id, serializedItem) + if added == 1 then + redis.call('ZADD', queue, 'NX', score, id) + return 1 + else + return 0 + end + `, + }); } } @@ -525,5 +587,14 @@ declare module "@internal/redis" { errorMessage: string, callback?: 
Callback ): Result; + + enqueueItemOnce( + queue: string, + items: string, + id: string, + score: number, + serializedItem: string, + callback?: Callback + ): Result; } } diff --git a/packages/redis-worker/src/worker.ts b/packages/redis-worker/src/worker.ts index 13e4fd85f1..f981698bfa 100644 --- a/packages/redis-worker/src/worker.ts +++ b/packages/redis-worker/src/worker.ts @@ -1,4 +1,16 @@ -import { SpanKind, startSpan, trace, Tracer } from "@internal/tracing"; +import { + Attributes, + Histogram, + Meter, + metrics, + ObservableResult, + SemanticAttributes, + SpanKind, + startSpan, + trace, + Tracer, + ValueType, +} from "@internal/tracing"; import { Logger } from "@trigger.dev/core/logger"; import { calculateNextRetryDelay } from "@trigger.dev/core/v3"; import { type RetryOptions } from "@trigger.dev/core/v3/schemas"; @@ -9,7 +21,6 @@ import { nanoid } from "nanoid"; import pLimit from "p-limit"; import { createRedisClient } from "@internal/redis"; import { shutdownManager } from "@trigger.dev/core/v3/serverOnly"; -import { Registry, Histogram } from "prom-client"; export type WorkerCatalog = { [key: string]: { @@ -50,9 +61,7 @@ type WorkerOptions = { shutdownTimeoutMs?: number; logger?: Logger; tracer?: Tracer; - metrics?: { - register: Registry; - }; + meter?: Meter; }; // This results in attempt 12 being a delay of 1 hour @@ -69,9 +78,9 @@ const defaultRetrySettings = { class Worker { private subscriber: Redis | undefined; private tracer: Tracer; + private meter: Meter; private metrics: { - register?: Registry; enqueueDuration?: Histogram; dequeueDuration?: Histogram; jobDuration?: Histogram; @@ -94,6 +103,7 @@ class Worker { constructor(private options: WorkerOptions) { this.logger = options.logger ?? new Logger("Worker", "debug"); this.tracer = options.tracer ?? trace.getTracer(options.name); + this.meter = options.meter ?? metrics.getMeter(options.name); this.shutdownTimeoutMs = options.shutdownTimeoutMs ?? 
60_000; @@ -116,59 +126,76 @@ class Worker { // Create a p-limit instance using this limit. this.limiter = pLimit(this.concurrency.limit); - this.metrics.register = options.metrics?.register; + const masterQueueObservableGauge = this.meter.createObservableGauge("redis_worker.queue.size", { + description: "The number of items in the queue", + unit: "items", + valueType: ValueType.INT, + }); - if (!this.metrics.register) { - return; - } + masterQueueObservableGauge.addCallback(this.#updateQueueSizeMetric.bind(this)); - this.metrics.enqueueDuration = new Histogram({ - name: "redis_worker_enqueue_duration_seconds", - help: "The duration of enqueue operations", - labelNames: ["worker_name", "job_type", "has_available_at"], - buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1], - registers: [this.metrics.register], - }); + const deadLetterQueueObservableGauge = this.meter.createObservableGauge( + "redis_worker.queue.dead_letter_size", + { + description: "The number of items in the dead letter queue", + unit: "items", + valueType: ValueType.INT, + } + ); - this.metrics.dequeueDuration = new Histogram({ - name: "redis_worker_dequeue_duration_seconds", - help: "The duration of dequeue operations", - labelNames: ["worker_name", "worker_id", "task_count"], - buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1], - registers: [this.metrics.register], - }); + deadLetterQueueObservableGauge.addCallback(this.#updateDeadLetterQueueSizeMetric.bind(this)); + + const concurrencyLimitActiveObservableGauge = this.meter.createObservableGauge( + "redis_worker.concurrency.active", + { + description: "The number of active workers", + unit: "workers", + valueType: ValueType.INT, + } + ); + + concurrencyLimitActiveObservableGauge.addCallback( + this.#updateConcurrencyLimitActiveMetric.bind(this) + ); + + const concurrencyLimitPendingObservableGauge = this.meter.createObservableGauge( + "redis_worker.concurrency.pending", + { + description: "The number of pending workers", + 
unit: "workers", + valueType: ValueType.INT, + } + ); + + concurrencyLimitPendingObservableGauge.addCallback( + this.#updateConcurrencyLimitPendingMetric.bind(this) + ); + } + + async #updateQueueSizeMetric(observableResult: ObservableResult) { + const queueSize = await this.queue.size(); - this.metrics.jobDuration = new Histogram({ - name: "redis_worker_job_duration_seconds", - help: "The duration of job operations", - labelNames: ["worker_name", "worker_id", "batch_size", "job_type", "attempt"], - // use different buckets here as jobs can take a while to run - buckets: [0.1, 0.25, 0.5, 1, 2.5, 5, 10, 20, 30, 45, 60], - registers: [this.metrics.register], + observableResult.observe(queueSize, { + worker_name: this.options.name, }); + } - this.metrics.ackDuration = new Histogram({ - name: "redis_worker_ack_duration_seconds", - help: "The duration of ack operations", - labelNames: ["worker_name"], - buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1], - registers: [this.metrics.register], + async #updateDeadLetterQueueSizeMetric(observableResult: ObservableResult) { + const deadLetterQueueSize = await this.queue.sizeOfDeadLetterQueue(); + observableResult.observe(deadLetterQueueSize, { + worker_name: this.options.name, }); + } - this.metrics.redriveDuration = new Histogram({ - name: "redis_worker_redrive_duration_seconds", - help: "The duration of redrive operations", - labelNames: ["worker_name"], - buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1], - registers: [this.metrics.register], + async #updateConcurrencyLimitActiveMetric(observableResult: ObservableResult) { + observableResult.observe(this.limiter.activeCount, { + worker_name: this.options.name, }); + } - this.metrics.rescheduleDuration = new Histogram({ - name: "redis_worker_reschedule_duration_seconds", - help: "The duration of reschedule operations", - labelNames: ["worker_name"], - buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1], - registers: [this.metrics.register], 
+ async #updateConcurrencyLimitPendingMetric(observableResult: ObservableResult) { + observableResult.observe(this.limiter.pendingCount, { + worker_name: this.options.name, }); } @@ -255,6 +282,66 @@ class Worker { ); } + /** + * Enqueues a job for processing once. If the job is already in the queue, it will be ignored. + * @param options - The enqueue options. + * @param options.id - Required unique identifier for the job. + * @param options.job - The job type from the worker catalog. + * @param options.payload - The job payload that matches the schema defined in the catalog. + * @param options.visibilityTimeoutMs - Optional visibility timeout in milliseconds. Defaults to value from catalog. + * @param options.availableAt - Optional date when the job should become available for processing. Defaults to now. + * @returns A promise that resolves when the job is enqueued. + */ + enqueueOnce({ + id, + job, + payload, + visibilityTimeoutMs, + availableAt, + }: { + id: string; + job: K; + payload: z.infer; + visibilityTimeoutMs?: number; + availableAt?: Date; + }) { + return startSpan( + this.tracer, + "enqueueOnce", + async (span) => { + const timeout = visibilityTimeoutMs ?? this.options.catalog[job]?.visibilityTimeoutMs; + + if (!timeout) { + throw new Error(`No visibility timeout found for job ${String(job)} with id ${id}`); + } + + span.setAttribute("job_visibility_timeout_ms", timeout); + + return this.withHistogram( + this.metrics.enqueueDuration, + this.queue.enqueueOnce({ + id, + job, + item: payload, + visibilityTimeoutMs: timeout, + availableAt, + }), + { + job_type: String(job), + has_available_at: availableAt ? "true" : "false", + } + ); + }, + { + kind: SpanKind.PRODUCER, + attributes: { + job_type: String(job), + job_id: id, + }, + } + ); + } + /** * Reschedules an existing job to a new available date. * If the job isn't in the queue, it will be ignored. 
@@ -461,19 +548,20 @@ class Worker { } private async withHistogram( - histogram: Histogram | undefined, + histogram: Histogram | undefined, promise: Promise, labels?: Record ): Promise { - if (!histogram || !this.metrics.register) { + if (!histogram) { return promise; } - const end = histogram.startTimer({ worker_name: this.options.name, ...labels }); + const start = Date.now(); try { return await promise; } finally { - end(); + const duration = (Date.now() - start) / 1000; // Convert to seconds + histogram.record(duration, { worker_name: this.options.name, ...labels }); } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 6f469531ac..aa702ff8af 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -275,6 +275,9 @@ importers: '@opentelemetry/exporter-logs-otlp-http': specifier: 0.52.1 version: 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/exporter-metrics-otlp-proto': + specifier: 0.52.1 + version: 0.52.1(@opentelemetry/api@1.9.0) '@opentelemetry/exporter-trace-otlp-http': specifier: 0.52.1 version: 0.52.1(@opentelemetry/api@1.9.0) @@ -293,6 +296,9 @@ importers: '@opentelemetry/sdk-logs': specifier: 0.52.1 version: 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-metrics': + specifier: 1.25.1 + version: 1.25.1(@opentelemetry/api@1.9.0) '@opentelemetry/sdk-node': specifier: 0.52.1 version: 0.52.1(@opentelemetry/api@1.9.0) @@ -1614,9 +1620,6 @@ importers: p-limit: specifier: ^6.2.0 version: 6.2.0 - prom-client: - specifier: ^15.1.0 - version: 15.1.0 zod: specifier: 3.23.8 version: 3.23.8 @@ -8789,6 +8792,35 @@ packages: '@opentelemetry/sdk-logs': 0.52.1(@opentelemetry/api@1.9.0) dev: false + /@opentelemetry/exporter-metrics-otlp-http@0.52.1(@opentelemetry/api@1.9.0): + resolution: {integrity: sha512-oAHPOy1sZi58bwqXaucd19F/v7+qE2EuVslQOEeLQT94CDuZJJ4tbWzx8DpYBTrOSzKqqrMtx9+PMxkrcbxOyQ==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 
1.25.1(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-exporter-base': 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-metrics': 1.25.1(@opentelemetry/api@1.9.0) + dev: false + + /@opentelemetry/exporter-metrics-otlp-proto@0.52.1(@opentelemetry/api@1.9.0): + resolution: {integrity: sha512-m9aEOzKkjznNxm+0NbyEV834Wza9asRaFA4VyWY3b1XltqbdStRmOYSZHq0VzcecOe24uD41zFqHweL2fA3y6g==} + engines: {node: '>=14'} + peerDependencies: + '@opentelemetry/api': ^1.3.0 + dependencies: + '@opentelemetry/api': 1.9.0 + '@opentelemetry/core': 1.25.1(@opentelemetry/api@1.9.0) + '@opentelemetry/exporter-metrics-otlp-http': 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-exporter-base': 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/otlp-transformer': 0.52.1(@opentelemetry/api@1.9.0) + '@opentelemetry/resources': 1.25.1(@opentelemetry/api@1.9.0) + '@opentelemetry/sdk-metrics': 1.25.1(@opentelemetry/api@1.9.0) + dev: false + /@opentelemetry/exporter-trace-otlp-grpc@0.49.1(@opentelemetry/api@1.4.1): resolution: {integrity: sha512-Zbd7f3zF7fI2587MVhBizaW21cO/SordyrZGtMtvhoxU6n4Qb02Gx71X4+PzXH620e0+JX+Pcr9bYb1HTeVyJA==} engines: {node: '>=14'}