Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .server-changes/task-meta-cache-resolution-metrics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
area: webapp
type: improvement
---

Emit metrics for task metadata cache resolution on the trigger path, surfacing the cache hit rate and how often a read replica returned empty for a row the primary had.
35 changes: 30 additions & 5 deletions apps/webapp/app/runEngine/concerns/queues.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ import { createCache, createLRUMemoryStore, DefaultStatefulContext, Namespace }
import { singleton } from "~/utils/singleton";
import type { TaskMetadataCache, TaskMetadataEntry } from "~/services/taskMetadataCache.server";
import { taskMetadataCacheInstance } from "~/services/taskMetadataCacheInstance.server";
import {
recordTaskMetaResolve,
type TaskMetaResolveSource,
} from "~/services/taskMetadataCacheTelemetry.server";

// LRU cache for environment queue sizes to reduce Redis calls
const queueSizeCache = singleton("queueSizeCache", () => {
Expand Down Expand Up @@ -266,7 +270,10 @@ export class DefaultQueueManager implements QueueManager {
slug: string
): Promise<TaskMetadataEntry | null> {
const cached = await this.taskMetaCache.getByWorker(workerId, slug);
if (cached) return cached;
if (cached) {
recordTaskMetaResolve("locked", "cache");
return cached;
}

// Cache miss. Read the row from the replica first; if the replica comes
// back empty, re-check the writer before concluding the task is missing.
Expand All @@ -277,11 +284,13 @@ export class DefaultQueueManager implements QueueManager {
// registered. The writer read only runs on this rare miss-then-empty path,
// never on the hot path.
let row = await this.findLockedTaskRow(this.replicaPrisma, workerId, environmentId, slug);
let source: TaskMetaResolveSource = "replica";

if (!row && this.replicaPrisma !== this.prisma) {
row = await this.findLockedTaskRow(this.prisma, workerId, environmentId, slug);

if (row) {
source = "writer";
logger.warn("Locked task metadata missing on replica but found on writer", {
workerId,
environmentId,
Expand All @@ -290,7 +299,12 @@ export class DefaultQueueManager implements QueueManager {
}
}

if (!row) return null;
if (!row) {
recordTaskMetaResolve("locked", "miss");
return null;
}

recordTaskMetaResolve("locked", source);

const entry: TaskMetadataEntry = {
slug,
Expand Down Expand Up @@ -336,14 +350,20 @@ export class DefaultQueueManager implements QueueManager {
slug: string
): Promise<TaskMetadataEntry | null> {
const cached = await this.taskMetaCache.getCurrent(environment.id, slug);
if (cached) return cached;
if (cached) {
recordTaskMetaResolve("current", "cache");
return cached;
}

// Cold cache: discover the current worker for the env. Replica is fine —
// the adjacent BackgroundWorkerTask lookup below uses `replicaPrisma` too
// (replica lag for "just deployed" is bounded the same way for both
// queries; reading from the writer here would only widen the window).
const worker = await findCurrentWorkerFromEnvironment(environment, this.replicaPrisma);
if (!worker) return null;
if (!worker) {
recordTaskMetaResolve("current", "miss");
return null;
}

const row = await this.replicaPrisma.backgroundWorkerTask.findFirst({
where: { workerId: worker.id, runtimeEnvironmentId: environment.id, slug },
Expand All @@ -354,7 +374,12 @@ export class DefaultQueueManager implements QueueManager {
},
});

if (!row) return null;
if (!row) {
recordTaskMetaResolve("current", "miss");
return null;
}

recordTaskMetaResolve("current", "replica");

const entry: TaskMetadataEntry = {
slug,
Expand Down
38 changes: 38 additions & 0 deletions apps/webapp/app/services/taskMetadataCacheTelemetry.server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import { getMeter } from "@internal/tracing";

const meter = getMeter("task-meta-cache");

/**
* One counter for every task-metadata resolution on the trigger path, with two
* bounded labels:
*
* path: "locked" - lockToVersion / triggerAndWait (reads the by-worker hash)
* "current" - default trigger (reads the env hash)
* source: where the metadata was resolved from:
* "cache" - Redis hit (warm)
* "replica" - cache miss, the read replica had the row
* "writer" - cache miss + replica empty, the primary had the row
* (i.e. the replica was stale for an existing row)
* "miss" - not found anywhere (genuinely not registered)
*
* Derived signals:
* cache / total -> cache hit rate (the inverse is coldness)
* writer / total -> how often the replica returned empty for
* a row the primary had
*
* No env / worker / slug labels: those are unbounded in production.
*/
const resolveCounter = meter.createCounter("task_meta_cache.resolve", {
description:
"Task metadata resolutions on the trigger path, by lookup path and the source that satisfied them",
});

export type TaskMetaResolvePath = "locked" | "current";
export type TaskMetaResolveSource = "cache" | "replica" | "writer" | "miss";

export function recordTaskMetaResolve(
path: TaskMetaResolvePath,
source: TaskMetaResolveSource
): void {
resolveCounter.add(1, { path, source });
}