Skip to content

Commit 8b40571

Browse files
authored
feat(supervisor): workload create duration histogram with backend and outcome labels (#3928)
Adds a `workload_create_duration_seconds` Prometheus histogram to the supervisor, observed around the workload manager `create()` call:

- `backend` label: `kubernetes` | `compute` | `docker` — set once from the configured workload manager
- `outcome` label: `success` | `error` — the per-outcome counts double as a create error rate

Registered on the supervisor's existing metrics registry, so it's exposed on the existing `/metrics` endpoint with no config changes.

Notes:

- Covers cold creates only; warm starts and restores return before reaching `create()`.
- A create may include backend-internal retries, so one observation can span multiple attempts.
- Fixed low cardinality: 2 active label sets per deployment × 10 buckets.
1 parent 5232067 commit 8b40571

2 files changed

Lines changed: 30 additions & 4 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
area: supervisor
3+
type: improvement
4+
---
5+
6+
Add a `workload_create_duration_seconds` Prometheus histogram recording the duration and outcome (success/error) of workload manager create calls, labeled by backend (kubernetes/compute/docker). Previously, failed creates were only visible as error logs.

apps/supervisor/src/index.ts

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import {
2121
isKubernetesEnvironment,
2222
} from "@trigger.dev/core/v3/serverOnly";
2323
import { createK8sApi } from "./clients/kubernetes.js";
24-
import { collectDefaultMetrics } from "prom-client";
24+
import { collectDefaultMetrics, Histogram } from "prom-client";
2525
import { register } from "./metrics.js";
2626
import { PodCleaner } from "./services/podCleaner.js";
2727
import { FailedPodHandler } from "./services/failedPodHandler.js";
@@ -45,11 +45,20 @@ if (env.METRICS_COLLECT_DEFAULTS) {
4545
collectDefaultMetrics({ register });
4646
}
4747

48+
const workloadCreateDuration = new Histogram({
49+
name: "workload_create_duration_seconds",
50+
help: "Duration of workload manager create calls. A create may include backend-internal retries, so one observation can span multiple attempts.",
51+
labelNames: ["backend", "outcome"],
52+
buckets: [0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60],
53+
registers: [register],
54+
});
55+
4856
class ManagedSupervisor {
4957
private readonly workerSession: SupervisorSession;
5058
private readonly metricsServer?: HttpServer;
5159
private readonly workloadServer: WorkloadServer;
5260
private readonly workloadManager: WorkloadManager;
61+
private readonly workloadManagerBackend: "compute" | "kubernetes" | "docker";
5362
private readonly computeManager?: ComputeWorkloadManager;
5463
private readonly logger = new SimpleStructuredLogger("managed-supervisor");
5564
private readonly resourceMonitor: ResourceMonitor;
@@ -151,10 +160,13 @@ class ManagedSupervisor {
151160
});
152161
this.computeManager = computeManager;
153162
this.workloadManager = computeManager;
163+
this.workloadManagerBackend = "compute";
164+
} else if (this.isKubernetes) {
165+
this.workloadManager = new KubernetesWorkloadManager(workloadManagerOptions);
166+
this.workloadManagerBackend = "kubernetes";
154167
} else {
155-
this.workloadManager = this.isKubernetes
156-
? new KubernetesWorkloadManager(workloadManagerOptions)
157-
: new DockerWorkloadManager(workloadManagerOptions);
168+
this.workloadManager = new DockerWorkloadManager(workloadManagerOptions);
169+
this.workloadManagerBackend = "docker";
158170
}
159171

160172
if (this.isKubernetes) {
@@ -493,6 +505,10 @@ class ManagedSupervisor {
493505
hasPrivateLink: message.organization.hasPrivateLink,
494506
});
495507
recordPhaseSince("workload_create", createStart, undefined);
508+
workloadCreateDuration.observe(
509+
{ backend: this.workloadManagerBackend, outcome: "success" },
510+
(performance.now() - createStart) / 1000
511+
);
496512

497513
// Disabled for now
498514
// this.resourceMonitor.blockResources({
@@ -505,6 +521,10 @@ class ManagedSupervisor {
505521
createStart,
506522
error instanceof Error ? error : new Error(String(error))
507523
);
524+
workloadCreateDuration.observe(
525+
{ backend: this.workloadManagerBackend, outcome: "error" },
526+
(performance.now() - createStart) / 1000
527+
);
508528
this.logger.error("Failed to create workload", { error });
509529
}
510530
}

0 commit comments

Comments
 (0)