Skip to content

Commit 7c2e78c

Browse files
authored
fix(batch): more high cardinality metric attribute fixes (#2846)
1 parent 6dfbe1d commit 7c2e78c

File tree

2 files changed

+64
-11
lines changed

2 files changed

+64
-11
lines changed

.cursor/rules/otel-metrics.mdc

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
---
2+
description: Guidelines for creating OpenTelemetry metrics to avoid cardinality issues
3+
globs:
4+
- "**/*.ts"
5+
---
6+
7+
# OpenTelemetry Metrics Guidelines
8+
9+
When creating or editing OTEL metrics (counters, histograms, gauges), always ensure metric attributes have **low cardinality**.
10+
11+
## What is Cardinality?
12+
13+
Cardinality refers to the number of unique values an attribute can have. Each unique combination of attribute values creates a new time series, which consumes memory and storage in your metrics backend.
14+
15+
## Rules
16+
17+
### DO use low-cardinality attributes:
18+
- **Enums**: `environment_type` (PRODUCTION, STAGING, DEVELOPMENT, PREVIEW)
19+
- **Booleans**: `hasFailures`, `streaming`, `success`
20+
- **Bounded error codes**: A finite, controlled set of error types
21+
- **Shard IDs**: When sharding is bounded (e.g., 0-15)
22+
23+
### DO NOT use high-cardinality attributes:
24+
- **UUIDs/IDs**: `envId`, `userId`, `runId`, `projectId`, `organizationId`
25+
- **Unbounded integers**: `itemCount`, `batchSize`, `retryCount`
26+
- **Timestamps**: `createdAt`, `startTime`
27+
- **Free-form strings**: `errorMessage`, `taskName`, `queueName`
28+
29+
## Example
30+
31+
```typescript
32+
// BAD - High cardinality
33+
this.counter.add(1, {
34+
envId: options.environmentId, // UUID - unbounded
35+
itemCount: options.runCount, // Integer - unbounded
36+
});
37+
38+
// GOOD - Low cardinality
39+
this.counter.add(1, {
40+
environment_type: options.environmentType, // Enum - 4 values
41+
streaming: true, // Boolean - 2 values
42+
});
43+
```
44+
45+
## Reference
46+
47+
See the schedule engine (`internal-packages/schedule-engine/src/engine/index.ts`) for a good example of low-cardinality metric attributes.
48+
49+
High cardinality metrics can cause:
50+
- Memory bloat in metrics backends (Axiom, Prometheus, etc.)
51+
- Slow queries and dashboard timeouts
52+
- Increased costs (many backends charge per time series)
53+
- Potential data loss or crashes at scale

internal-packages/run-engine/src/batch-queue/index.ts

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,7 @@ export class BatchQueue {
287287

288288
// Record metric
289289
this.batchesEnqueuedCounter?.add(1, {
290-
envId: options.environmentId,
291-
itemCount: options.runCount,
290+
environment_type: options.environmentType,
292291
streaming: true,
293292
});
294293

@@ -358,7 +357,7 @@ export class BatchQueue {
358357
});
359358

360359
// Record metric
361-
this.itemsEnqueuedCounter?.add(1, { envId });
360+
this.itemsEnqueuedCounter?.add(1, { environment_type: meta.environmentType });
362361

363362
this.logger.debug("Batch item enqueued", {
364363
batchId,
@@ -701,9 +700,8 @@ export class BatchQueue {
701700
"batch.attempt": storedMessage.attempt,
702701
});
703702

704-
// Record queue time metric (time from enqueue to processing)
703+
// Calculate queue time (time from enqueue to processing)
705704
const queueTimeMs = Date.now() - storedMessage.timestamp;
706-
this.itemQueueTimeHistogram?.record(queueTimeMs, { envId: storedMessage.tenantId });
707705
span?.setAttribute("batch.queueTimeMs", queueTimeMs);
708706

709707
this.logger.debug("Processing batch item", {
@@ -734,6 +732,9 @@ export class BatchQueue {
734732
return;
735733
}
736734

735+
// Record queue time metric (requires meta for environment_type)
736+
this.itemQueueTimeHistogram?.record(queueTimeMs, { environment_type: meta.environmentType });
737+
737738
span?.setAttributes({
738739
"batch.runCount": meta.runCount,
739740
"batch.environmentId": meta.environmentId,
@@ -769,7 +770,7 @@ export class BatchQueue {
769770
return this.completionTracker.recordSuccess(batchId, result.runId, itemIndex);
770771
});
771772

772-
this.itemsProcessedCounter?.add(1, { envId: meta.environmentId });
773+
this.itemsProcessedCounter?.add(1, { environment_type: meta.environmentType });
773774
this.logger.debug("Batch item processed successfully", {
774775
batchId,
775776
itemIndex,
@@ -808,7 +809,7 @@ export class BatchQueue {
808809
});
809810

810811
this.itemsFailedCounter?.add(1, {
811-
envId: meta.environmentId,
812+
environment_type: meta.environmentType,
812813
errorCode: result.errorCode,
813814
});
814815

@@ -848,7 +849,7 @@ export class BatchQueue {
848849
});
849850

850851
this.itemsFailedCounter?.add(1, {
851-
envId: meta.environmentId,
852+
environment_type: meta.environmentType,
852853
errorCode: "UNEXPECTED_ERROR",
853854
});
854855
this.logger.error("Unexpected error processing batch item", {
@@ -914,14 +915,13 @@ export class BatchQueue {
914915

915916
// Record metrics
916917
this.batchCompletedCounter?.add(1, {
917-
envId: meta.environmentId,
918+
environment_type: meta.environmentType,
918919
hasFailures: result.failedRunCount > 0,
919920
});
920921

921922
const processingDuration = Date.now() - meta.createdAt;
922923
this.batchProcessingDurationHistogram?.record(processingDuration, {
923-
envId: meta.environmentId,
924-
itemCount: meta.runCount,
924+
environment_type: meta.environmentType,
925925
});
926926

927927
span?.setAttribute("batch.processingDurationMs", processingDuration);

0 commit comments

Comments
 (0)