Skip to content

Commit ed92900

Browse files
committed
feat: Dual emitting timer and histogram metrics
1 parent 57efcc2 commit ed92900

File tree

12 files changed

+899
-32
lines changed

12 files changed

+899
-32
lines changed
Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
/*
2+
* Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
*
4+
* Modifications copyright (C) 2017 Uber Technologies, Inc.
5+
*
6+
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
7+
* use this file except in compliance with the License. A copy of the License is
8+
* located at
9+
*
10+
* http://aws.amazon.com/apache2.0
11+
*
12+
* or in the "license" file accompanying this file. This file is distributed on
13+
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
14+
* express or implied. See the License for the specific language governing
15+
* permissions and limitations under the License.
16+
*/
17+
18+
package com.uber.cadence.internal.metrics;
19+
20+
import com.uber.m3.tally.DurationBuckets;
21+
import com.uber.m3.util.Duration;
22+
import java.util.concurrent.TimeUnit;
23+
24+
/**
25+
* Histogram bucket configurations for timer metrics migration.
26+
*
27+
* <p>This class defines standard histogram bucket configurations used during the migration from
28+
* timers to histograms. These buckets provide consistent granularity for measuring latencies across
29+
* different time ranges.
30+
*
31+
* <p>Note: Unlike the Go client which uses subsettable exponential histograms with algorithmic
32+
* bucket generation, the Java client uses explicit bucket definitions. We provide multiple
33+
* configurations to balance between granularity and cardinality:
34+
*
35+
* <ul>
36+
* <li><b>DEFAULT_1MS_100S</b>: Most common metrics (46 buckets, 1ms-100s)
37+
* <li><b>LOW_1MS_100S</b>: High-cardinality metrics (16 buckets, 1ms-100s)
38+
* <li><b>HIGH_1MS_24H</b>: Long-running operations (27 buckets, 1ms-24h)
39+
* <li><b>MID_1MS_24H</b>: High-cardinality long operations (14 buckets, 1ms-24h)
40+
* </ul>
41+
*/
42+
public final class HistogramBuckets {
43+
44+
/**
45+
* Default bucket configuration for most client-side latency metrics.
46+
*
47+
* <p>Range: 1ms to 100s
48+
*
49+
* <p>Provides: - Fine-grained buckets (1ms steps) from 1ms to 10ms - Medium-grained buckets (10ms
50+
* steps) from 10ms to 100ms - Coarser buckets (100ms steps) from 100ms to 1s - Second-level
51+
* buckets from 1s to 100s
52+
*
53+
* <p>Use for: - Decision poll latency - Activity poll latency - Decision execution latency -
54+
* Activity execution latency - Workflow replay latency - Most RPC call latencies
55+
*/
56+
public static final DurationBuckets DEFAULT_1MS_100S =
57+
DurationBuckets.custom(
58+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), // 1ms
59+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
60+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(3)),
61+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(4)),
62+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
63+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(6)),
64+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(7)),
65+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(8)),
66+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(9)),
67+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
68+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
69+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(30)),
70+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(40)),
71+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
72+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(60)),
73+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(70)),
74+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(80)),
75+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(90)),
76+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
77+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
78+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(300)),
79+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(400)),
80+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
81+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(600)),
82+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(700)),
83+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(800)),
84+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(900)),
85+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
86+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
87+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(3)),
88+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(4)),
89+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
90+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(6)),
91+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(7)),
92+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(8)),
93+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(9)),
94+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
95+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
96+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
97+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(40)),
98+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)),
99+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)),
100+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(70)),
101+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(80)),
102+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(90)),
103+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(100)));
104+
105+
/**
106+
* Low-resolution bucket configuration for high-cardinality metrics.
107+
*
108+
* <p>Range: 1ms to 100s (same as DEFAULT_1MS_100S but with fewer buckets)
109+
*
110+
* <p>Provides: - Coarser buckets with ~2x steps instead of fine-grained steps - Approximately
111+
* half the cardinality of DEFAULT_1MS_100S
112+
*
113+
* <p>Use for: - Per-activity-type metrics where cardinality is high - Per-workflow-type metrics
114+
* where cardinality is high - Metrics with many tag combinations
115+
*/
116+
public static final DurationBuckets LOW_1MS_100S =
117+
DurationBuckets.custom(
118+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
119+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
120+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
121+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
122+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
123+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
124+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
125+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
126+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
127+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
128+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
129+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
130+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
131+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
132+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)),
133+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(100)));
134+
135+
/**
136+
* High-resolution bucket configuration for long-running operations.
137+
*
138+
* <p>Range: 1ms to 24 hours
139+
*
140+
* <p>Provides: - Fine-grained buckets from 1ms to 10ms - Medium-grained from 10ms to 1s -
141+
* Second-level buckets from 1s to 10 minutes - Minute-level buckets from 10 minutes to 24 hours
142+
*
143+
* <p>Use for: - Workflow end-to-end latency - Long-running activity execution latency - Multi-day
144+
* operation metrics
145+
*/
146+
public static final DurationBuckets HIGH_1MS_24H =
147+
DurationBuckets.custom(
148+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
149+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
150+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
151+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
152+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
153+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
154+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
155+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
156+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
157+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
158+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
159+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
160+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
161+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
162+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
163+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)),
164+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(120)), // 2 min
165+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(300)), // 5 min
166+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(600)), // 10 min
167+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(20)),
168+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)),
169+
Duration.ofNanos(TimeUnit.HOURS.toNanos(1)),
170+
Duration.ofNanos(TimeUnit.HOURS.toNanos(2)),
171+
Duration.ofNanos(TimeUnit.HOURS.toNanos(4)),
172+
Duration.ofNanos(TimeUnit.HOURS.toNanos(8)),
173+
Duration.ofNanos(TimeUnit.HOURS.toNanos(12)),
174+
Duration.ofNanos(TimeUnit.HOURS.toNanos(24)));
175+
176+
/**
177+
* Medium-resolution bucket configuration for long-running operations.
178+
*
179+
* <p>Range: 1ms to 24 hours (same as HIGH_1MS_24H but with fewer buckets)
180+
*
181+
* <p>Provides: - Coarser buckets than HIGH_1MS_24H - Better for high-cardinality long-duration
182+
* metrics
183+
*
184+
* <p>Use for: - When HIGH_1MS_24H's cardinality is too high - Per-workflow-type E2E latency with
185+
* many workflow types
186+
*/
187+
public static final DurationBuckets MID_1MS_24H =
188+
DurationBuckets.custom(
189+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
190+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
191+
Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
192+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
193+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
194+
Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
195+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(1)),
196+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(5)),
197+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(10)),
198+
Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)),
199+
Duration.ofNanos(TimeUnit.HOURS.toNanos(1)),
200+
Duration.ofNanos(TimeUnit.HOURS.toNanos(4)),
201+
Duration.ofNanos(TimeUnit.HOURS.toNanos(12)),
202+
Duration.ofNanos(TimeUnit.HOURS.toNanos(24)));
203+
204+
private HistogramBuckets() {
205+
// Utility class - prevent instantiation
206+
}
207+
}
Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
# Timer to Histogram Migration
2+
3+
## Overview
4+
5+
This document describes the migration from timer metrics to histogram metrics in the Cadence Java client. The migration uses a dual-emit pattern where **both timer and histogram metrics are always emitted**, allowing for gradual migration of dashboards and alerts without requiring a coordinated flag day.
6+
7+
## Why Migrate?
8+
9+
Timers and histograms serve similar purposes (measuring latencies and durations) but have different characteristics:
10+
11+
- **Timers**: Legacy approach, currently used throughout the codebase
12+
- **Histograms**: More flexible, better support for custom buckets and percentile calculations
13+
14+
## Migration Strategy
15+
16+
### Phase 1: Dual Emission (Current)
17+
18+
Both timer and histogram metrics are emitted simultaneously:
19+
20+
```java
21+
// Old code:
22+
Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start();
23+
// ... do work ...
24+
sw.stop();
25+
26+
// New code (dual emit):
27+
DualStopwatch sw = MetricsEmit.startLatency(
28+
scope,
29+
MetricsType.DECISION_POLL_LATENCY,
30+
HistogramBuckets.DEFAULT_1MS_100S
31+
);
32+
// ... do work ...
33+
sw.stop(); // Records to BOTH timer and histogram
34+
```
35+
36+
### Phase 2: Dashboard/Alert Migration (Next)
37+
38+
Update all dashboards and alerts to use histogram metrics instead of timer metrics. This can be done gradually since both are being emitted.
39+
40+
### Phase 3: Remove Timer Emission (Future)
41+
42+
Once all dashboards/alerts are migrated, remove timer emission:
43+
44+
```java
45+
// Future code (histogram only):
46+
Stopwatch sw = scope.histogram(
47+
MetricsType.DECISION_POLL_LATENCY,
48+
HistogramBuckets.DEFAULT_1MS_100S
49+
).start();
50+
// ... do work ...
51+
sw.stop();
52+
```
53+
54+
## Helper Classes
55+
56+
### HistogramBuckets
57+
58+
Defines standard bucket configurations:
59+
60+
- `DEFAULT_1MS_100S`: For most latency measurements (1ms to 100s range)
61+
- Fine-grained: 1ms steps from 1-10ms
62+
- Medium-grained: 10ms steps from 10-100ms
63+
- Coarse: 100ms steps from 100ms-1s
64+
- Second-level: 1s steps from 1-10s, then 10s steps from 10-100s
65+
- Use for: Most RPC calls, decision/activity poll, execution latencies
66+
67+
- `LOW_1MS_100S`: Low-resolution version for high-cardinality metrics (1ms to 100s)
68+
- Roughly one third the buckets of DEFAULT_1MS_100S (16 vs. 46)
69+
- Use for: Per-activity-type, per-workflow-type metrics with high cardinality
70+
71+
- `HIGH_1MS_24H`: For long-running operations (1ms to 24 hours)
72+
- Extended range for multi-hour workflows
73+
- Use for: Workflow end-to-end latency, long-running activities
74+
75+
- `MID_1MS_24H`: Lower-resolution version of HIGH_1MS_24H
76+
- Fewer buckets than HIGH_1MS_24H
77+
- Use for: When HIGH_1MS_24H's cardinality is too high
78+
79+
### MetricsEmit
80+
81+
Provides dual-emit helper methods:
82+
83+
- `emitLatency(scope, name, duration, buckets)`: Directly record a duration
84+
- `startLatency(scope, name, buckets)`: Create a dual stopwatch
85+
86+
### DualStopwatch
87+
88+
A stopwatch wrapper that records to both timer and histogram when `.stop()` is called.
89+
90+
## Migration Checklist
91+
92+
For each timer metric:
93+
94+
1. ✅ Identify the timer usage (e.g., `scope.timer(name).start()`)
95+
2. ✅ Replace with `MetricsEmit.startLatency(scope, name, buckets)`
96+
3. ✅ Choose appropriate bucket configuration (typically `HistogramBuckets.DEFAULT_1MS_100S`)
97+
4. ✅ Verify both metrics are being emitted
98+
5. ⏳ Update dashboards to use histogram metric
99+
6. ⏳ Update alerts to use histogram metric
100+
7. ⏳ (Future) Remove timer emission
101+
102+
## Example Conversions
103+
104+
### Example 1: Poll Latency
105+
106+
```java
107+
// Before:
108+
Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start();
109+
PollForDecisionTaskResponse result = service.PollForDecisionTask(request);
110+
sw.stop();
111+
112+
// After:
113+
DualStopwatch sw = MetricsEmit.startLatency(
114+
scope,
115+
MetricsType.DECISION_POLL_LATENCY,
116+
HistogramBuckets.DEFAULT_1MS_100S
117+
);
118+
PollForDecisionTaskResponse result = service.PollForDecisionTask(request);
119+
sw.stop();
120+
```
121+
122+
### Example 2: Execution Latency
123+
124+
```java
125+
// Before:
126+
Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start();
127+
Result response = handler.handle(task, metricsScope, false);
128+
sw.stop();
129+
130+
// After:
131+
DualStopwatch sw = MetricsEmit.startLatency(
132+
metricsScope,
133+
MetricsType.ACTIVITY_EXEC_LATENCY,
134+
HistogramBuckets.DEFAULT_1MS_100S
135+
);
136+
Result response = handler.handle(task, metricsScope, false);
137+
sw.stop();
138+
```
139+
140+
### Example 3: Direct Duration Recording
141+
142+
```java
143+
// Before:
144+
Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime);
145+
scope.timer(MetricsType.DECISION_SCHEDULED_TO_START_LATENCY).record(scheduledToStartLatency);
146+
147+
// After:
148+
Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime);
149+
MetricsEmit.emitLatency(
150+
scope,
151+
MetricsType.DECISION_SCHEDULED_TO_START_LATENCY,
152+
scheduledToStartLatency,
153+
HistogramBuckets.DEFAULT_1MS_100S
154+
);
155+
```
156+
157+
## Testing
158+
159+
The migration preserves existing timer behavior while adding histogram emission, so:
160+
161+
- Existing timer-based tests continue to work
162+
- Existing timer-based dashboards/alerts continue to work
163+
- New histogram metrics are available for gradual migration
164+
165+
## Timeline
166+
167+
1. **Now**: Dual emission in place, both metrics available
168+
2. **Next Quarter**: Migrate dashboards and alerts to histograms
169+
3. **Future Release**: Remove timer emission, histogram-only
170+
171+
## Questions?
172+
173+
Contact the Cadence team for guidance on specific metrics or migration questions.

0 commit comments

Comments
 (0)