cadence-workflow
diff --git a/‎src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java‎
Lines changed: 207 additions & 0 deletions b/‎src/main/java/com/uber/cadence/internal/metrics/HistogramBuckets.java‎
Lines changed: 207 additions & 0 deletions
diff --git a/‎src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md‎
Lines changed: 173 additions & 0 deletions b/‎src/main/java/com/uber/cadence/internal/metrics/MIGRATION.md‎
Lines changed: 173 additions & 0 deletions
@@ -0,0 +1,207 @@
+/*
+ *  Copyright 2012-2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ *
+ *  Modifications copyright (C) 2017 Uber Technologies, Inc.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License"). You may not
+ *  use this file except in compliance with the License. A copy of the License is
+ *  located at
+ *
+ *  http://aws.amazon.com/apache2.0
+ *
+ *  or in the "license" file accompanying this file. This file is distributed on
+ *  an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ *  express or implied. See the License for the specific language governing
+ *  permissions and limitations under the License.
+ */
+
+package com.uber.cadence.internal.metrics;
+
+import com.uber.m3.tally.DurationBuckets;
+import com.uber.m3.util.Duration;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Histogram bucket configurations for timer metrics migration.
+ *
+ * <p>This class defines standard histogram bucket configurations used during the migration from
+ * timers to histograms. These buckets provide consistent granularity for measuring latencies across
+ * different time ranges.
+ *
+ * <p>Note: Unlike the Go client which uses subsettable exponential histograms with algorithmic
+ * bucket generation, the Java client uses explicit bucket definitions. We provide multiple
+ * configurations to balance between granularity and cardinality:
+ *
+ * <ul>
+ *   <li><b>DEFAULT_1MS_100S</b>: Most common metrics (46 buckets, 1ms-100s)
+ *   <li><b>LOW_1MS_100S</b>: High-cardinality metrics (16 buckets, 1ms-100s)
+ *   <li><b>HIGH_1MS_24H</b>: Long-running operations (27 buckets, 1ms-24h)
+ *   <li><b>MID_1MS_24H</b>: High-cardinality long operations (14 buckets, 1ms-24h)
+ * </ul>
+ */
+public final class HistogramBuckets {
+
+  /**
+   * Default bucket configuration for most client-side latency metrics.
+   *
+   * <p>Range: 1ms to 100s
+   *
+   * <p>Provides: - Fine-grained buckets (1ms steps) from 1ms to 10ms - Medium-grained buckets (10ms
+   * steps) from 10ms to 100ms - Coarser buckets (100ms steps) from 100ms to 1s - Second-level
+   * buckets from 1s to 100s
+   *
+   * <p>Use for: - Decision poll latency - Activity poll latency - Decision execution latency -
+   * Activity execution latency - Workflow replay latency - Most RPC call latencies
+   */
+  public static final DurationBuckets DEFAULT_1MS_100S =
+      DurationBuckets.custom(
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)), // 1ms
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(3)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(4)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(6)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(7)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(8)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(9)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(30)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(40)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(60)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(70)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(80)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(90)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(300)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(400)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(600)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(700)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(800)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(900)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(3)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(4)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(6)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(7)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(8)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(9)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(40)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(70)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(80)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(90)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(100)));
+
+  /**
+   * Low-resolution bucket configuration for high-cardinality metrics.
+   *
+   * <p>Range: 1ms to 100s (same as DEFAULT_1MS_100S but with fewer buckets)
+   *
+   * <p>Provides: - Coarser buckets with ~2x steps instead of fine-grained steps - Approximately
+   * half the cardinality of DEFAULT_1MS_100S
+   *
+   * <p>Use for: - Per-activity-type metrics where cardinality is high - Per-workflow-type metrics
+   * where cardinality is high - Metrics with many tag combinations
+   */
+  public static final DurationBuckets LOW_1MS_100S =
+      DurationBuckets.custom(
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(50)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(100)));
+
+  /**
+   * High-resolution bucket configuration for long-running operations.
+   *
+   * <p>Range: 1ms to 24 hours
+   *
+   * <p>Provides: - Fine-grained buckets from 1ms to 10ms - Medium-grained from 10ms to 1s -
+   * Second-level buckets from 1s to 10 minutes - Minute-level buckets from 10 minutes to 24 hours
+   *
+   * <p>Use for: - Workflow end-to-end latency - Long-running activity execution latency - Multi-day
+   * operation metrics
+   */
+  public static final DurationBuckets HIGH_1MS_24H =
+      DurationBuckets.custom(
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(50)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(200)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(500)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(5)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(20)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(60)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(120)), // 2 min
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(300)), // 5 min
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(600)), // 10 min
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(20)),
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(2)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(4)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(8)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(12)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(24)));
+
+  /**
+   * Medium-resolution bucket configuration for long-running operations.
+   *
+   * <p>Range: 1ms to 24 hours (same as HIGH_1MS_24H but with fewer buckets)
+   *
+   * <p>Provides: - Coarser buckets than HIGH_1MS_24H - Better for high-cardinality long-duration
+   * metrics
+   *
+   * <p>Use for: - When HIGH_1MS_24H's cardinality is too high - Per-workflow-type E2E latency with
+   * many workflow types
+   */
+  public static final DurationBuckets MID_1MS_24H =
+      DurationBuckets.custom(
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.MILLISECONDS.toNanos(100)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(10)),
+          Duration.ofNanos(TimeUnit.SECONDS.toNanos(30)),
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(1)),
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(5)),
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(10)),
+          Duration.ofNanos(TimeUnit.MINUTES.toNanos(30)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(1)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(4)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(12)),
+          Duration.ofNanos(TimeUnit.HOURS.toNanos(24)));
+
+  private HistogramBuckets() {
+    // Utility class - prevent instantiation
+  }
+}
@@ -0,0 +1,173 @@
+# Timer to Histogram Migration
+
+## Overview
+
+This document describes the migration from timer metrics to histogram metrics in the Cadence Java client. The migration uses a dual-emit pattern where **both timer and histogram metrics are always emitted**, allowing for gradual migration of dashboards and alerts without requiring a coordinated flag day.
+
+## Why Migrate?
+
+Timers and histograms serve similar purposes (measuring latencies and durations) but have different characteristics:
+
+- **Timers**: Legacy approach, currently used throughout the codebase
+- **Histograms**: More flexible, better support for custom buckets and percentile calculations
+
+## Migration Strategy
+
+### Phase 1: Dual Emission (Current)
+
+Both timer and histogram metrics are emitted simultaneously:
+
+```java
+// Old code:
+Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start();
+// ... do work ...
+sw.stop();
+
+// New code (dual emit):
+DualStopwatch sw = MetricsEmit.startLatency(
+    scope,
+    MetricsType.DECISION_POLL_LATENCY,
+    HistogramBuckets.DEFAULT_1MS_100S
+);
+// ... do work ...
+sw.stop(); // Records to BOTH timer and histogram
+```
+
+### Phase 2: Dashboard/Alert Migration (Next)
+
+Update all dashboards and alerts to use histogram metrics instead of timer metrics. This can be done gradually since both are being emitted.
+
+### Phase 3: Remove Timer Emission (Future)
+
+Once all dashboards/alerts are migrated, remove timer emission:
+
+```java
+// Future code (histogram only):
+Stopwatch sw = scope.histogram(
+    MetricsType.DECISION_POLL_LATENCY,
+    HistogramBuckets.DEFAULT_1MS_100S
+).start();
+// ... do work ...
+sw.stop();
+```
+
+## Helper Classes
+
+### HistogramBuckets
+
+Defines standard bucket configurations:
+
+- `DEFAULT_1MS_100S`: For most latency measurements (1ms to 100s range)
+  - Fine-grained: 1ms steps from 1-10ms
+  - Medium-grained: 10ms steps from 10-100ms
+  - Coarse: 100ms steps from 100ms-1s
+  - Second-level: 1s steps from 1-100s
+  - Use for: Most RPC calls, decision/activity poll, execution latencies
+
+- `LOW_1MS_100S`: Low-resolution version for high-cardinality metrics (1ms to 100s)
+  - Approximately half the buckets of DEFAULT_1MS_100S
+  - Use for: Per-activity-type, per-workflow-type metrics with high cardinality
+
+- `HIGH_1MS_24H`: For long-running operations (1ms to 24 hours)
+  - Extended range for multi-hour workflows
+  - Use for: Workflow end-to-end latency, long-running activities
+
+- `MID_1MS_24H`: Lower-resolution version of HIGH_1MS_24H
+  - Fewer buckets than HIGH_1MS_24H
+  - Use for: When HIGH_1MS_24H's cardinality is too high
+
+### MetricsEmit
+
+Provides dual-emit helper methods:
+
+- `emitLatency(scope, name, duration, buckets)`: Directly record a duration
+- `startLatency(scope, name, buckets)`: Create a dual stopwatch
+
+### DualStopwatch
+
+A stopwatch wrapper that records to both timer and histogram when `.stop()` is called.
+
+## Migration Checklist
+
+For each timer metric:
+
+1. ✅ Identify the timer usage (e.g., `scope.timer(name).start()`)
+2. ✅ Replace with `MetricsEmit.startLatency(scope, name, buckets)`
+3. ✅ Choose appropriate bucket configuration (typically `HistogramBuckets.DEFAULT_1MS_100S`)
+4. ✅ Verify both metrics are being emitted
+5. ⏳ Update dashboards to use histogram metric
+6. ⏳ Update alerts to use histogram metric
+7. ⏳ (Future) Remove timer emission
+
+## Example Conversions
+
+### Example 1: Poll Latency
+
+```java
+// Before:
+Stopwatch sw = scope.timer(MetricsType.DECISION_POLL_LATENCY).start();
+PollForDecisionTaskResponse result = service.PollForDecisionTask(request);
+sw.stop();
+
+// After:
+DualStopwatch sw = MetricsEmit.startLatency(
+    scope,
+    MetricsType.DECISION_POLL_LATENCY,
+    HistogramBuckets.DEFAULT_1MS_100S
+);
+PollForDecisionTaskResponse result = service.PollForDecisionTask(request);
+sw.stop();
+```
+
+### Example 2: Execution Latency
+
+```java
+// Before:
+Stopwatch sw = metricsScope.timer(MetricsType.ACTIVITY_EXEC_LATENCY).start();
+Result response = handler.handle(task, metricsScope, false);
+sw.stop();
+
+// After:
+DualStopwatch sw = MetricsEmit.startLatency(
+    metricsScope,
+    MetricsType.ACTIVITY_EXEC_LATENCY,
+    HistogramBuckets.DEFAULT_1MS_100S
+);
+Result response = handler.handle(task, metricsScope, false);
+sw.stop();
+```
+
+### Example 3: Direct Duration Recording
+
+```java
+// Before:
+Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime);
+scope.timer(MetricsType.DECISION_SCHEDULED_TO_START_LATENCY).record(scheduledToStartLatency);
+
+// After:
+Duration scheduledToStartLatency = Duration.between(scheduledTime, startedTime);
+MetricsEmit.emitLatency(
+    scope,
+    MetricsType.DECISION_SCHEDULED_TO_START_LATENCY,
+    scheduledToStartLatency,
+    HistogramBuckets.DEFAULT_1MS_100S
+);
+```
+
+## Testing
+
+The migration preserves existing timer behavior while adding histogram emission, so:
+
+- Existing timer-based tests continue to work
+- Existing timer-based dashboards/alerts continue to work
+- New histogram metrics are available for gradual migration
+
+## Timeline
+
+1. **Now**: Dual emission in place, both metrics available
+2. **Next Quarter**: Migrate dashboards and alerts to histograms
+3. **Future Release**: Remove timer emission, histogram-only
+
+## Questions?
+
+Contact the Cadence team for guidance on specific metrics or migration questions.