Skip to content

Commit 1cf01e4

Browse files
authored
feat: enable dual emitting for task attempt and latency related metrics (#7743)
<!-- 1-2 line summary of WHAT changed technically: - Always link the relevant projects GitHub issue, unless it is a minor bugfix - Good: "Modified FailoverDomain mapper to allow null ActiveClusterName #320" - Bad: "added nil check" --> **What changed?** Start dual emitting timer and histogram metrics for task latency and attempt metrics #7741 <!-- Your goal is to provide all the required context for a future maintainer to understand the reasons for making this change (see https://cbea.ms/git-commit/#why-not-how). How did this work previously (and what was wrong with it)? What has changed, and why did you solve it this way? - Good: "Active-active domains have independent cluster attributes per region. Previously, modifying cluster attributes required spedifying the default ActiveClusterName which updates the global domain default. This prevents operators from updating regional configurations without affecting the primary cluster designation. This change allows attribute updates to be independent of active cluster selection." - Bad: "Improves domain handling" --> **Why?** Timer -> Histogram migration <!-- Include specific test commands and setup. Please include the exact commands such that another maintainer or contributor can reproduce the test steps taken. - e.g Unit test commands with exact invocation `go test -v ./common/types/mapper/proto -run TestFailoverDomainRequest` - For integration tests include setup steps and test commands Example: "Started local server with `./cadence start`, then ran `make test_e2e`" - For local simulation testing include setup steps for the server and how you ran the tests - Good: Full commands that reviewers can copy-paste to verify - Bad: "Tested locally" or "Added tests" --> **How did you test it?** go test -v ./common/metrics <!-- If there are risks that the release engineer should know about document them here. For example: - Has an API/IDL been modified? Is it backwards/forwards compatible? If not, what are the repecussions? - Has a schema change been introduced? Is it possible to roll back? - Has a feature flag been re-used for a new purpose? - Is there a potential performance concern? Is the change modifying core task processing logic? - If truly N/A, you can mark it as such --> **Potential risks** Metrics storage increase <!-- If this PR completes a user facing feature or changes functionality add release notes here. Your release notes should allow a user and the release engineer to understand the changes with little context. Always ensure that the description contains a link to the relevant GitHub issue. --> **Release notes** <!-- Consider whether this change requires documentation updates in the Cadence-Docs repo - If yes: mention what needs updating (or link to docs PR in cadence-docs repo) - If in doubt, add a note about potential doc needs - Only mark N/A if you're certain no docs are affected --> **Documentation Changes** --- ## Reviewer Validation **PR Description Quality** (check these before reviewing code): - [x] **"What changed"** provides a clear 1-2 line summary - [x] Project Issue is linked - [x] **"Why"** explains the full motivation with sufficient context - [x] **Testing is documented:** - [x] Unit test commands are included (with exact `go test` invocation) - [ ] Integration test setup/commands included (if integration tests were run) - [ ] Canary testing details included (if canary was mentioned) - [x] **Potential risks** section is thoughtfully filled out (or legitimately N/A) - [x] **Release notes** included if this completes a user-facing feature - [x] **Documentation** needs are addressed (or noted if uncertain) --------- Signed-off-by: Neil Xie <neil.xie@uber.com>
1 parent a0ea940 commit 1cf01e4

File tree

4 files changed

+84
-41
lines changed

4 files changed

+84
-41
lines changed

common/metrics/config.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,20 @@ func (h *HistogramMigration) UnmarshalYAML(read func(any) error) error {
4141
// This is likely best done in an `init` func, to ensure it happens early enough
4242
// and does not race with config reading.
4343
var HistogramMigrationMetrics = map[string]struct{}{
44-
"task_latency_processing": {},
45-
"task_latency_processing_ns": {},
44+
"task_attempt": {},
45+
"task_attempt_counts": {},
46+
"task_attempt_per_domain": {},
47+
"task_attempt_per_domain_counts": {},
48+
"task_latency_per_domain": {},
49+
"task_latency_per_domain_ns": {},
50+
"task_latency_processing": {},
51+
"task_latency_processing_ns": {},
52+
"task_latency_queue": {},
53+
"task_latency_queue_ns": {},
54+
"task_latency_processing_per_domain": {},
55+
"task_latency_processing_per_domain_ns": {},
56+
"task_latency_queue_per_domain": {},
57+
"task_latency_queue_per_domain_ns": {},
4658

4759
// Replication task processor histograms (PR #7685).
4860
// Dual-emitted as timer + histogram.

common/metrics/defs.go

Lines changed: 45 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -2487,6 +2487,7 @@ const (
24872487
TaskFailures
24882488
TaskDiscarded
24892489
TaskAttemptTimer
2490+
ExponentialTaskAttemptCounts
24902491
TaskStandbyRetryCounter
24912492
TaskNotActiveCounter
24922493
TaskLimitExceededCounter
@@ -2495,6 +2496,7 @@ const (
24952496
TaskProcessingLatency
24962497
ExponentialTaskProcessingLatency
24972498
TaskQueueLatency
2499+
ExponentialTaskQueueLatency
24982500
ScheduleToStartHistoryQueueLatencyPerTaskList
24992501
TaskRequestsOldScheduler
25002502
TaskRequestsNewScheduler
@@ -2504,19 +2506,23 @@ const (
25042506

25052507
TaskRequestsPerDomain
25062508
TaskLatencyPerDomain
2509+
ExponentialTaskLatencyPerDomain
25072510
TaskFailuresPerDomain
25082511
TaskWorkflowBusyPerDomain
25092512
TaskDiscardedPerDomain
25102513
TaskUnsupportedPerDomain
25112514
TaskAttemptTimerPerDomain
2515+
ExponentialTaskAttemptCountsPerDomain
25122516
TaskStandbyRetryCounterPerDomain
25132517
TaskListNotOwnedByHostCounterPerDomain
25142518
TaskPendingActiveCounterPerDomain
25152519
TaskNotActiveCounterPerDomain
25162520
TaskTargetNotActiveCounterPerDomain
25172521
TaskLimitExceededCounterPerDomain
25182522
TaskProcessingLatencyPerDomain
2523+
ExponentialTaskProcessingLatencyPerDomain
25192524
TaskQueueLatencyPerDomain
2525+
ExponentialTaskQueueLatencyPerDomain
25202526
TaskScheduleLatencyPerDomain
25212527
TaskEnqueueToFetchLatency
25222528
TransferTaskMissingEventCounterPerDomain
@@ -3316,17 +3322,19 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
33163322
WeightedChannelPoolSizeGauge: {metricName: "weighted_channel_pool_size", metricType: Gauge},
33173323
},
33183324
History: {
3319-
TaskRequests: {metricName: "task_requests", metricType: Counter},
3320-
TaskLatency: {metricName: "task_latency", metricType: Timer},
3321-
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
3322-
TaskFailures: {metricName: "task_errors", metricType: Counter},
3323-
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
3324-
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
3325-
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
3326-
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
3327-
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
3328-
ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
3329-
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
3325+
TaskRequests: {metricName: "task_requests", metricType: Counter},
3326+
TaskLatency: {metricName: "task_latency", metricType: Timer},
3327+
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
3328+
ExponentialTaskAttemptCounts: {metricName: "task_attempt_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
3329+
TaskFailures: {metricName: "task_errors", metricType: Counter},
3330+
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
3331+
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
3332+
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
3333+
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
3334+
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
3335+
ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
3336+
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
3337+
ExponentialTaskQueueLatency: {metricName: "task_latency_queue_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
33303338
ScheduleToStartHistoryQueueLatencyPerTaskList: {metricName: "schedule_to_start_history_queue_latency_per_tl", metricType: Timer},
33313339
TaskRequestsOldScheduler: {metricName: "task_requests_old_scheduler", metricType: Counter},
33323340
TaskRequestsNewScheduler: {metricName: "task_requests_new_scheduler", metricType: Counter},
@@ -3336,28 +3344,32 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
33363344

33373345
// per domain task metrics
33383346

3339-
TaskRequestsPerDomain: {metricName: "task_requests_per_domain", metricRollupName: "task_requests", metricType: Counter},
3340-
TaskLatencyPerDomain: {metricName: "task_latency_per_domain", metricRollupName: "task_latency", metricType: Timer},
3341-
TaskAttemptTimerPerDomain: {metricName: "task_attempt_per_domain", metricRollupName: "task_attempt", metricType: Timer},
3342-
TaskFailuresPerDomain: {metricName: "task_errors_per_domain", metricRollupName: "task_errors", metricType: Counter},
3343-
TaskWorkflowBusyPerDomain: {metricName: "task_errors_workflow_busy_per_domain", metricRollupName: "task_errors_workflow_busy", metricType: Counter},
3344-
TaskDiscardedPerDomain: {metricName: "task_errors_discarded_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
3345-
TaskUnsupportedPerDomain: {metricName: "task_errors_unsupported_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
3346-
TaskStandbyRetryCounterPerDomain: {metricName: "task_errors_standby_retry_counter_per_domain", metricRollupName: "task_errors_standby_retry_counter", metricType: Counter},
3347-
TaskListNotOwnedByHostCounterPerDomain: {metricName: "task_errors_task_list_not_owned_by_host_counter_per_domain", metricRollupName: "task_errors_task_list_not_owned_by_host_counter", metricType: Counter},
3348-
TaskPendingActiveCounterPerDomain: {metricName: "task_errors_pending_active_counter_per_domain", metricRollupName: "task_errors_pending_active_counter", metricType: Counter},
3349-
TaskNotActiveCounterPerDomain: {metricName: "task_errors_not_active_counter_per_domain", metricRollupName: "task_errors_not_active_counter", metricType: Counter},
3350-
TaskTargetNotActiveCounterPerDomain: {metricName: "task_errors_target_not_active_counter_per_domain", metricRollupName: "task_errors_target_not_active_counter", metricType: Counter},
3351-
TaskLimitExceededCounterPerDomain: {metricName: "task_errors_limit_exceeded_counter_per_domain", metricRollupName: "task_errors_limit_exceeded_counter", metricType: Counter},
3352-
TaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain", metricRollupName: "task_latency_processing", metricType: Timer},
3353-
TaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain", metricRollupName: "task_latency_queue", metricType: Timer},
3354-
TaskScheduleLatencyPerDomain: {metricName: "task_latency_schedule_per_domain", metricRollupName: "task_latency_schedule", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
3355-
TaskEnqueueToFetchLatency: {metricName: "task_latency_enqueue_to_fetch", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
3356-
TransferTaskMissingEventCounterPerDomain: {metricName: "transfer_task_missing_event_counter_per_domain", metricRollupName: "transfer_task_missing_event_counter", metricType: Counter},
3357-
ReplicationTasksAppliedPerDomain: {metricName: "replication_tasks_applied_per_domain", metricType: Counter},
3358-
WorkflowTerminateCounterPerDomain: {metricName: "workflow_terminate_counter_per_domain", metricRollupName: "workflow_terminate_counter", metricType: Counter},
3359-
TaskSchedulerAllowedCounterPerDomain: {metricName: "task_scheduler_allowed_counter_per_domain", metricRollupName: "task_scheduler_allowed_counter", metricType: Counter},
3360-
TaskSchedulerThrottledCounterPerDomain: {metricName: "task_scheduler_throttled_counter_per_domain", metricRollupName: "task_scheduler_throttled_counter", metricType: Counter},
3347+
TaskRequestsPerDomain: {metricName: "task_requests_per_domain", metricRollupName: "task_requests", metricType: Counter},
3348+
TaskLatencyPerDomain: {metricName: "task_latency_per_domain", metricRollupName: "task_latency", metricType: Timer},
3349+
ExponentialTaskLatencyPerDomain: {metricName: "task_latency_per_domain_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
3350+
TaskAttemptTimerPerDomain: {metricName: "task_attempt_per_domain", metricRollupName: "task_attempt", metricType: Timer},
3351+
ExponentialTaskAttemptCountsPerDomain: {metricName: "task_attempt_per_domain_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
3352+
TaskFailuresPerDomain: {metricName: "task_errors_per_domain", metricRollupName: "task_errors", metricType: Counter},
3353+
TaskWorkflowBusyPerDomain: {metricName: "task_errors_workflow_busy_per_domain", metricRollupName: "task_errors_workflow_busy", metricType: Counter},
3354+
TaskDiscardedPerDomain: {metricName: "task_errors_discarded_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
3355+
TaskUnsupportedPerDomain: {metricName: "task_errors_unsupported_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
3356+
TaskStandbyRetryCounterPerDomain: {metricName: "task_errors_standby_retry_counter_per_domain", metricRollupName: "task_errors_standby_retry_counter", metricType: Counter},
3357+
TaskListNotOwnedByHostCounterPerDomain: {metricName: "task_errors_task_list_not_owned_by_host_counter_per_domain", metricRollupName: "task_errors_task_list_not_owned_by_host_counter", metricType: Counter},
3358+
TaskPendingActiveCounterPerDomain: {metricName: "task_errors_pending_active_counter_per_domain", metricRollupName: "task_errors_pending_active_counter", metricType: Counter},
3359+
TaskNotActiveCounterPerDomain: {metricName: "task_errors_not_active_counter_per_domain", metricRollupName: "task_errors_not_active_counter", metricType: Counter},
3360+
TaskTargetNotActiveCounterPerDomain: {metricName: "task_errors_target_not_active_counter_per_domain", metricRollupName: "task_errors_target_not_active_counter", metricType: Counter},
3361+
TaskLimitExceededCounterPerDomain: {metricName: "task_errors_limit_exceeded_counter_per_domain", metricRollupName: "task_errors_limit_exceeded_counter", metricType: Counter},
3362+
TaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain", metricRollupName: "task_latency_processing", metricType: Timer},
3363+
ExponentialTaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
3364+
TaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain", metricRollupName: "task_latency_queue", metricType: Timer},
3365+
ExponentialTaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
3366+
TaskScheduleLatencyPerDomain: {metricName: "task_latency_schedule_per_domain", metricRollupName: "task_latency_schedule", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
3367+
TaskEnqueueToFetchLatency: {metricName: "task_latency_enqueue_to_fetch", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
3368+
TransferTaskMissingEventCounterPerDomain: {metricName: "transfer_task_missing_event_counter_per_domain", metricRollupName: "transfer_task_missing_event_counter", metricType: Counter},
3369+
ReplicationTasksAppliedPerDomain: {metricName: "replication_tasks_applied_per_domain", metricType: Counter},
3370+
WorkflowTerminateCounterPerDomain: {metricName: "workflow_terminate_counter_per_domain", metricRollupName: "workflow_terminate_counter", metricType: Counter},
3371+
TaskSchedulerAllowedCounterPerDomain: {metricName: "task_scheduler_allowed_counter_per_domain", metricRollupName: "task_scheduler_allowed_counter", metricType: Counter},
3372+
TaskSchedulerThrottledCounterPerDomain: {metricName: "task_scheduler_throttled_counter_per_domain", metricRollupName: "task_scheduler_throttled_counter", metricType: Counter},
33613373

33623374
TaskBatchCompleteCounter: {metricName: "task_batch_complete_counter", metricType: Counter},
33633375
TaskBatchCompleteFailure: {metricName: "task_batch_complete_error", metricType: Counter},

common/metrics/scope_test.go

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,24 @@ func TestHistogramMode(t *testing.T) {
2929
})
3030

3131
HistogramMigrationMetrics = map[string]struct{}{
32-
findName(CadenceLatency): {},
33-
findName(ExponentialReplicationTaskLatency): {},
34-
findName(PersistenceLatencyPerShard): {},
35-
findName(ExponentialTaskProcessingLatency): {},
36-
findName(PersistenceLatency): {},
37-
findName(PersistenceLatencyHistogram): {},
32+
findName(CadenceLatency): {},
33+
findName(ExponentialReplicationTaskLatency): {},
34+
findName(PersistenceLatencyPerShard): {},
35+
findName(ExponentialTaskProcessingLatency): {},
36+
findName(PersistenceLatency): {},
37+
findName(PersistenceLatencyHistogram): {},
38+
findName(TaskAttemptTimer): {},
39+
findName(ExponentialTaskAttemptCounts): {},
40+
findName(TaskQueueLatency): {},
41+
findName(ExponentialTaskQueueLatency): {},
42+
findName(TaskLatencyPerDomain): {},
43+
findName(ExponentialTaskLatencyPerDomain): {},
44+
findName(TaskAttemptTimerPerDomain): {},
45+
findName(ExponentialTaskAttemptCountsPerDomain): {},
46+
findName(TaskProcessingLatencyPerDomain): {},
47+
findName(ExponentialTaskProcessingLatencyPerDomain): {},
48+
findName(TaskQueueLatencyPerDomain): {},
49+
findName(ExponentialTaskQueueLatencyPerDomain): {},
3850
}
3951

4052
c := NewClient(ts, History, HistogramMigration{

service/history/task/task.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ func (t *taskImpl) Execute() error {
203203
defer func() {
204204
t.scope.IncCounter(metrics.TaskRequestsPerDomain)
205205
t.scope.RecordTimer(metrics.TaskProcessingLatencyPerDomain, time.Since(executionStartTime))
206+
t.scope.ExponentialHistogram(metrics.ExponentialTaskProcessingLatencyPerDomain, time.Since(executionStartTime))
206207
}()
207208
executeResponse, err := t.taskExecutor.Execute(t)
208209
t.scope = executeResponse.Scope
@@ -236,6 +237,7 @@ func (t *taskImpl) HandleErr(err error) (retErr error) {
236237
t.attempt++
237238
if t.attempt > t.criticalRetryCount() {
238239
t.scope.RecordTimer(metrics.TaskAttemptTimerPerDomain, time.Duration(t.attempt))
240+
t.scope.IntExponentialHistogram(metrics.ExponentialTaskAttemptCountsPerDomain, t.attempt)
239241
logger.Error("Critical error processing task, retrying.",
240242
tag.Error(err),
241243
tag.OperationCritical,
@@ -371,9 +373,14 @@ func (t *taskImpl) Ack() {
371373

372374
t.state = ctask.TaskStateAcked
373375
if t.shouldProcessTask {
376+
// Record attempt count as duration so timer mean ≈ average attempt count.
374377
t.scope.RecordTimer(metrics.TaskAttemptTimerPerDomain, time.Duration(t.attempt))
378+
// Use IntExponentialHistogram with Mid1To16k buckets (1–64k) for attempt counts
379+
t.scope.IntExponentialHistogram(metrics.ExponentialTaskAttemptCountsPerDomain, t.attempt)
375380
t.scope.RecordTimer(metrics.TaskLatencyPerDomain, time.Since(t.initialSubmitTime))
381+
t.scope.ExponentialHistogram(metrics.ExponentialTaskLatencyPerDomain, time.Since(t.initialSubmitTime))
376382
t.scope.RecordTimer(metrics.TaskQueueLatencyPerDomain, time.Since(t.GetVisibilityTimestamp()))
383+
t.scope.ExponentialHistogram(metrics.ExponentialTaskQueueLatencyPerDomain, time.Since(t.GetVisibilityTimestamp()))
377384

378385
}
379386

0 commit comments

Comments
 (0)