Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions common/metrics/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,20 @@ func (h *HistogramMigration) UnmarshalYAML(read func(any) error) error {
// This is likely best done in an `init` func, to ensure it happens early enough
// and does not race with config reading.
var HistogramMigrationMetrics = map[string]struct{}{
"task_latency_processing": {},
"task_latency_processing_ns": {},
"task_attempt": {},
"task_attempt_counts": {},
"task_attempt_per_domain": {},
"task_attempt_per_domain_counts": {},
"task_latency_per_domain": {},
"task_latency_per_domain_ns": {},
"task_latency_processing": {},
"task_latency_processing_ns": {},
"task_latency_queue": {},
"task_latency_queue_ns": {},
"task_latency_processing_per_domain": {},
"task_latency_processing_per_domain_ns": {},
"task_latency_queue_per_domain": {},
"task_latency_queue_per_domain_ns": {},

// Replication task processor histograms (PR #7685).
// Dual-emitted as timer + histogram.
Expand Down
78 changes: 45 additions & 33 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -2487,6 +2487,7 @@ const (
TaskFailures
TaskDiscarded
TaskAttemptTimer
ExponentialTaskAttemptCounts
TaskStandbyRetryCounter
TaskNotActiveCounter
TaskLimitExceededCounter
Expand All @@ -2495,6 +2496,7 @@ const (
TaskProcessingLatency
ExponentialTaskProcessingLatency
TaskQueueLatency
ExponentialTaskQueueLatency
ScheduleToStartHistoryQueueLatencyPerTaskList
TaskRequestsOldScheduler
TaskRequestsNewScheduler
Expand All @@ -2504,19 +2506,23 @@ const (

TaskRequestsPerDomain
TaskLatencyPerDomain
ExponentialTaskLatencyPerDomain
TaskFailuresPerDomain
TaskWorkflowBusyPerDomain
TaskDiscardedPerDomain
TaskUnsupportedPerDomain
TaskAttemptTimerPerDomain
ExponentialTaskAttemptCountsPerDomain
TaskStandbyRetryCounterPerDomain
TaskListNotOwnedByHostCounterPerDomain
TaskPendingActiveCounterPerDomain
TaskNotActiveCounterPerDomain
TaskTargetNotActiveCounterPerDomain
TaskLimitExceededCounterPerDomain
TaskProcessingLatencyPerDomain
ExponentialTaskProcessingLatencyPerDomain
TaskQueueLatencyPerDomain
ExponentialTaskQueueLatencyPerDomain
TaskScheduleLatencyPerDomain
TaskEnqueueToFetchLatency
TransferTaskMissingEventCounterPerDomain
Expand Down Expand Up @@ -3316,17 +3322,19 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
WeightedChannelPoolSizeGauge: {metricName: "weighted_channel_pool_size", metricType: Gauge},
},
History: {
TaskRequests: {metricName: "task_requests", metricType: Counter},
TaskLatency: {metricName: "task_latency", metricType: Timer},
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
TaskFailures: {metricName: "task_errors", metricType: Counter},
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
TaskRequests: {metricName: "task_requests", metricType: Counter},
TaskLatency: {metricName: "task_latency", metricType: Timer},
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
ExponentialTaskAttemptCounts: {metricName: "task_attempt_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
TaskFailures: {metricName: "task_errors", metricType: Counter},
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
ExponentialTaskQueueLatency: {metricName: "task_latency_queue_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
ScheduleToStartHistoryQueueLatencyPerTaskList: {metricName: "schedule_to_start_history_queue_latency_per_tl", metricType: Timer},
TaskRequestsOldScheduler: {metricName: "task_requests_old_scheduler", metricType: Counter},
TaskRequestsNewScheduler: {metricName: "task_requests_new_scheduler", metricType: Counter},
Expand All @@ -3336,28 +3344,32 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{

// per domain task metrics

TaskRequestsPerDomain: {metricName: "task_requests_per_domain", metricRollupName: "task_requests", metricType: Counter},
TaskLatencyPerDomain: {metricName: "task_latency_per_domain", metricRollupName: "task_latency", metricType: Timer},
TaskAttemptTimerPerDomain: {metricName: "task_attempt_per_domain", metricRollupName: "task_attempt", metricType: Timer},
TaskFailuresPerDomain: {metricName: "task_errors_per_domain", metricRollupName: "task_errors", metricType: Counter},
TaskWorkflowBusyPerDomain: {metricName: "task_errors_workflow_busy_per_domain", metricRollupName: "task_errors_workflow_busy", metricType: Counter},
TaskDiscardedPerDomain: {metricName: "task_errors_discarded_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
TaskUnsupportedPerDomain: {metricName: "task_errors_unsupported_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounterPerDomain: {metricName: "task_errors_standby_retry_counter_per_domain", metricRollupName: "task_errors_standby_retry_counter", metricType: Counter},
TaskListNotOwnedByHostCounterPerDomain: {metricName: "task_errors_task_list_not_owned_by_host_counter_per_domain", metricRollupName: "task_errors_task_list_not_owned_by_host_counter", metricType: Counter},
TaskPendingActiveCounterPerDomain: {metricName: "task_errors_pending_active_counter_per_domain", metricRollupName: "task_errors_pending_active_counter", metricType: Counter},
TaskNotActiveCounterPerDomain: {metricName: "task_errors_not_active_counter_per_domain", metricRollupName: "task_errors_not_active_counter", metricType: Counter},
TaskTargetNotActiveCounterPerDomain: {metricName: "task_errors_target_not_active_counter_per_domain", metricRollupName: "task_errors_target_not_active_counter", metricType: Counter},
TaskLimitExceededCounterPerDomain: {metricName: "task_errors_limit_exceeded_counter_per_domain", metricRollupName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain", metricRollupName: "task_latency_processing", metricType: Timer},
TaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain", metricRollupName: "task_latency_queue", metricType: Timer},
TaskScheduleLatencyPerDomain: {metricName: "task_latency_schedule_per_domain", metricRollupName: "task_latency_schedule", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
TaskEnqueueToFetchLatency: {metricName: "task_latency_enqueue_to_fetch", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
TransferTaskMissingEventCounterPerDomain: {metricName: "transfer_task_missing_event_counter_per_domain", metricRollupName: "transfer_task_missing_event_counter", metricType: Counter},
ReplicationTasksAppliedPerDomain: {metricName: "replication_tasks_applied_per_domain", metricType: Counter},
WorkflowTerminateCounterPerDomain: {metricName: "workflow_terminate_counter_per_domain", metricRollupName: "workflow_terminate_counter", metricType: Counter},
TaskSchedulerAllowedCounterPerDomain: {metricName: "task_scheduler_allowed_counter_per_domain", metricRollupName: "task_scheduler_allowed_counter", metricType: Counter},
TaskSchedulerThrottledCounterPerDomain: {metricName: "task_scheduler_throttled_counter_per_domain", metricRollupName: "task_scheduler_throttled_counter", metricType: Counter},
TaskRequestsPerDomain: {metricName: "task_requests_per_domain", metricRollupName: "task_requests", metricType: Counter},
TaskLatencyPerDomain: {metricName: "task_latency_per_domain", metricRollupName: "task_latency", metricType: Timer},
ExponentialTaskLatencyPerDomain: {metricName: "task_latency_per_domain_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
TaskAttemptTimerPerDomain: {metricName: "task_attempt_per_domain", metricRollupName: "task_attempt", metricType: Timer},
ExponentialTaskAttemptCountsPerDomain: {metricName: "task_attempt_per_domain_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
TaskFailuresPerDomain: {metricName: "task_errors_per_domain", metricRollupName: "task_errors", metricType: Counter},
TaskWorkflowBusyPerDomain: {metricName: "task_errors_workflow_busy_per_domain", metricRollupName: "task_errors_workflow_busy", metricType: Counter},
TaskDiscardedPerDomain: {metricName: "task_errors_discarded_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
TaskUnsupportedPerDomain: {metricName: "task_errors_unsupported_per_domain", metricRollupName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounterPerDomain: {metricName: "task_errors_standby_retry_counter_per_domain", metricRollupName: "task_errors_standby_retry_counter", metricType: Counter},
TaskListNotOwnedByHostCounterPerDomain: {metricName: "task_errors_task_list_not_owned_by_host_counter_per_domain", metricRollupName: "task_errors_task_list_not_owned_by_host_counter", metricType: Counter},
TaskPendingActiveCounterPerDomain: {metricName: "task_errors_pending_active_counter_per_domain", metricRollupName: "task_errors_pending_active_counter", metricType: Counter},
TaskNotActiveCounterPerDomain: {metricName: "task_errors_not_active_counter_per_domain", metricRollupName: "task_errors_not_active_counter", metricType: Counter},
TaskTargetNotActiveCounterPerDomain: {metricName: "task_errors_target_not_active_counter_per_domain", metricRollupName: "task_errors_target_not_active_counter", metricType: Counter},
TaskLimitExceededCounterPerDomain: {metricName: "task_errors_limit_exceeded_counter_per_domain", metricRollupName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain", metricRollupName: "task_latency_processing", metricType: Timer},
ExponentialTaskProcessingLatencyPerDomain: {metricName: "task_latency_processing_per_domain_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
TaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain", metricRollupName: "task_latency_queue", metricType: Timer},
ExponentialTaskQueueLatencyPerDomain: {metricName: "task_latency_queue_per_domain_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
TaskScheduleLatencyPerDomain: {metricName: "task_latency_schedule_per_domain", metricRollupName: "task_latency_schedule", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
TaskEnqueueToFetchLatency: {metricName: "task_latency_enqueue_to_fetch", metricType: Histogram, buckets: HistoryTaskLatencyBuckets},
TransferTaskMissingEventCounterPerDomain: {metricName: "transfer_task_missing_event_counter_per_domain", metricRollupName: "transfer_task_missing_event_counter", metricType: Counter},
ReplicationTasksAppliedPerDomain: {metricName: "replication_tasks_applied_per_domain", metricType: Counter},
WorkflowTerminateCounterPerDomain: {metricName: "workflow_terminate_counter_per_domain", metricRollupName: "workflow_terminate_counter", metricType: Counter},
TaskSchedulerAllowedCounterPerDomain: {metricName: "task_scheduler_allowed_counter_per_domain", metricRollupName: "task_scheduler_allowed_counter", metricType: Counter},
TaskSchedulerThrottledCounterPerDomain: {metricName: "task_scheduler_throttled_counter_per_domain", metricRollupName: "task_scheduler_throttled_counter", metricType: Counter},

TaskBatchCompleteCounter: {metricName: "task_batch_complete_counter", metricType: Counter},
TaskBatchCompleteFailure: {metricName: "task_batch_complete_error", metricType: Counter},
Expand Down
24 changes: 18 additions & 6 deletions common/metrics/scope_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,12 +29,24 @@ func TestHistogramMode(t *testing.T) {
})

HistogramMigrationMetrics = map[string]struct{}{
findName(CadenceLatency): {},
findName(ExponentialReplicationTaskLatency): {},
findName(PersistenceLatencyPerShard): {},
findName(ExponentialTaskProcessingLatency): {},
findName(PersistenceLatency): {},
findName(PersistenceLatencyHistogram): {},
findName(CadenceLatency): {},
findName(ExponentialReplicationTaskLatency): {},
findName(PersistenceLatencyPerShard): {},
findName(ExponentialTaskProcessingLatency): {},
findName(PersistenceLatency): {},
findName(PersistenceLatencyHistogram): {},
findName(TaskAttemptTimer): {},
findName(ExponentialTaskAttemptCounts): {},
findName(TaskQueueLatency): {},
findName(ExponentialTaskQueueLatency): {},
findName(TaskLatencyPerDomain): {},
findName(ExponentialTaskLatencyPerDomain): {},
findName(TaskAttemptTimerPerDomain): {},
findName(ExponentialTaskAttemptCountsPerDomain): {},
findName(TaskProcessingLatencyPerDomain): {},
findName(ExponentialTaskProcessingLatencyPerDomain): {},
findName(TaskQueueLatencyPerDomain): {},
findName(ExponentialTaskQueueLatencyPerDomain): {},
}

c := NewClient(ts, History, HistogramMigration{
Expand Down
7 changes: 7 additions & 0 deletions service/history/task/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ func (t *taskImpl) Execute() error {
defer func() {
t.scope.IncCounter(metrics.TaskRequestsPerDomain)
t.scope.RecordTimer(metrics.TaskProcessingLatencyPerDomain, time.Since(executionStartTime))
t.scope.ExponentialHistogram(metrics.ExponentialTaskProcessingLatencyPerDomain, time.Since(executionStartTime))
}()
executeResponse, err := t.taskExecutor.Execute(t)
t.scope = executeResponse.Scope
Expand Down Expand Up @@ -236,6 +237,7 @@ func (t *taskImpl) HandleErr(err error) (retErr error) {
t.attempt++
if t.attempt > t.criticalRetryCount() {
t.scope.RecordTimer(metrics.TaskAttemptTimerPerDomain, time.Duration(t.attempt))
t.scope.IntExponentialHistogram(metrics.ExponentialTaskAttemptCountsPerDomain, t.attempt)
logger.Error("Critical error processing task, retrying.",
tag.Error(err),
tag.OperationCritical,
Expand Down Expand Up @@ -371,9 +373,14 @@ func (t *taskImpl) Ack() {

t.state = ctask.TaskStateAcked
if t.shouldProcessTask {
// Record the attempt count as a duration (time.Duration(t.attempt), i.e. that many nanoseconds) so the timer's mean ≈ the average attempt count.
t.scope.RecordTimer(metrics.TaskAttemptTimerPerDomain, time.Duration(t.attempt))
// Also emit the attempt count as an integer histogram, using IntExponentialHistogram with the Mid1To16k buckets (1–64k).
t.scope.IntExponentialHistogram(metrics.ExponentialTaskAttemptCountsPerDomain, t.attempt)
t.scope.RecordTimer(metrics.TaskLatencyPerDomain, time.Since(t.initialSubmitTime))
t.scope.ExponentialHistogram(metrics.ExponentialTaskLatencyPerDomain, time.Since(t.initialSubmitTime))
t.scope.RecordTimer(metrics.TaskQueueLatencyPerDomain, time.Since(t.GetVisibilityTimestamp()))
t.scope.ExponentialHistogram(metrics.ExponentialTaskQueueLatencyPerDomain, time.Since(t.GetVisibilityTimestamp()))

}

Expand Down
Loading