diff --git a/common/metrics/config.go b/common/metrics/config.go index 700cbd6318b..8374c980f51 100644 --- a/common/metrics/config.go +++ b/common/metrics/config.go @@ -47,7 +47,7 @@ var HistogramMigrationMetrics = map[string]struct{}{ // Replication task processor histograms (PR #7685). // Dual-emitted as timer + histogram. "replication_tasks_lag": {}, - "replication_tasks_lag_ns": {}, + "replication_tasks_lag_counts": {}, "replication_tasks_applied_latency": {}, "replication_tasks_applied_latency_ns": {}, diff --git a/common/metrics/defs.go b/common/metrics/defs.go index f553f822031..9f0596518f0 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -3541,7 +3541,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ ReplicationTasksApplied: {metricName: "replication_tasks_applied", metricType: Counter}, ReplicationTasksFailed: {metricName: "replication_tasks_failed", metricType: Counter}, ReplicationTasksLag: {metricName: "replication_tasks_lag", metricType: Timer}, - ExponentialReplicationTasksLag: {metricName: "replication_tasks_lag_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h}, + ExponentialReplicationTasksLag: {metricName: "replication_tasks_lag_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k}, ReplicationTasksLagRaw: {metricName: "replication_tasks_lag_raw", metricType: Timer}, ReplicationTasksDelay: {metricName: "replication_tasks_delay", metricType: Histogram, buckets: ReplicationTaskDelayBucket}, ReplicationTasksFetched: {metricName: "replication_tasks_fetched", metricType: Timer}, diff --git a/service/history/replication/task_processor.go b/service/history/replication/task_processor.go index 215fb1a4b5e..44e3888690e 100644 --- a/service/history/replication/task_processor.go +++ b/service/history/replication/task_processor.go @@ -278,12 +278,12 @@ func (p *taskProcessorImpl) cleanupAckedReplicationTasks() error { persistence.HistoryTaskCategoryReplication, p.currentCluster, ).GetTaskID() - lag := time.Duration(maxReadLevel - minAckLevel) + lagCount := int(maxReadLevel - minAckLevel) scope := p.metricsClient.Scope(metrics.ReplicationTaskFetcherScope, metrics.TargetClusterTag(p.currentCluster), ) - scope.RecordTimer(metrics.ReplicationTasksLag, lag) - scope.ExponentialHistogram(metrics.ExponentialReplicationTasksLag, lag) + scope.RecordTimer(metrics.ReplicationTasksLag, time.Duration(lagCount)) + scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksLag, lagCount) for { pageSize := p.config.ReplicatorTaskDeleteBatchSize() resp, err := p.shard.GetExecutionManager().RangeCompleteHistoryTask(