Commit 87ecb5a
fix(metrics): use _counts histogram for replication tasks lag (#7716)
**What changed?**
Updated replication task processor lag histogram emission to use integer `_counts` instead of duration/ns. Specifically, in `cleanupAckedReplicationTasks`, changed the `ExponentialReplicationTasksLag` emission from `ExponentialHistogram(..., lag)` to `IntExponentialHistogram(..., lagCount)`, and updated the metric definitions and migration allowlist names from `replication_tasks_lag_ns` to `replication_tasks_lag_counts`.

**Why?**
Per follow-up review, this metric represents queue depth/lag in number of tasks, not a time duration, so `_counts` is the correct histogram semantic. Previously, the code emitted `replication_tasks_lag_ns` with duration buckets, which could misrepresent the signal and make dashboards/alerts inconsistent with the actual units. This change keeps the timer emission for backward compatibility while making the histogram emission unit-correct for migration and analysis.

**How did you test it?**
go test ./service/history/replication/... -count=1
go test ./common/metrics/... -run TestHistogramMigration -count=1
make pr

**Potential risks**
Low to moderate metrics-consumer risk. No API/IDL or schema changes. The timer metric (`replication_tasks_lag`) is unchanged. The histogram metric name changed from `_ns` to `_counts`; any dashboards/alerts reading the old histogram name will need to move to `replication_tasks_lag_counts`.

**Release notes**
Internal metrics migration update: the replication task processor lag histogram now emits task-count based values via `replication_tasks_lag_counts` (integer histogram), while preserving the existing timer emission.

**Documentation Changes**
N/A for Cadence docs; internal dashboard/alert metric references should switch from `replication_tasks_lag_ns` to `replication_tasks_lag_counts`.

Signed-off-by: Diana Zawadzki <dzawa@live.de>
1 parent 63d68a2 commit 87ecb5a

File tree

3 files changed (+5 −5 lines changed)


common/metrics/config.go

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ var HistogramMigrationMetrics = map[string]struct{}{
 	// Replication task processor histograms (PR #7685).
 	// Dual-emitted as timer + histogram.
 	"replication_tasks_lag": {},
-	"replication_tasks_lag_ns": {},
+	"replication_tasks_lag_counts": {},
 	"replication_tasks_applied_latency": {},
 	"replication_tasks_applied_latency_ns": {},

common/metrics/defs.go

Lines changed: 1 addition & 1 deletion
@@ -3562,7 +3562,7 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
 	ReplicationTasksApplied: {metricName: "replication_tasks_applied", metricType: Counter},
 	ReplicationTasksFailed:  {metricName: "replication_tasks_failed", metricType: Counter},
 	ReplicationTasksLag:     {metricName: "replication_tasks_lag", metricType: Timer},
-	ExponentialReplicationTasksLag: {metricName: "replication_tasks_lag_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
+	ExponentialReplicationTasksLag: {metricName: "replication_tasks_lag_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
 	ReplicationTasksLagRaw: {metricName: "replication_tasks_lag_raw", metricType: Timer},
 	ReplicationTasksDelay:  {metricName: "replication_tasks_delay", metricType: Histogram, buckets: ReplicationTaskDelayBucket},
 	ReplicationTasksFetched: {metricName: "replication_tasks_fetched", metricType: Timer},

service/history/replication/task_processor.go

Lines changed: 3 additions & 3 deletions
@@ -278,12 +278,12 @@ func (p *taskProcessorImpl) cleanupAckedReplicationTasks() error {
 		persistence.HistoryTaskCategoryReplication,
 		p.currentCluster,
 	).GetTaskID()
-	lag := time.Duration(maxReadLevel - minAckLevel)
+	lagCount := int(maxReadLevel - minAckLevel)
 	scope := p.metricsClient.Scope(metrics.ReplicationTaskFetcherScope,
 		metrics.TargetClusterTag(p.currentCluster),
 	)
-	scope.RecordTimer(metrics.ReplicationTasksLag, lag)
-	scope.ExponentialHistogram(metrics.ExponentialReplicationTasksLag, lag)
+	scope.RecordTimer(metrics.ReplicationTasksLag, time.Duration(lagCount))
+	scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksLag, lagCount)
 	for {
 		pageSize := p.config.ReplicatorTaskDeleteBatchSize()
 		resp, err := p.shard.GetExecutionManager().RangeCompleteHistoryTask(
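The new `intExponentialBuckets: Mid1To16k` definition records the lag count into exponentially sized integer buckets. The sketch below illustrates the general idea with power-of-two buckets from 1 to 16384 (an assumption suggested by the name `Mid1To16k`; the actual Cadence bucket layout may differ, and `bucketFor` is a hypothetical helper, not a Cadence API).

```go
package main

import (
	"fmt"
	"math/bits"
)

// bucketFor returns the upper bound of the power-of-two bucket a lag
// count falls into, assuming buckets 1, 2, 4, ..., 16384 with a final
// overflow bucket at 16384. Illustrative only.
func bucketFor(n int) int {
	if n <= 1 {
		return 1
	}
	b := 1 << bits.Len(uint(n-1)) // smallest power of two >= n
	if b > 16384 {
		return 16384 // everything larger lands in the overflow bucket
	}
	return b
}

func main() {
	for _, lag := range []int{0, 3, 500, 20000} {
		fmt.Printf("lag=%d -> bucket<=%d\n", lag, bucketFor(lag))
	}
}
```

Exponential buckets like these give fine resolution for small backlogs while still capping cardinality for large ones, which is the usual motivation for this bucket shape in queue-depth metrics.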
