Skip to content

Commit cfac903

Browse files
committed
chore: add histograms for replication tasks lag, returned, and returned diff
Dual-emit exponential histograms alongside existing timers for ReplicationTasksLag, ReplicationTasksReturned, and ReplicationTasksReturnedDiff to support histogram migration. ReplicationTasksLag uses the already-defined ExponentialReplicationTasksLag (duration-based Mid1ms24h). Returned and ReturnedDiff use new integer histograms (Mid1To16k) with _counts suffix. All six metric names added to HistogramMigrationMetrics allowlist. Signed-off-by: Diana Zawadzki <dzawa@live.de>
1 parent 4b59edc commit cfac903

File tree

3 files changed

+25
-3
lines changed

3 files changed

+25
-3
lines changed

common/metrics/config.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,15 @@ var HistogramMigrationMetrics = map[string]struct{}{
5151

5252
"replication_task_latency": {},
5353
"replication_task_latency_ns": {},
54+
55+
// Replication tasks lag/returned/diff (replication task-ack path).
56+
// Dual-emitted as timer + histogram.
57+
"replication_tasks_lag": {},
58+
"replication_tasks_lag_ns": {},
59+
"replication_tasks_returned": {},
60+
"replication_tasks_returned_counts": {},
61+
"replication_tasks_returned_diff": {},
62+
"replication_tasks_returned_diff_counts": {},
5463
}
5564

5665
func (h HistogramMigration) EmitTimer(name string) bool {

common/metrics/defs.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2731,7 +2731,9 @@ const (
27312731
ReplicationTasksDelay
27322732
ReplicationTasksFetched
27332733
ReplicationTasksReturned
2734+
ExponentialReplicationTasksReturned
27342735
ReplicationTasksReturnedDiff
2736+
ExponentialReplicationTasksReturnedDiff
27352737
ReplicationTasksAppliedLatency
27362738
ExponentialReplicationTasksAppliedLatency
27372739
ReplicationTasksBatchSize
@@ -3548,7 +3550,9 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
35483550
ReplicationTasksDelay: {metricName: "replication_tasks_delay", metricType: Histogram, buckets: ReplicationTaskDelayBucket},
35493551
ReplicationTasksFetched: {metricName: "replication_tasks_fetched", metricType: Timer},
35503552
ReplicationTasksReturned: {metricName: "replication_tasks_returned", metricType: Timer},
3553+
ExponentialReplicationTasksReturned: {metricName: "replication_tasks_returned_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35513554
ReplicationTasksReturnedDiff: {metricName: "replication_tasks_returned_diff", metricType: Timer},
3555+
ExponentialReplicationTasksReturnedDiff: {metricName: "replication_tasks_returned_diff_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35523556
ReplicationTasksAppliedLatency: {metricName: "replication_tasks_applied_latency", metricType: Timer},
35533557
ExponentialReplicationTasksAppliedLatency: {metricName: "replication_tasks_applied_latency_ns", metricType: Histogram, exponentialBuckets: Low1ms100s},
35543558
ReplicationTasksBatchSize: {metricName: "replication_tasks_batch_size", metricType: Gauge},

service/history/replication/task_ack_manager.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,9 +199,18 @@ func (t *TaskAckManager) getTasks(ctx context.Context, pollingCluster string, la
199199
return nil, err
200200
}
201201

202-
t.scope.RecordTimer(metrics.ReplicationTasksLag, time.Duration(t.ackLevels.UpdateIfNeededAndGetQueueMaxReadLevel(persistence.HistoryTaskCategoryReplication, pollingCluster).GetTaskID()-msgs.LastRetrievedMessageID))
203-
t.scope.RecordTimer(metrics.ReplicationTasksReturned, time.Duration(len(msgs.ReplicationTasks)))
204-
t.scope.RecordTimer(metrics.ReplicationTasksReturnedDiff, time.Duration(len(taskInfos)-len(msgs.ReplicationTasks)))
202+
// Keep timers (backwards compatible), dual-emit exponential histograms for migration.
203+
replicationLag := time.Duration(t.ackLevels.UpdateIfNeededAndGetQueueMaxReadLevel(persistence.HistoryTaskCategoryReplication, pollingCluster).GetTaskID() - msgs.LastRetrievedMessageID)
204+
t.scope.RecordTimer(metrics.ReplicationTasksLag, replicationLag)
205+
t.scope.ExponentialHistogram(metrics.ExponentialReplicationTasksLag, replicationLag)
206+
207+
tasksReturned := len(msgs.ReplicationTasks)
208+
t.scope.RecordTimer(metrics.ReplicationTasksReturned, time.Duration(tasksReturned))
209+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksReturned, tasksReturned)
210+
211+
tasksReturnedDiff := len(taskInfos) - len(msgs.ReplicationTasks)
212+
t.scope.RecordTimer(metrics.ReplicationTasksReturnedDiff, time.Duration(tasksReturnedDiff))
213+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksReturnedDiff, tasksReturnedDiff)
205214

206215
t.ackLevel(pollingCluster, lastReadTaskID)
207216

0 commit comments

Comments
 (0)