Skip to content

Commit e7dd6b3

Browse files
committed
replication: add histograms for task-ack timers
Signed-off-by: Diana Zawadzki <dzawa@live.de>
1 parent 808d658 commit e7dd6b3

File tree

2 files changed

+35
-3
lines changed

2 files changed

+35
-3
lines changed

common/metrics/defs.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2726,11 +2726,16 @@ const (
27262726
ReplicationTasksApplied
27272727
ReplicationTasksFailed
27282728
ReplicationTasksLag
2729+
ExponentialReplicationTasksLag
27292730
ReplicationTasksLagRaw
2731+
ExponentialReplicationTasksLagRaw
27302732
ReplicationTasksDelay
27312733
ReplicationTasksFetched
2734+
ExponentialReplicationTasksFetched
27322735
ReplicationTasksReturned
2736+
ExponentialReplicationTasksReturned
27332737
ReplicationTasksReturnedDiff
2738+
ExponentialReplicationTasksReturnedDiff
27342739
ReplicationTasksAppliedLatency
27352740
ReplicationTasksBatchSize
27362741
ReplicationDynamicTaskBatchSizerDecision
@@ -3541,11 +3546,16 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{
35413546
ReplicationTasksApplied: {metricName: "replication_tasks_applied", metricType: Counter},
35423547
ReplicationTasksFailed: {metricName: "replication_tasks_failed", metricType: Counter},
35433548
ReplicationTasksLag: {metricName: "replication_tasks_lag", metricType: Timer},
3549+
ExponentialReplicationTasksLag: {metricName: "replication_tasks_lag_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35443550
ReplicationTasksLagRaw: {metricName: "replication_tasks_lag_raw", metricType: Timer},
3551+
ExponentialReplicationTasksLagRaw: {metricName: "replication_tasks_lag_raw_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35453552
ReplicationTasksDelay: {metricName: "replication_tasks_delay", metricType: Histogram, buckets: ReplicationTaskDelayBucket},
35463553
ReplicationTasksFetched: {metricName: "replication_tasks_fetched", metricType: Timer},
3554+
ExponentialReplicationTasksFetched: {metricName: "replication_tasks_fetched_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35473555
ReplicationTasksReturned: {metricName: "replication_tasks_returned", metricType: Timer},
3556+
ExponentialReplicationTasksReturned: {metricName: "replication_tasks_returned_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35483557
ReplicationTasksReturnedDiff: {metricName: "replication_tasks_returned_diff", metricType: Timer},
3558+
ExponentialReplicationTasksReturnedDiff: {metricName: "replication_tasks_returned_diff_counts", metricType: Histogram, intExponentialBuckets: Mid1To16k},
35493559
ReplicationTasksAppliedLatency: {metricName: "replication_tasks_applied_latency", metricType: Timer},
35503560
ReplicationTasksBatchSize: {metricName: "replication_tasks_batch_size", metricType: Gauge},
35513561
ReplicationDynamicTaskBatchSizerDecision: {metricName: "replication_dynamic_task_batch_sizer_decision", metricType: Counter},

service/history/replication/task_ack_manager.go

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ func (t *TaskAckManager) getTasks(ctx context.Context, pollingCluster string, la
147147
return nil, err
148148
}
149149
t.scope.RecordTimer(metrics.ReplicationTasksFetched, time.Duration(len(taskInfos)))
150+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksFetched, len(taskInfos))
150151

151152
// Happy path assumption - we will push all tasks to replication tasks.
152153
msgs := &types.ReplicationMessages{
@@ -167,7 +168,10 @@ func (t *TaskAckManager) getTasks(ctx context.Context, pollingCluster string, la
167168
oldestUnprocessedTaskTimestamp = t.timeSource.Now().UnixNano()
168169
}
169170

170-
t.scope.RecordTimer(metrics.ReplicationTasksLagRaw, time.Duration(t.ackLevels.UpdateIfNeededAndGetQueueMaxReadLevel(persistence.HistoryTaskCategoryReplication, pollingCluster).GetTaskID()-oldestUnprocessedTaskID))
171+
maxReadLevel := t.ackLevels.UpdateIfNeededAndGetQueueMaxReadLevel(persistence.HistoryTaskCategoryReplication, pollingCluster).GetTaskID()
172+
lagRaw := maxReadLevel - oldestUnprocessedTaskID
173+
t.scope.RecordTimer(metrics.ReplicationTasksLagRaw, time.Duration(lagRaw))
174+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksLagRaw, clampInt64ToInt(lagRaw))
171175
t.scope.RecordHistogramDuration(metrics.ReplicationTasksDelay, time.Duration(oldestUnprocessedTaskTimestamp-t.timeSource.Now().UnixNano()))
172176

173177
// hydrate the tasks
@@ -199,9 +203,16 @@ func (t *TaskAckManager) getTasks(ctx context.Context, pollingCluster string, la
199203
return nil, err
200204
}
201205

202-
t.scope.RecordTimer(metrics.ReplicationTasksLag, time.Duration(t.ackLevels.UpdateIfNeededAndGetQueueMaxReadLevel(persistence.HistoryTaskCategoryReplication, pollingCluster).GetTaskID()-msgs.LastRetrievedMessageID))
206+
lag := maxReadLevel - msgs.LastRetrievedMessageID
207+
t.scope.RecordTimer(metrics.ReplicationTasksLag, time.Duration(lag))
208+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksLag, clampInt64ToInt(lag))
209+
203210
t.scope.RecordTimer(metrics.ReplicationTasksReturned, time.Duration(len(msgs.ReplicationTasks)))
204-
t.scope.RecordTimer(metrics.ReplicationTasksReturnedDiff, time.Duration(len(taskInfos)-len(msgs.ReplicationTasks)))
211+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksReturned, len(msgs.ReplicationTasks))
212+
213+
returnedDiff := len(taskInfos) - len(msgs.ReplicationTasks)
214+
t.scope.RecordTimer(metrics.ReplicationTasksReturnedDiff, time.Duration(returnedDiff))
215+
t.scope.IntExponentialHistogram(metrics.ExponentialReplicationTasksReturnedDiff, returnedDiff)
205216

206217
t.ackLevel(pollingCluster, lastReadTaskID)
207218

@@ -221,6 +232,17 @@ func (t *TaskAckManager) getTasks(ctx context.Context, pollingCluster string, la
221232
}, nil
222233
}
223234

235+
// clampInt64ToInt converts v to an int using saturation instead of wrap-around:
// zero and negative inputs yield 0, and inputs larger than the platform's
// maximum int yield that maximum. Every other value is returned unchanged.
func clampInt64ToInt(v int64) int {
	// Largest value an int can hold on this platform; the width of int
	// (and therefore this constant) follows the width of uint.
	const maxInt = int(^uint(0) >> 1)

	switch {
	case v <= 0:
		return 0
	case v > int64(maxInt):
		// Only reachable where int is narrower than int64.
		return maxInt
	default:
		return int(v)
	}
}
245+
224246
// ackLevel updates the ack level for the given cluster
225247
func (t *TaskAckManager) ackLevel(pollingCluster string, lastReadTaskID int64) {
226248
if err := t.ackLevels.UpdateQueueClusterAckLevel(persistence.HistoryTaskCategoryReplication, pollingCluster, persistence.NewImmediateTaskKey(lastReadTaskID)); err != nil {

0 commit comments

Comments
 (0)