From 8e51ad9d7553b0cbdd76deda8e83377cb438c49a Mon Sep 17 00:00:00 2001 From: Steven L Date: Mon, 6 Oct 2025 18:57:10 -0500 Subject: [PATCH 1/3] chore(histograms): dual-emitting persistence latency timers as histograms Signed-off-by: Steven L --- common/metrics/defs.go | 11 ++++++++- common/metrics/defs_test.go | 10 ++++----- common/persistence/wrappers/metered/base.go | 25 ++++++++++++++------- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/common/metrics/defs.go b/common/metrics/defs.go index d3585177e81..83d5ccb8190 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -2197,6 +2197,10 @@ const ( PersistenceFailures PersistenceLatency PersistenceLatencyHistogram + // Deprecated: replaced with PersistenceLatencyHistogram, but kept for a while for backwards compatibility reasons. + // The buckets are very similar to PersistenceLatencyHistogram, but they cannot be subset like other exponential histograms. + // This will be removed in a later server version. + PersistenceLatencyManualHistogram PersistenceErrShardExistsCounter PersistenceErrShardOwnershipLostCounter PersistenceErrConditionFailedCounter @@ -2218,7 +2222,9 @@ const ( PersistenceRequestsPerShard PersistenceFailuresPerDomain PersistenceLatencyPerDomain + PersistenceLatencyPerDomainHistogram PersistenceLatencyPerShard + PersistenceLatencyPerShardHistogram PersistenceErrShardExistsCounterPerDomain PersistenceErrShardOwnershipLostCounterPerDomain PersistenceErrConditionFailedCounterPerDomain @@ -2975,7 +2981,8 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ PersistenceRequests: {metricName: "persistence_requests", metricType: Counter}, PersistenceFailures: {metricName: "persistence_errors", metricType: Counter}, PersistenceLatency: {metricName: "persistence_latency", metricType: Timer}, - PersistenceLatencyHistogram: {metricName: "persistence_latency_histogram", metricType: Histogram, buckets: PersistenceLatencyBuckets}, + PersistenceLatencyHistogram: {metricName: "persistence_latency_ns", metricType: Histogram, exponentialBuckets: Default1ms100s}, + PersistenceLatencyManualHistogram: {metricName: "persistence_latency_histogram", metricType: Histogram, buckets: PersistenceLatencyBuckets}, PersistenceErrShardExistsCounter: {metricName: "persistence_errors_shard_exists", metricType: Counter}, PersistenceErrShardOwnershipLostCounter: {metricName: "persistence_errors_shard_ownership_lost", metricType: Counter}, PersistenceErrConditionFailedCounter: {metricName: "persistence_errors_condition_failed", metricType: Counter}, @@ -2996,7 +3003,9 @@ var MetricDefs = map[ServiceIdx]map[MetricIdx]metricDefinition{ PersistenceRequestsPerShard: {metricName: "persistence_requests_per_shard", metricType: Counter}, PersistenceFailuresPerDomain: {metricName: "persistence_errors_per_domain", metricRollupName: "persistence_errors", metricType: Counter}, PersistenceLatencyPerDomain: {metricName: "persistence_latency_per_domain", metricRollupName: "persistence_latency", metricType: Timer}, + PersistenceLatencyPerDomainHistogram: {metricName: "persistence_latency_per_domain_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, PersistenceLatencyPerShard: {metricName: "persistence_latency_per_shard", metricType: Timer}, + PersistenceLatencyPerShardHistogram: {metricName: "persistence_latency_per_shard_ns", metricType: Histogram, exponentialBuckets: Low1ms100s}, PersistenceErrShardExistsCounterPerDomain: {metricName: "persistence_errors_shard_exists_per_domain", metricRollupName: "persistence_errors_shard_exists", metricType: Counter}, PersistenceErrShardOwnershipLostCounterPerDomain: {metricName: "persistence_errors_shard_ownership_lost_per_domain", metricRollupName: "persistence_errors_shard_ownership_lost", metricType: Counter}, PersistenceErrConditionFailedCounterPerDomain: {metricName: "persistence_errors_condition_failed_per_domain", metricRollupName: "persistence_errors_condition_failed", metricType: Counter}, diff --git a/common/metrics/defs_test.go b/common/metrics/defs_test.go index 4081d777e52..671c3b3b29c 100644 --- a/common/metrics/defs_test.go +++ b/common/metrics/defs_test.go @@ -96,27 +96,27 @@ func TestScopeDefsMapped(t *testing.T) { func TestMetricDefsMapped(t *testing.T) { for i := CadenceRequests; i < NumCommonMetrics; i++ { key, ok := MetricDefs[Common][i] - require.True(t, ok) + require.True(t, ok, "common enum %v is missing a metric definition", i) require.NotEmpty(t, key) } for i := TaskRequests; i < NumHistoryMetrics; i++ { key, ok := MetricDefs[History][i] - require.True(t, ok) + require.True(t, ok, "history enum %v is missing a metric definition", i) require.NotEmpty(t, key) } for i := PollSuccessPerTaskListCounter; i < NumMatchingMetrics; i++ { key, ok := MetricDefs[Matching][i] - require.True(t, ok) + require.True(t, ok, "matching enum %v is missing a metric definition", i) require.NotEmpty(t, key) } for i := ReplicatorMessages; i < NumWorkerMetrics; i++ { key, ok := MetricDefs[Worker][i] - require.True(t, ok) + require.True(t, ok, "worker enum %v is missing a metric definition", i) require.NotEmpty(t, key) } for i := ShardDistributorRequests; i < NumShardDistributorMetrics; i++ { key, ok := MetricDefs[ShardDistributor][i] - require.True(t, ok) + require.True(t, ok, "shard distributor enum %v is missing a metric definition", i) require.NotEmpty(t, key) } } diff --git a/common/persistence/wrappers/metered/base.go b/common/persistence/wrappers/metered/base.go index 5d24daa4b89..13cc51b1d40 100644 --- a/common/persistence/wrappers/metered/base.go +++ b/common/persistence/wrappers/metered/base.go @@ -140,7 +140,7 @@ func (p *base) call(scope metrics.ScopeIdx, op func() error, tags ...metrics.Tag } if p.enableLatencyHistogramMetrics { - metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) + metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyManualHistogram, duration) } logger := p.logger.Helper() @@ -163,7 +163,7 @@ func (p *base) callWithoutDomainTag(scope metrics.ScopeIdx, op func() error, tag metricsScope.RecordTimer(metrics.PersistenceLatency, duration) if p.enableLatencyHistogramMetrics { - metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) + metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyManualHistogram, duration) } if err != nil { p.updateErrorMetric(scope, err, metricsScope, p.logger.Helper()) @@ -172,11 +172,14 @@ func (p *base) callWithoutDomainTag(scope metrics.ScopeIdx, op func() error, tag } func (p *base) callWithDomainAndShardScope(scope metrics.ScopeIdx, op func() error, domainTag metrics.Tag, shardIDTag metrics.Tag, additionalTags ...metrics.Tag) error { + // caution: additionalTags is generally unsafe in Prometheus, as it varies the tags applied to a metric. + // this method should be changed to use specific tags, and always emit the same list, using empty values where necessary. + overallScope := p.metricClient.Scope(scope) domainMetricsScope := p.metricClient.Scope(scope, append([]metrics.Tag{domainTag}, additionalTags...)...) shardOperationsMetricsScope := p.metricClient.Scope(scope, append([]metrics.Tag{shardIDTag}, additionalTags...)...) - shardOverallMetricsScope := p.metricClient.Scope(metrics.PersistenceShardRequestCountScope, shardIDTag) + shardOverallMetricsScope := p.metricClient.Scope(metrics.PersistenceShardRequestCountScope, shardIDTag) // operation:shardidpersistencerequest - domainMetricsScope.IncCounter(metrics.PersistenceRequestsPerDomain) + domainMetricsScope.IncCounter(metrics.PersistenceRequestsPerDomain) // also emits PersistenceRequests shardOperationsMetricsScope.IncCounter(metrics.PersistenceRequestsPerShard) shardOverallMetricsScope.IncCounter(metrics.PersistenceRequestsPerShard) @@ -184,12 +187,18 @@ func (p *base) callWithDomainAndShardScope(scope metrics.ScopeIdx, op func() err err := op() duration := time.Since(before) - domainMetricsScope.RecordTimer(metrics.PersistenceLatencyPerDomain, duration) - shardOperationsMetricsScope.RecordTimer(metrics.PersistenceLatencyPerShard, duration) - shardOverallMetricsScope.RecordTimer(metrics.PersistenceLatencyPerShard, duration) + domainMetricsScope.RecordTimer(metrics.PersistenceLatencyPerDomain, duration) // also emits PersistenceLatency + domainMetricsScope.RecordHistogramDuration(metrics.PersistenceLatencyPerDomainHistogram, duration) + overallScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) + + shardOperationsMetricsScope.RecordTimer(metrics.PersistenceLatencyPerShard, duration) // operation:{scope argument} + shardOperationsMetricsScope.RecordHistogramDuration(metrics.PersistenceLatencyPerShardHistogram, duration) + + shardOverallMetricsScope.RecordTimer(metrics.PersistenceLatencyPerShard, duration) // operation:shardidpersistencerequest + shardOverallMetricsScope.RecordHistogramDuration(metrics.PersistenceLatencyPerShardHistogram, duration) if p.enableLatencyHistogramMetrics { - domainMetricsScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) + domainMetricsScope.RecordHistogramDuration(metrics.PersistenceLatencyManualHistogram, duration) // manual buckets, being deprecated } if err != nil { p.updateErrorMetricPerDomain(scope, err, domainMetricsScope, p.logger.Helper()) From 9daf82d8fbd9857ff88a8878a65c1b2efb8c6a84 Mon Sep 17 00:00:00 2001 From: Steven L Date: Mon, 6 Oct 2025 19:11:32 -0500 Subject: [PATCH 2/3] include other timers in here too Signed-off-by: Steven L --- common/persistence/wrappers/metered/base.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/persistence/wrappers/metered/base.go b/common/persistence/wrappers/metered/base.go index 13cc51b1d40..7f6689577c8 100644 --- a/common/persistence/wrappers/metered/base.go +++ b/common/persistence/wrappers/metered/base.go @@ -135,8 +135,10 @@ func (p *base) call(scope metrics.ScopeIdx, op func() error, tags ...metrics.Tag duration := time.Since(before) if len(tags) > 0 { metricsScope.RecordTimer(metrics.PersistenceLatencyPerDomain, duration) + metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyPerDomainHistogram, duration) } else { metricsScope.RecordTimer(metrics.PersistenceLatency, duration) + metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) } if p.enableLatencyHistogramMetrics { @@ -161,6 +163,7 @@ func (p *base) callWithoutDomainTag(scope metrics.ScopeIdx, op func() error, tag err := op() duration := time.Since(before) metricsScope.RecordTimer(metrics.PersistenceLatency, duration) + metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyHistogram, duration) if p.enableLatencyHistogramMetrics { metricsScope.RecordHistogramDuration(metrics.PersistenceLatencyManualHistogram, duration) From 2e8f253da66e252e0e6b11bbff6d1e56406924c8 Mon Sep 17 00:00:00 2001 From: Steven L Date: Tue, 7 Oct 2025 19:58:24 -0500 Subject: [PATCH 3/3] add support for migration-config for these metrics Signed-off-by: Steven L --- common/metrics/config.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/common/metrics/config.go b/common/metrics/config.go index 3481f1f9d5b..a62c6f87287 100644 --- a/common/metrics/config.go +++ b/common/metrics/config.go @@ -46,6 +46,16 @@ var HistogramMigrationMetrics = map[string]struct{}{ "replication_task_latency": {}, "replication_task_latency_ns": {}, + + "persistence_latency": {}, + "persistence_latency_ns": {}, + "persistence_latency_histogram": {}, // deprecating: this is a set of manual buckets being replaced by persistence_latency_ns + + "persistence_latency_per_domain": {}, + "persistence_latency_per_domain_ns": {}, + + "persistence_latency_per_shard": {}, + "persistence_latency_per_shard_ns": {}, } func (h HistogramMigration) EmitTimer(name string) bool {