diff --git a/common/membership/resolver.go b/common/membership/resolver.go index 28b5162db21..a06309dd899 100644 --- a/common/membership/resolver.go +++ b/common/membership/resolver.go @@ -217,7 +217,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo return m, nil } } - rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1) + rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(metrics.RingResolverError) return HostInfo{}, fmt.Errorf("host not found in service %s: %s", service, address) } diff --git a/common/metrics/defs.go b/common/metrics/defs.go index 41a6bd71104..d0ebacb1c15 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -53,6 +53,10 @@ type ( ServiceIdx int ) +func (s scopeDefinition) GetOperationString() string { + return s.operation +} + // MetricTypes which are supported const ( Counter MetricType = iota @@ -1070,7 +1074,7 @@ const ( // -- Operation scopes for History service -- const ( // HistoryStartWorkflowExecutionScope tracks StartWorkflowExecution API calls received by service - HistoryStartWorkflowExecutionScope = iota + NumCommonScopes + HistoryStartWorkflowExecutionScope = iota + NumFrontendScopes // HistoryRecordActivityTaskHeartbeatScope tracks RecordActivityTaskHeartbeat API calls received by service HistoryRecordActivityTaskHeartbeatScope // HistoryRespondDecisionTaskCompletedScope tracks RespondDecisionTaskCompleted API calls received by service @@ -1358,7 +1362,7 @@ const ( // -- Operation scopes for Matching service -- const ( // PollForDecisionTaskScope tracks PollForDecisionTask API calls received by service - MatchingPollForDecisionTaskScope = iota + NumCommonScopes + MatchingPollForDecisionTaskScope = iota + NumHistoryScopes // PollForActivityTaskScope tracks PollForActivityTask API calls received by service MatchingPollForActivityTaskScope // MatchingAddActivityTaskScope tracks AddActivityTask API calls received by service @@ -1394,7 +1398,7 @@ const ( // -- Operation scopes for Worker service -- const ( // ReplicationScope is the scope used by all metric emitted by replicator - ReplicatorScope = iota + NumCommonScopes + ReplicatorScope = iota + NumMatchingScopes // DomainReplicationTaskScope is the scope used by domain task replication processing DomainReplicationTaskScope // ESProcessorScope is scope used by all metric emitted by esProcessor @@ -1442,7 +1446,7 @@ const ( // -- Operation scopes for ShardDistributor service -- const ( // ShardDistributorGetShardOwnerScope tracks GetShardOwner API calls received by service - ShardDistributorGetShardOwnerScope = iota + NumCommonScopes + ShardDistributorGetShardOwnerScope = iota + NumWorkerScopes ShardDistributorHeartbeatScope ShardDistributorAssignLoopScope @@ -2404,6 +2408,8 @@ const ( // cluster forwarding policy metrics ClusterForwardingPolicyRequests + RingResolverError + NumCommonMetrics // Needs to be last on this list for iota numbering ) @@ -2708,12 +2714,13 @@ const ( VirtualQueueCountGauge VirtualQueuePausedGauge VirtualQueueRunningGauge + NumHistoryMetrics ) // Matching metrics enum const ( - PollSuccessPerTaskListCounter = iota + NumCommonMetrics + PollSuccessPerTaskListCounter = iota + NumHistoryMetrics PollTimeoutPerTaskListCounter PollSuccessWithSyncPerTaskListCounter LeaseRequestPerTaskListCounter @@ -2791,12 +2798,13 @@ const ( IsolationGroupUpscale IsolationGroupDownscale PartitionDrained + NumMatchingMetrics ) // Worker metrics enum const ( - ReplicatorMessages = iota + NumCommonMetrics + ReplicatorMessages = iota + NumMatchingMetrics ReplicatorFailures ReplicatorMessagesDropped ReplicatorLatency @@ -2880,12 +2888,13 @@ const ( DiagnosticsWorkflowStartedCount DiagnosticsWorkflowSuccess DiagnosticsWorkflowExecutionLatency + NumWorkerMetrics ) // ShardDistributor metrics enum const ( - ShardDistributorRequests = iota + NumCommonMetrics + ShardDistributorRequests = iota + NumWorkerMetrics ShardDistributorFailures ShardDistributorLatency ShardDistributorErrContextTimeoutCounter @@ -3188,6 +3197,8 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{ ActiveClusterManagerLookupLatency: {metricName: "active_cluster_manager_lookup_latency", metricType: Histogram, buckets: ExponentialDurationBuckets}, ClusterForwardingPolicyRequests: {metricName: "cluster_forwarding_policy_requests", metricType: Counter}, + + RingResolverError: {metricName: "ring_resolver_error", metricType: Counter}, }, History: { TaskRequests: {metricName: "task_requests", metricType: Counter}, diff --git a/common/metrics/defs_test.go b/common/metrics/defs_test.go index 122aa359645..5b6203fc894 100644 --- a/common/metrics/defs_test.go +++ b/common/metrics/defs_test.go @@ -129,6 +129,64 @@ func TestMetricDefs(t *testing.T) { } } +// "index -> operation" must be unique so they can be looked up without needing to know the service ID. +// +// Duplicate indexes with the same operation name are technically fine, but there doesn't seem to be any benefit in allowing it, +// and it trivially ensures that all values have only one operation name. +func TestOperationIndexesAreUnique(t *testing.T) { + seen := make(map[int]bool) + for serviceIdx, serviceOps := range ScopeDefs { + for idx := range serviceOps { + if seen[idx] { + t.Error("duplicate operation index:", idx, "with name:", serviceOps[idx].operation, "in service:", serviceIdx) + } + seen[idx] = true + + // to serve as documentation: operations are NOT unique due to dups across services. + // this is probably fine, they're just used as metric tag values. + // but not being able to prevent dups means it's possible we have unintentional name collisions. + // + // name := serviceOps[idx].operation + // if seenNames[name] { + // t.Error("duplicate operation string:", name, "at different indexes") + // } + // seenNames[name] = true + } + } +} + +func TestMetricsAreUnique(t *testing.T) { + // Duplicate indexes is arguably fine, but there doesn't seem to be any benefit in allowing it. + t.Run("indexes", func(t *testing.T) { + seen := make(map[int]bool) + for _, serviceMetrics := range MetricDefs { + for idx := range serviceMetrics { + if seen[idx] { + t.Error("duplicate metric index:", idx, "with name:", serviceMetrics[idx].metricName) + } + seen[idx] = true + } + } + }) + // Duplicate names carry a high risk of causing different-tag collisions in Prometheus, and should not be allowed. + t.Run("names", func(t *testing.T) { + seen := make(map[string]bool) + for _, serviceMetrics := range MetricDefs { + for _, met := range serviceMetrics { + name := met.metricName.String() + if seen[name] { + switch name { + case "cache_full", "cache_miss", "cache_hit", "cadence_requests_per_tl", "cross_cluster_fetch_errors": + continue // known dup. worth changing as some cause problems. + } + t.Errorf("duplicate metric name %q", name) + } + seen[name] = true + } + } + }) +} + func TestExponentialDurationBuckets(t *testing.T) { factor := math.Pow(2, 0.25) assert.Equal(t, 80, len(ExponentialDurationBuckets)) diff --git a/common/metrics/nop.go b/common/metrics/nop.go index 5ccc9d15de5..e37c6b50e3f 100644 --- a/common/metrics/nop.go +++ b/common/metrics/nop.go @@ -27,9 +27,9 @@ import ( ) var ( - NoopClient = &noopClientImpl{} - NoopScope = &noopScopeImpl{} - NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{}) + NoopClient Client = &noopClientImpl{} + NoopScope Scope = &noopScopeImpl{} + NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{}) ) type nopStopwatchRecorder struct{}