Skip to content

Commit 93f66d0

Browse files
authored
Make metric indexes unique (#7237)
Part cleanup due to discoveries in e.g. #7210, part prep-work for migrating metrics to almost literally any new setup (to make that easier and safer, by allowing int -> name lookups). I *believe* this is a safe thing to do, as we don't seem to do anything fancy with the metric-def indexes. But it's quite hard to verify because we don't have a unique type (fixed in #7238) and all the code here is rather value-oriented and fallback-y so it's hard to be totally confident that everything is handled.
1 parent b9d23d9 commit 93f66d0

File tree

4 files changed

+80
-11
lines changed

4 files changed

+80
-11
lines changed

common/membership/resolver.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo
217217
return m, nil
218218
}
219219
}
220-
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1)
220+
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(metrics.RingResolverError)
221221
return HostInfo{}, fmt.Errorf("host not found in service %s: %s", service, address)
222222
}
223223

common/metrics/defs.go

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ type (
5353
ServiceIdx int
5454
)
5555

56+
func (s scopeDefinition) GetOperationString() string {
57+
return s.operation
58+
}
59+
5660
// MetricTypes which are supported
5761
const (
5862
Counter MetricType = iota
@@ -1070,7 +1074,7 @@ const (
10701074
// -- Operation scopes for History service --
10711075
const (
10721076
// HistoryStartWorkflowExecutionScope tracks StartWorkflowExecution API calls received by service
1073-
HistoryStartWorkflowExecutionScope = iota + NumCommonScopes
1077+
HistoryStartWorkflowExecutionScope = iota + NumFrontendScopes
10741078
// HistoryRecordActivityTaskHeartbeatScope tracks RecordActivityTaskHeartbeat API calls received by service
10751079
HistoryRecordActivityTaskHeartbeatScope
10761080
// HistoryRespondDecisionTaskCompletedScope tracks RespondDecisionTaskCompleted API calls received by service
@@ -1358,7 +1362,7 @@ const (
13581362
// -- Operation scopes for Matching service --
13591363
const (
13601364
// PollForDecisionTaskScope tracks PollForDecisionTask API calls received by service
1361-
MatchingPollForDecisionTaskScope = iota + NumCommonScopes
1365+
MatchingPollForDecisionTaskScope = iota + NumHistoryScopes
13621366
// PollForActivityTaskScope tracks PollForActivityTask API calls received by service
13631367
MatchingPollForActivityTaskScope
13641368
// MatchingAddActivityTaskScope tracks AddActivityTask API calls received by service
@@ -1394,7 +1398,7 @@ const (
13941398
// -- Operation scopes for Worker service --
13951399
const (
13961400
// ReplicationScope is the scope used by all metric emitted by replicator
1397-
ReplicatorScope = iota + NumCommonScopes
1401+
ReplicatorScope = iota + NumMatchingScopes
13981402
// DomainReplicationTaskScope is the scope used by domain task replication processing
13991403
DomainReplicationTaskScope
14001404
// ESProcessorScope is scope used by all metric emitted by esProcessor
@@ -1442,7 +1446,7 @@ const (
14421446
// -- Operation scopes for ShardDistributor service --
14431447
const (
14441448
// ShardDistributorGetShardOwnerScope tracks GetShardOwner API calls received by service
1445-
ShardDistributorGetShardOwnerScope = iota + NumCommonScopes
1449+
ShardDistributorGetShardOwnerScope = iota + NumWorkerScopes
14461450
ShardDistributorHeartbeatScope
14471451
ShardDistributorAssignLoopScope
14481452

@@ -2404,6 +2408,8 @@ const (
24042408
// cluster forwarding policy metrics
24052409
ClusterForwardingPolicyRequests
24062410

2411+
RingResolverError
2412+
24072413
NumCommonMetrics // Needs to be last on this list for iota numbering
24082414
)
24092415

@@ -2708,12 +2714,13 @@ const (
27082714
VirtualQueueCountGauge
27092715
VirtualQueuePausedGauge
27102716
VirtualQueueRunningGauge
2717+
27112718
NumHistoryMetrics
27122719
)
27132720

27142721
// Matching metrics enum
27152722
const (
2716-
PollSuccessPerTaskListCounter = iota + NumCommonMetrics
2723+
PollSuccessPerTaskListCounter = iota + NumHistoryMetrics
27172724
PollTimeoutPerTaskListCounter
27182725
PollSuccessWithSyncPerTaskListCounter
27192726
LeaseRequestPerTaskListCounter
@@ -2791,12 +2798,13 @@ const (
27912798
IsolationGroupUpscale
27922799
IsolationGroupDownscale
27932800
PartitionDrained
2801+
27942802
NumMatchingMetrics
27952803
)
27962804

27972805
// Worker metrics enum
27982806
const (
2799-
ReplicatorMessages = iota + NumCommonMetrics
2807+
ReplicatorMessages = iota + NumMatchingMetrics
28002808
ReplicatorFailures
28012809
ReplicatorMessagesDropped
28022810
ReplicatorLatency
@@ -2880,12 +2888,13 @@ const (
28802888
DiagnosticsWorkflowStartedCount
28812889
DiagnosticsWorkflowSuccess
28822890
DiagnosticsWorkflowExecutionLatency
2891+
28832892
NumWorkerMetrics
28842893
)
28852894

28862895
// ShardDistributor metrics enum
28872896
const (
2888-
ShardDistributorRequests = iota + NumCommonMetrics
2897+
ShardDistributorRequests = iota + NumWorkerMetrics
28892898
ShardDistributorFailures
28902899
ShardDistributorLatency
28912900
ShardDistributorErrContextTimeoutCounter
@@ -3188,6 +3197,8 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
31883197
ActiveClusterManagerLookupLatency: {metricName: "active_cluster_manager_lookup_latency", metricType: Histogram, buckets: ExponentialDurationBuckets},
31893198

31903199
ClusterForwardingPolicyRequests: {metricName: "cluster_forwarding_policy_requests", metricType: Counter},
3200+
3201+
RingResolverError: {metricName: "ring_resolver_error", metricType: Counter},
31913202
},
31923203
History: {
31933204
TaskRequests: {metricName: "task_requests", metricType: Counter},

common/metrics/defs_test.go

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,64 @@ func TestMetricDefs(t *testing.T) {
129129
}
130130
}
131131

132+
// "index -> operation" must be unique so they can be looked up without needing to know the service ID.
133+
//
134+
// Duplicate indexes with the same operation name are technically fine, but there doesn't seem to be any benefit in allowing it,
135+
// and it trivially ensures that all values have only one operation name.
136+
func TestOperationIndexesAreUnique(t *testing.T) {
137+
seen := make(map[int]bool)
138+
for serviceIdx, serviceOps := range ScopeDefs {
139+
for idx := range serviceOps {
140+
if seen[idx] {
141+
t.Error("duplicate operation index:", idx, "with name:", serviceOps[idx].operation, "in service:", serviceIdx)
142+
}
143+
seen[idx] = true
144+
145+
// to serve as documentation: operations are NOT unique due to dups across services.
146+
// this is probably fine, they're just used as metric tag values.
147+
// but not being able to prevent dups means it's possible we have unintentional name collisions.
148+
//
149+
// name := serviceOps[idx].operation
150+
// if seenNames[name] {
151+
// t.Error("duplicate operation string:", name, "at different indexes")
152+
// }
153+
// seenNames[name] = true
154+
}
155+
}
156+
}
157+
158+
func TestMetricsAreUnique(t *testing.T) {
159+
// Duplicate indexes is arguably fine, but there doesn't seem to be any benefit in allowing it.
160+
t.Run("indexes", func(t *testing.T) {
161+
seen := make(map[int]bool)
162+
for _, serviceMetrics := range MetricDefs {
163+
for idx := range serviceMetrics {
164+
if seen[idx] {
165+
t.Error("duplicate metric index:", idx, "with name:", serviceMetrics[idx].metricName)
166+
}
167+
seen[idx] = true
168+
}
169+
}
170+
})
171+
// Duplicate names carry a high risk of causing different-tag collisions in Prometheus, and should not be allowed.
172+
t.Run("names", func(t *testing.T) {
173+
seen := make(map[string]bool)
174+
for _, serviceMetrics := range MetricDefs {
175+
for _, met := range serviceMetrics {
176+
name := met.metricName.String()
177+
if seen[name] {
178+
switch name {
179+
case "cache_full", "cache_miss", "cache_hit", "cadence_requests_per_tl", "cross_cluster_fetch_errors":
180+
continue // known dup. worth changing as some cause problems.
181+
}
182+
t.Errorf("duplicate metric name %q", name)
183+
}
184+
seen[name] = true
185+
}
186+
}
187+
})
188+
}
189+
132190
func TestExponentialDurationBuckets(t *testing.T) {
133191
factor := math.Pow(2, 0.25)
134192
assert.Equal(t, 80, len(ExponentialDurationBuckets))

common/metrics/nop.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@ import (
2727
)
2828

2929
var (
30-
NoopClient = &noopClientImpl{}
31-
NoopScope = &noopScopeImpl{}
32-
NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{})
30+
NoopClient Client = &noopClientImpl{}
31+
NoopScope Scope = &noopScopeImpl{}
32+
NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{})
3333
)
3434

3535
type nopStopwatchRecorder struct{}

0 commit comments

Comments
 (0)