Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/membership/resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo
return m, nil
}
}
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1)
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(metrics.RingResolverError)
return HostInfo{}, fmt.Errorf("host not found in service %s: %s", service, address)
}

Expand Down
25 changes: 18 additions & 7 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,10 @@ type (
ServiceIdx int
)

func (s scopeDefinition) GetOperationString() string {
return s.operation
}

Comment on lines +56 to +59
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could remove this - it's mostly just prep-work since almost anything leaving the scope-index ints will need it.

// MetricTypes which are supported
const (
Counter MetricType = iota
Expand Down Expand Up @@ -1070,7 +1074,7 @@ const (
// -- Operation scopes for History service --
const (
// HistoryStartWorkflowExecutionScope tracks StartWorkflowExecution API calls received by service
HistoryStartWorkflowExecutionScope = iota + NumCommonScopes
HistoryStartWorkflowExecutionScope = iota + NumFrontendScopes
// HistoryRecordActivityTaskHeartbeatScope tracks RecordActivityTaskHeartbeat API calls received by service
HistoryRecordActivityTaskHeartbeatScope
// HistoryRespondDecisionTaskCompletedScope tracks RespondDecisionTaskCompleted API calls received by service
Expand Down Expand Up @@ -1358,7 +1362,7 @@ const (
// -- Operation scopes for Matching service --
const (
// PollForDecisionTaskScope tracks PollForDecisionTask API calls received by service
MatchingPollForDecisionTaskScope = iota + NumCommonScopes
MatchingPollForDecisionTaskScope = iota + NumHistoryScopes
// PollForActivityTaskScope tracks PollForActivityTask API calls received by service
MatchingPollForActivityTaskScope
// MatchingAddActivityTaskScope tracks AddActivityTask API calls received by service
Expand Down Expand Up @@ -1394,7 +1398,7 @@ const (
// -- Operation scopes for Worker service --
const (
// ReplicationScope is the scope used by all metric emitted by replicator
ReplicatorScope = iota + NumCommonScopes
ReplicatorScope = iota + NumMatchingScopes
// DomainReplicationTaskScope is the scope used by domain task replication processing
DomainReplicationTaskScope
// ESProcessorScope is scope used by all metric emitted by esProcessor
Expand Down Expand Up @@ -1442,7 +1446,7 @@ const (
// -- Operation scopes for ShardDistributor service --
const (
// ShardDistributorGetShardOwnerScope tracks GetShardOwner API calls received by service
ShardDistributorGetShardOwnerScope = iota + NumCommonScopes
ShardDistributorGetShardOwnerScope = iota + NumWorkerScopes
ShardDistributorHeartbeatScope
ShardDistributorAssignLoopScope

Expand Down Expand Up @@ -2404,6 +2408,8 @@ const (
// cluster forwarding policy metrics
ClusterForwardingPolicyRequests

RingResolverError
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the deal with this one? did it get missed somewhere?


NumCommonMetrics // Needs to be last on this list for iota numbering
)

Expand Down Expand Up @@ -2708,12 +2714,13 @@ const (
VirtualQueueCountGauge
VirtualQueuePausedGauge
VirtualQueueRunningGauge

NumHistoryMetrics
)

// Matching metrics enum
const (
PollSuccessPerTaskListCounter = iota + NumCommonMetrics
PollSuccessPerTaskListCounter = iota + NumHistoryMetrics
PollTimeoutPerTaskListCounter
PollSuccessWithSyncPerTaskListCounter
LeaseRequestPerTaskListCounter
Expand Down Expand Up @@ -2791,12 +2798,13 @@ const (
IsolationGroupUpscale
IsolationGroupDownscale
PartitionDrained

NumMatchingMetrics
)

// Worker metrics enum
const (
ReplicatorMessages = iota + NumCommonMetrics
ReplicatorMessages = iota + NumMatchingMetrics
ReplicatorFailures
ReplicatorMessagesDropped
ReplicatorLatency
Expand Down Expand Up @@ -2880,12 +2888,13 @@ const (
DiagnosticsWorkflowStartedCount
DiagnosticsWorkflowSuccess
DiagnosticsWorkflowExecutionLatency

NumWorkerMetrics
)

// ShardDistributor metrics enum
const (
ShardDistributorRequests = iota + NumCommonMetrics
ShardDistributorRequests = iota + NumWorkerMetrics
ShardDistributorFailures
ShardDistributorLatency
ShardDistributorErrContextTimeoutCounter
Expand Down Expand Up @@ -3188,6 +3197,8 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
ActiveClusterManagerLookupLatency: {metricName: "active_cluster_manager_lookup_latency", metricType: Histogram, buckets: ExponentialDurationBuckets},

ClusterForwardingPolicyRequests: {metricName: "cluster_forwarding_policy_requests", metricType: Counter},

RingResolverError: {metricName: "ring_resolver_error", metricType: Counter},
},
History: {
TaskRequests: {metricName: "task_requests", metricType: Counter},
Expand Down
58 changes: 58 additions & 0 deletions common/metrics/defs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,64 @@ func TestMetricDefs(t *testing.T) {
}
}

// "index -> operation" must be unique so they can be looked up without needing to know the service ID.
//
// Duplicate indexes with the same operation name are technically fine, but there doesn't seem to be any benefit in allowing it,
// and it trivially ensures that all values have only one operation name.
func TestOperationIndexesAreUnique(t *testing.T) {
seen := make(map[int]bool)
for serviceIdx, serviceOps := range ScopeDefs {
for idx := range serviceOps {
if seen[idx] {
t.Error("duplicate operation index:", idx, "with name:", serviceOps[idx].operation, "in service:", serviceIdx)
}
seen[idx] = true

// to serve as documentation: operations are NOT unique due to dups across services.
// this is probably fine, they're just used as metric tag values.
// but not being able to prevent dups means it's possible we have unintentional name collisions.
//
// name := serviceOps[idx].operation
// if seenNames[name] {
// t.Error("duplicate operation string:", name, "at different indexes")
// }
// seenNames[name] = true
}
}
}

func TestMetricsAreUnique(t *testing.T) {
// Duplicate indexes is arguably fine, but there doesn't seem to be any benefit in allowing it.
t.Run("indexes", func(t *testing.T) {
seen := make(map[int]bool)
for _, serviceMetrics := range MetricDefs {
for idx := range serviceMetrics {
if seen[idx] {
t.Error("duplicate metric index:", idx, "with name:", serviceMetrics[idx].metricName)
}
seen[idx] = true
}
}
})
// Duplicate names carry a high risk of causing different-tag collisions in Prometheus, and should not be allowed.
t.Run("names", func(t *testing.T) {
seen := make(map[string]bool)
for _, serviceMetrics := range MetricDefs {
for _, met := range serviceMetrics {
name := met.metricName.String()
if seen[name] {
switch name {
case "cache_full", "cache_miss", "cache_hit", "cadence_requests_per_tl", "cross_cluster_fetch_errors":
continue // known dup. worth changing as some cause problems.
}
t.Errorf("duplicate metric name %q", name)
}
seen[name] = true
}
}
})
}

func TestExponentialDurationBuckets(t *testing.T) {
factor := math.Pow(2, 0.25)
assert.Equal(t, 80, len(ExponentialDurationBuckets))
Expand Down
6 changes: 3 additions & 3 deletions common/metrics/nop.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ import (
)

var (
NoopClient = &noopClientImpl{}
NoopScope = &noopScopeImpl{}
NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{})
NoopClient Client = &noopClientImpl{}
NoopScope Scope = &noopScopeImpl{}
NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{})
)

type nopStopwatchRecorder struct{}
Expand Down
Loading