Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ $(BUILD)/goversion-lint:
$(BUILD)/fmt: $(BUILD)/codegen # formatting must occur only after all other go-file-modifications are done
# $(BUILD)/copyright
# $(BUILD)/copyright: $(BUILD)/codegen # must add copyright to generated code, sometimes needs re-formatting
$(BUILD)/codegen: $(BUILD)/thrift $(BUILD)/protoc
$(BUILD)/codegen: $(BUILD)/thrift $(BUILD)/protoc $(BUILD)/metrics
$(BUILD)/thrift: $(BUILD)/go_mod_check
$(BUILD)/protoc: $(BUILD)/go_mod_check
$(BUILD)/go_mod_check:
Expand Down Expand Up @@ -211,6 +211,9 @@ $(BIN)/protoc-gen-gogofast: go.mod go.work | $(BIN)
$(BIN)/protoc-gen-yarpc-go: go.mod go.work | $(BIN)
$(call go_mod_build_tool,go.uber.org/yarpc/encoding/protobuf/protoc-gen-yarpc-go)

$(BIN)/metricslint: internal/tools/go.mod go.work $(wildcard internal/tools/metricslint/* internal/tools/metricslint/cmd/*) | $(BIN)
$(call go_build_tool,./metricslint/cmd,metricslint)

$(BUILD)/go_mod_check: go.mod internal/tools/go.mod go.work
$Q # generated == used is occasionally important for gomock / mock libs in general. this is not a definite problem if violated though.
$Q ./scripts/check-gomod-version.sh github.com/golang/mock/gomock $(if $(verbose),-v)
Expand Down Expand Up @@ -404,6 +407,11 @@ $(BUILD)/code-lint: $(LINT_SRC) $(BIN)/revive | $(BUILD)
fi
$Q touch $@

$(BUILD)/metrics-lint: $(ALL_SRC) $(BIN)/metricslint | $(BUILD)
$Q echo "linting metrics definitions..."
$Q $(BIN_PATH) $(BIN)/metricslint -skip cadence_requests_per_tl,2 -skip cache_hit,2 -skip cache_full,2 -skip cache_miss,2 -skip cross_cluster_fetch_errors,2 ./...
$Q touch $@

$(BUILD)/goversion-lint: go.work Dockerfile docker/github_actions/Dockerfile${DOCKERFILE_SUFFIX}
$Q echo "checking go version..."
$Q # intentionally using go.work toolchain, as GOTOOLCHAIN is user-overridable
Expand Down Expand Up @@ -458,7 +466,7 @@ endef
# useful to actually re-run to get output again.
# reuse the intermediates for simplicity and consistency.
lint: ## (Re)run the linter
$(call remake,proto-lint gomod-lint code-lint goversion-lint)
$(call remake,proto-lint gomod-lint code-lint goversion-lint metrics-lint)

# intentionally not re-making, it's a bit slow and it's clear when it's unnecessary
fmt: $(BUILD)/fmt ## Run `gofmt` / organize imports / etc
Expand Down Expand Up @@ -547,7 +555,7 @@ tools: $(TOOLS)
go-generate: $(BIN)/mockgen $(BIN)/enumer $(BIN)/mockery $(BIN)/gowrap ## Run `go generate` to regen mocks, enums, etc
$Q echo "running go generate ./..., this takes a minute or more..."
$Q # add our bins to PATH so `go generate` can find them
$Q $(BIN_PATH) go generate $(if $(verbose),-v) ./...
$Q $(BIN_PATH) go generate $(if $(verbose),-v) ./common/metrics
$Q $(MAKE) --no-print-directory fmt
# $Q echo "updating copyright headers"
# $Q $(MAKE) --no-print-directory copyright
Expand Down Expand Up @@ -577,7 +585,7 @@ tidy: ## `go mod tidy` all packages
clean: ## Clean build products and SQLite database
rm -f $(BINS)
rm -Rf $(BUILD)
rm *.db
rm -f *.db
$(if \
$(wildcard $(STABLE_BIN)/*), \
$(warning usually-stable build tools still exist, delete the $(STABLE_BIN) folder to rebuild them),)
Expand Down
2 changes: 1 addition & 1 deletion common/membership/resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo
return m, nil
}
}
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1)
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(metrics.RingResolverError)
return HostInfo{}, fmt.Errorf("host not found in service %s: %s", service, address)
}

Expand Down
59 changes: 38 additions & 21 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,12 @@ type (

// metricDefinition contains the definition for a metric
metricDefinition struct {
metricType MetricType // metric type
metricName MetricName // metric name
metricRollupName MetricName // optional. if non-empty, this name must be used for rolled-up version of this metric
buckets tally.Buckets // buckets if we are emitting histograms
metricType MetricType // metric type
metricName MetricName // metric name
metricRollupName MetricName // optional. if non-empty, this name must be used for rolled-up version of this metric
buckets tally.Buckets // buckets if we are emitting histograms
exponentialBuckets histogrammy[SubsettableHistogram]
intExponentialBuckets histogrammy[IntSubsettableHistogram]
}

// scopeDefinition holds the tag definitions for a scope
Expand All @@ -53,6 +55,10 @@ type (
ServiceIdx int
)

func (s scopeDefinition) GetOperationString() string {
return s.operation
}

// MetricTypes which are supported
const (
Counter MetricType = iota
Expand Down Expand Up @@ -1068,7 +1074,7 @@ const (
// -- Operation scopes for History service --
const (
// HistoryStartWorkflowExecutionScope tracks StartWorkflowExecution API calls received by service
HistoryStartWorkflowExecutionScope = iota + NumCommonScopes
HistoryStartWorkflowExecutionScope = iota + NumFrontendScopes
// HistoryRecordActivityTaskHeartbeatScope tracks RecordActivityTaskHeartbeat API calls received by service
HistoryRecordActivityTaskHeartbeatScope
// HistoryRespondDecisionTaskCompletedScope tracks RespondDecisionTaskCompleted API calls received by service
Expand Down Expand Up @@ -1356,7 +1362,7 @@ const (
// -- Operation scopes for Matching service --
const (
// PollForDecisionTaskScope tracks PollForDecisionTask API calls received by service
MatchingPollForDecisionTaskScope = iota + NumCommonScopes
MatchingPollForDecisionTaskScope = iota + NumHistoryScopes
// PollForActivityTaskScope tracks PollForActivityTask API calls received by service
MatchingPollForActivityTaskScope
// MatchingAddActivityTaskScope tracks AddActivityTask API calls received by service
Expand Down Expand Up @@ -1392,7 +1398,7 @@ const (
// -- Operation scopes for Worker service --
const (
// ReplicationScope is the scope used by all metric emitted by replicator
ReplicatorScope = iota + NumCommonScopes
ReplicatorScope = iota + NumMatchingScopes
// DomainReplicationTaskScope is the scope used by domain task replication processing
DomainReplicationTaskScope
// ESProcessorScope is scope used by all metric emitted by esProcessor
Expand Down Expand Up @@ -1440,7 +1446,7 @@ const (
// -- Operation scopes for ShardDistributor service --
const (
// ShardDistributorGetShardOwnerScope tracks GetShardOwner API calls received by service
ShardDistributorGetShardOwnerScope = iota + NumCommonScopes
ShardDistributorGetShardOwnerScope = iota + NumWorkerScopes
ShardDistributorHeartbeatScope
ShardDistributorAssignLoopScope

Expand Down Expand Up @@ -2398,6 +2404,8 @@ const (
ActiveClusterManagerLookupFailureCount
ActiveClusterManagerLookupLatency

RingResolverError

NumCommonMetrics // Needs to be last on this list for iota numbering
)

Expand All @@ -2414,6 +2422,7 @@ const (
TaskBatchCompleteCounter
TaskBatchCompleteFailure
TaskProcessingLatency
ExponentialTaskProcessingLatency
TaskQueueLatency
ScheduleToStartHistoryQueueLatencyPerTaskList
TaskRequestsOldScheduler
Expand Down Expand Up @@ -2676,6 +2685,7 @@ const (
ReplicationTaskCleanupCount
ReplicationTaskCleanupFailure
ReplicationTaskLatency
ExponentialReplicationTaskLatency
MutableStateChecksumMismatch
MutableStateChecksumInvalidated
FailoverMarkerCount
Expand All @@ -2700,12 +2710,13 @@ const (
VirtualQueueCountGauge
VirtualQueuePausedGauge
VirtualQueueRunningGauge

NumHistoryMetrics
)

// Matching metrics enum
const (
PollSuccessPerTaskListCounter = iota + NumCommonMetrics
PollSuccessPerTaskListCounter = iota + NumHistoryMetrics
PollTimeoutPerTaskListCounter
PollSuccessWithSyncPerTaskListCounter
LeaseRequestPerTaskListCounter
Expand Down Expand Up @@ -2783,12 +2794,13 @@ const (
IsolationGroupUpscale
IsolationGroupDownscale
PartitionDrained

NumMatchingMetrics
)

// Worker metrics enum
const (
ReplicatorMessages = iota + NumCommonMetrics
ReplicatorMessages = iota + NumMatchingMetrics
ReplicatorFailures
ReplicatorMessagesDropped
ReplicatorLatency
Expand Down Expand Up @@ -2872,12 +2884,13 @@ const (
DiagnosticsWorkflowStartedCount
DiagnosticsWorkflowSuccess
DiagnosticsWorkflowExecutionLatency

NumWorkerMetrics
)

// ShardDistributor metrics enum
const (
ShardDistributorRequests = iota + NumCommonMetrics
ShardDistributorRequests = iota + NumWorkerMetrics
ShardDistributorFailures
ShardDistributorLatency
ShardDistributorErrContextTimeoutCounter
Expand Down Expand Up @@ -3178,18 +3191,21 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
ActiveClusterManagerLookupSuccessCount: {metricName: "active_cluster_manager_lookup_success_count", metricType: Counter},
ActiveClusterManagerLookupFailureCount: {metricName: "active_cluster_manager_lookup_failure_count", metricType: Counter},
ActiveClusterManagerLookupLatency: {metricName: "active_cluster_manager_lookup_latency", metricType: Histogram, buckets: ExponentialDurationBuckets},

RingResolverError: {metricName: "ring_resolver_error", metricType: Counter},
},
History: {
TaskRequests: {metricName: "task_requests", metricType: Counter},
TaskLatency: {metricName: "task_latency", metricType: Timer},
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
TaskFailures: {metricName: "task_errors", metricType: Counter},
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
TaskRequests: {metricName: "task_requests", metricType: Counter},
TaskLatency: {metricName: "task_latency", metricType: Timer},
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer},
TaskFailures: {metricName: "task_errors", metricType: Counter},
TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter},
TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter},
TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter},
TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter},
TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer},
ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms10s},
TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer},
ScheduleToStartHistoryQueueLatencyPerTaskList: {metricName: "schedule_to_start_history_queue_latency_per_tl", metricType: Timer},
TaskRequestsOldScheduler: {metricName: "task_requests_old_scheduler", metricType: Counter},
TaskRequestsNewScheduler: {metricName: "task_requests_new_scheduler", metricType: Counter},
Expand Down Expand Up @@ -3444,6 +3460,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{
ReplicationTaskCleanupCount: {metricName: "replication_task_cleanup_count", metricType: Counter},
ReplicationTaskCleanupFailure: {metricName: "replication_task_cleanup_failed", metricType: Counter},
ReplicationTaskLatency: {metricName: "replication_task_latency", metricType: Timer},
ExponentialReplicationTaskLatency: {metricName: "replication_task_latency_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h},
MutableStateChecksumMismatch: {metricName: "mutable_state_checksum_mismatch", metricType: Counter},
MutableStateChecksumInvalidated: {metricName: "mutable_state_checksum_invalidated", metricType: Counter},
FailoverMarkerCount: {metricName: "failover_marker_count", metricType: Counter},
Expand Down
33 changes: 33 additions & 0 deletions common/metrics/defs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,39 @@ func TestMetricDefs(t *testing.T) {
}
}

// "index -> operation" must be unique for structured.DynamicOperationTags' int lookup to work consistently.
// Duplicate indexes with the same operation name are technically fine, but there doesn't seem to be any benefit in allowing it,
// and it trivially ensures that all indexes have only one operation name.
func TestOperationIndexesAreUnique(t *testing.T) {
seen := make(map[int]bool)
for serviceIdx, serviceOps := range ScopeDefs {
for idx := range serviceOps {
if seen[idx] {
t.Error("duplicate operation index:", idx, "with name:", serviceOps[idx].operation, "in service:", serviceIdx)
}
seen[idx] = true
}
}
}

func TestMetricsAreUnique(t *testing.T) {
// Duplicate indexes is arguably fine, but there doesn't seem to be any benefit in allowing it.
//
// Duplicate names are also linted, but they're done via an analyzer (metricslint) instead, to
// allow checking across multiple formats.
t.Run("indexes", func(t *testing.T) {
seen := make(map[int]bool)
for _, serviceMetrics := range MetricDefs {
for idx := range serviceMetrics {
if seen[idx] {
t.Error("duplicate metric index:", idx, "with name:", serviceMetrics[idx].metricName)
}
seen[idx] = true
}
}
})
}

func TestExponentialDurationBuckets(t *testing.T) {
factor := math.Pow(2, 0.25)
assert.Equal(t, 80, len(ExponentialDurationBuckets))
Expand Down
Loading
Loading