diff --git a/Makefile b/Makefile index 51bd35f16a6..753970abe46 100644 --- a/Makefile +++ b/Makefile @@ -64,7 +64,7 @@ $(BUILD)/goversion-lint: $(BUILD)/fmt: $(BUILD)/codegen # formatting must occur only after all other go-file-modifications are done # $(BUILD)/copyright # $(BUILD)/copyright: $(BUILD)/codegen # must add copyright to generated code, sometimes needs re-formatting -$(BUILD)/codegen: $(BUILD)/thrift $(BUILD)/protoc +$(BUILD)/codegen: $(BUILD)/thrift $(BUILD)/protoc $(BUILD)/metrics $(BUILD)/thrift: $(BUILD)/go_mod_check $(BUILD)/protoc: $(BUILD)/go_mod_check $(BUILD)/go_mod_check: @@ -211,6 +211,9 @@ $(BIN)/protoc-gen-gogofast: go.mod go.work | $(BIN) $(BIN)/protoc-gen-yarpc-go: go.mod go.work | $(BIN) $(call go_mod_build_tool,go.uber.org/yarpc/encoding/protobuf/protoc-gen-yarpc-go) +$(BIN)/metricslint: internal/tools/go.mod go.work $(wildcard internal/tools/metricslint/* internal/tools/metricslint/cmd/*) | $(BIN) + $(call go_build_tool,./metricslint/cmd,metricslint) + $(BUILD)/go_mod_check: go.mod internal/tools/go.mod go.work $Q # generated == used is occasionally important for gomock / mock libs in general. this is not a definite problem if violated though. $Q ./scripts/check-gomod-version.sh github.com/golang/mock/gomock $(if $(verbose),-v) @@ -404,6 +407,11 @@ $(BUILD)/code-lint: $(LINT_SRC) $(BIN)/revive | $(BUILD) fi $Q touch $@ +$(BUILD)/metrics-lint: $(ALL_SRC) $(BIN)/metricslint | $(BUILD) + $Q echo "linting metrics definitions..." + $Q $(BIN_PATH) $(BIN)/metricslint -skip cadence_requests_per_tl,2 -skip cache_hit,2 -skip cache_full,2 -skip cache_miss,2 -skip cross_cluster_fetch_errors,2 ./... + $Q touch $@ + $(BUILD)/goversion-lint: go.work Dockerfile docker/github_actions/Dockerfile${DOCKERFILE_SUFFIX} $Q echo "checking go version..." $Q # intentionally using go.work toolchain, as GOTOOLCHAIN is user-overridable @@ -458,7 +466,7 @@ endef # useful to actually re-run to get output again. 
# reuse the intermediates for simplicity and consistency. lint: ## (Re)run the linter - $(call remake,proto-lint gomod-lint code-lint goversion-lint) + $(call remake,proto-lint gomod-lint code-lint goversion-lint metrics-lint) # intentionally not re-making, it's a bit slow and it's clear when it's unnecessary fmt: $(BUILD)/fmt ## Run `gofmt` / organize imports / etc @@ -547,7 +555,7 @@ tools: $(TOOLS) go-generate: $(BIN)/mockgen $(BIN)/enumer $(BIN)/mockery $(BIN)/gowrap ## Run `go generate` to regen mocks, enums, etc $Q echo "running go generate ./..., this takes a minute or more..." $Q # add our bins to PATH so `go generate` can find them - $Q $(BIN_PATH) go generate $(if $(verbose),-v) ./... + $Q $(BIN_PATH) go generate $(if $(verbose),-v) ./... $Q $(MAKE) --no-print-directory fmt # $Q echo "updating copyright headers" # $Q $(MAKE) --no-print-directory copyright @@ -577,7 +585,7 @@ tidy: ## `go mod tidy` all packages clean: ## Clean build products and SQLite database rm -f $(BINS) rm -Rf $(BUILD) - rm *.db + rm -f *.db $(if \ $(wildcard $(STABLE_BIN)/*), \ $(warning usually-stable build tools still exist, delete the $(STABLE_BIN) folder to rebuild them),) diff --git a/common/membership/resolver.go b/common/membership/resolver.go index 28b5162db21..a06309dd899 100644 --- a/common/membership/resolver.go +++ b/common/membership/resolver.go @@ -217,7 +217,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo return m, nil } } - rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1) + rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(metrics.RingResolverError) return HostInfo{}, fmt.Errorf("host not found in service %s: %s", service, address) } diff --git a/common/metrics/defs.go b/common/metrics/defs.go index 00214a52649..6918349261c 100644 --- a/common/metrics/defs.go +++ b/common/metrics/defs.go @@ -37,10 +37,12 @@ type ( // metricDefinition contains the definition for a metric metricDefinition
struct { - metricType MetricType // metric type - metricName MetricName // metric name - metricRollupName MetricName // optional. if non-empty, this name must be used for rolled-up version of this metric - buckets tally.Buckets // buckets if we are emitting histograms + metricType MetricType // metric type + metricName MetricName // metric name + metricRollupName MetricName // optional. if non-empty, this name must be used for rolled-up version of this metric + buckets tally.Buckets // buckets if we are emitting histograms + exponentialBuckets histogrammy[SubsettableHistogram] + intExponentialBuckets histogrammy[IntSubsettableHistogram] } // scopeDefinition holds the tag definitions for a scope @@ -53,6 +55,10 @@ type ( ServiceIdx int ) +func (s scopeDefinition) GetOperationString() string { + return s.operation +} + // MetricTypes which are supported const ( Counter MetricType = iota @@ -1068,7 +1074,7 @@ const ( // -- Operation scopes for History service -- const ( // HistoryStartWorkflowExecutionScope tracks StartWorkflowExecution API calls received by service - HistoryStartWorkflowExecutionScope = iota + NumCommonScopes + HistoryStartWorkflowExecutionScope = iota + NumFrontendScopes // HistoryRecordActivityTaskHeartbeatScope tracks RecordActivityTaskHeartbeat API calls received by service HistoryRecordActivityTaskHeartbeatScope // HistoryRespondDecisionTaskCompletedScope tracks RespondDecisionTaskCompleted API calls received by service @@ -1356,7 +1362,7 @@ const ( // -- Operation scopes for Matching service -- const ( // PollForDecisionTaskScope tracks PollForDecisionTask API calls received by service - MatchingPollForDecisionTaskScope = iota + NumCommonScopes + MatchingPollForDecisionTaskScope = iota + NumHistoryScopes // PollForActivityTaskScope tracks PollForActivityTask API calls received by service MatchingPollForActivityTaskScope // MatchingAddActivityTaskScope tracks AddActivityTask API calls received by service @@ -1392,7 +1398,7 @@ const ( // -- 
Operation scopes for Worker service -- const ( // ReplicationScope is the scope used by all metric emitted by replicator - ReplicatorScope = iota + NumCommonScopes + ReplicatorScope = iota + NumMatchingScopes // DomainReplicationTaskScope is the scope used by domain task replication processing DomainReplicationTaskScope // ESProcessorScope is scope used by all metric emitted by esProcessor @@ -1440,7 +1446,7 @@ const ( // -- Operation scopes for ShardDistributor service -- const ( // ShardDistributorGetShardOwnerScope tracks GetShardOwner API calls received by service - ShardDistributorGetShardOwnerScope = iota + NumCommonScopes + ShardDistributorGetShardOwnerScope = iota + NumWorkerScopes ShardDistributorHeartbeatScope ShardDistributorAssignLoopScope @@ -2398,6 +2404,8 @@ const ( ActiveClusterManagerLookupFailureCount ActiveClusterManagerLookupLatency + RingResolverError + NumCommonMetrics // Needs to be last on this list for iota numbering ) @@ -2414,6 +2422,7 @@ const ( TaskBatchCompleteCounter TaskBatchCompleteFailure TaskProcessingLatency + ExponentialTaskProcessingLatency TaskQueueLatency ScheduleToStartHistoryQueueLatencyPerTaskList TaskRequestsOldScheduler @@ -2676,6 +2685,7 @@ const ( ReplicationTaskCleanupCount ReplicationTaskCleanupFailure ReplicationTaskLatency + ExponentialReplicationTaskLatency MutableStateChecksumMismatch MutableStateChecksumInvalidated FailoverMarkerCount @@ -2700,12 +2710,13 @@ const ( VirtualQueueCountGauge VirtualQueuePausedGauge VirtualQueueRunningGauge + NumHistoryMetrics ) // Matching metrics enum const ( - PollSuccessPerTaskListCounter = iota + NumCommonMetrics + PollSuccessPerTaskListCounter = iota + NumHistoryMetrics PollTimeoutPerTaskListCounter PollSuccessWithSyncPerTaskListCounter LeaseRequestPerTaskListCounter @@ -2783,12 +2794,13 @@ const ( IsolationGroupUpscale IsolationGroupDownscale PartitionDrained + NumMatchingMetrics ) // Worker metrics enum const ( - ReplicatorMessages = iota + NumCommonMetrics + 
ReplicatorMessages = iota + NumMatchingMetrics ReplicatorFailures ReplicatorMessagesDropped ReplicatorLatency @@ -2872,12 +2884,13 @@ const ( DiagnosticsWorkflowStartedCount DiagnosticsWorkflowSuccess DiagnosticsWorkflowExecutionLatency + NumWorkerMetrics ) // ShardDistributor metrics enum const ( - ShardDistributorRequests = iota + NumCommonMetrics + ShardDistributorRequests = iota + NumWorkerMetrics ShardDistributorFailures ShardDistributorLatency ShardDistributorErrContextTimeoutCounter @@ -3178,18 +3191,21 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{ ActiveClusterManagerLookupSuccessCount: {metricName: "active_cluster_manager_lookup_success_count", metricType: Counter}, ActiveClusterManagerLookupFailureCount: {metricName: "active_cluster_manager_lookup_failure_count", metricType: Counter}, ActiveClusterManagerLookupLatency: {metricName: "active_cluster_manager_lookup_latency", metricType: Histogram, buckets: ExponentialDurationBuckets}, + + RingResolverError: {metricName: "ring_resolver_error", metricType: Counter}, }, History: { - TaskRequests: {metricName: "task_requests", metricType: Counter}, - TaskLatency: {metricName: "task_latency", metricType: Timer}, - TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer}, - TaskFailures: {metricName: "task_errors", metricType: Counter}, - TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter}, - TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter}, - TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter}, - TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter}, - TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer}, - TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer}, + TaskRequests: {metricName: "task_requests", metricType: Counter}, + TaskLatency: {metricName: "task_latency", metricType: Timer}, + 
TaskAttemptTimer: {metricName: "task_attempt", metricType: Timer}, + TaskFailures: {metricName: "task_errors", metricType: Counter}, + TaskDiscarded: {metricName: "task_errors_discarded", metricType: Counter}, + TaskStandbyRetryCounter: {metricName: "task_errors_standby_retry_counter", metricType: Counter}, + TaskNotActiveCounter: {metricName: "task_errors_not_active_counter", metricType: Counter}, + TaskLimitExceededCounter: {metricName: "task_errors_limit_exceeded_counter", metricType: Counter}, + TaskProcessingLatency: {metricName: "task_latency_processing", metricType: Timer}, + ExponentialTaskProcessingLatency: {metricName: "task_latency_processing_ns", metricType: Histogram, exponentialBuckets: Low1ms10s}, + TaskQueueLatency: {metricName: "task_latency_queue", metricType: Timer}, ScheduleToStartHistoryQueueLatencyPerTaskList: {metricName: "schedule_to_start_history_queue_latency_per_tl", metricType: Timer}, TaskRequestsOldScheduler: {metricName: "task_requests_old_scheduler", metricType: Counter}, TaskRequestsNewScheduler: {metricName: "task_requests_new_scheduler", metricType: Counter}, @@ -3444,6 +3460,7 @@ var MetricDefs = map[ServiceIdx]map[int]metricDefinition{ ReplicationTaskCleanupCount: {metricName: "replication_task_cleanup_count", metricType: Counter}, ReplicationTaskCleanupFailure: {metricName: "replication_task_cleanup_failed", metricType: Counter}, ReplicationTaskLatency: {metricName: "replication_task_latency", metricType: Timer}, + ExponentialReplicationTaskLatency: {metricName: "replication_task_latency_ns", metricType: Histogram, exponentialBuckets: Mid1ms24h}, MutableStateChecksumMismatch: {metricName: "mutable_state_checksum_mismatch", metricType: Counter}, MutableStateChecksumInvalidated: {metricName: "mutable_state_checksum_invalidated", metricType: Counter}, FailoverMarkerCount: {metricName: "failover_marker_count", metricType: Counter}, diff --git a/common/metrics/defs_test.go b/common/metrics/defs_test.go index 122aa359645..db6c15a96a3 
100644 --- a/common/metrics/defs_test.go +++ b/common/metrics/defs_test.go @@ -129,6 +129,39 @@ func TestMetricDefs(t *testing.T) { } } +// "index -> operation" must be unique for structured.DynamicOperationTags' int lookup to work consistently. +// Duplicate indexes with the same operation name are technically fine, but there doesn't seem to be any benefit in allowing it, +// and it trivially ensures that all indexes have only one operation name. +func TestOperationIndexesAreUnique(t *testing.T) { + seen := make(map[int]bool) + for serviceIdx, serviceOps := range ScopeDefs { + for idx := range serviceOps { + if seen[idx] { + t.Error("duplicate operation index:", idx, "with name:", serviceOps[idx].operation, "in service:", serviceIdx) + } + seen[idx] = true + } + } +} + +func TestMetricsAreUnique(t *testing.T) { + // Duplicate indexes is arguably fine, but there doesn't seem to be any benefit in allowing it. + // + // Duplicate names are also linted, but they're done via an analyzer (metricslint) instead, to + // allow checking across multiple formats. + t.Run("indexes", func(t *testing.T) { + seen := make(map[int]bool) + for _, serviceMetrics := range MetricDefs { + for idx := range serviceMetrics { + if seen[idx] { + t.Error("duplicate metric index:", idx, "with name:", serviceMetrics[idx].metricName) + } + seen[idx] = true + } + } + }) +} + func TestExponentialDurationBuckets(t *testing.T) { factor := math.Pow(2, 0.25) assert.Equal(t, 80, len(ExponentialDurationBuckets)) diff --git a/common/metrics/histograms.go b/common/metrics/histograms.go new file mode 100644 index 00000000000..80b26ada4c8 --- /dev/null +++ b/common/metrics/histograms.go @@ -0,0 +1,236 @@ +package metrics + +import ( + "fmt" + "math" + "slices" + "time" + + "github.com/uber-go/tally" +) + +// Nearly all histograms should use pre-defined buckets here, to encourage consistency. 
+// Name them more by their intent than their exact ranges - if you need something outside the intent, +// make a new bucket. +// +// Extreme, rare cases should still create a new bucket here, not declare one in-line elsewhere. +// +// Partly this helps us document our ranges, and partly it ensures that any customized emitter +// can easily detect buckets with an `==` comparison, and can choose to use a different one +// if needed (e.g. to reduce scale if it is too costly to support). +var ( + // Default1ms10m is our "default" set of buckets, targeting 1ms through 100s, + // and is "rounded up" slightly to reach 80 buckets == 16 minutes (100s needs 68 buckets), + // plus multi-minute exceptions are common enough to support for the small additional cost. + // + // If you need sub-millisecond precision, or substantially longer durations than about 1 minute, + // consider using a different histogram. + Default1ms10m = makeSubsettableHistogram(time.Millisecond, 2, func(last time.Duration, length int) bool { + return last >= 10*time.Minute && length == 80 + }) + + // High1ms24h covers things like activity latency, where "longer than 1 day" is not + // particularly worth being precise about. + // + // This uses a lot of buckets, so it must be used with relatively-low cardinality elsewhere, + // and/or consider dual emitting (Mid1ms24h or lower) so overviews can be queried efficiently. + High1ms24h = makeSubsettableHistogram(time.Millisecond, 2, func(last time.Duration, length int) bool { + // 24h leads to 108 buckets, raise to 112 for cleaner subsetting + return last >= 24*time.Hour && length == 112 + }) + + // Mid1ms24h is a one-scale-lower version of High1ms24h, + // for use when we know it's too detailed to be worth emitting. 
+ // + // This uses 57 buckets, half of High1ms24h's 113 + Mid1ms24h = High1ms24h.subsetTo(1) + + // Low1ms10s is for things that are usually very fast, like most individual database calls, + // and is MUCH lower cardinality than Default1ms10m so it's more suitable for things like per-shard metrics. + Low1ms10s = makeSubsettableHistogram(time.Millisecond, 1, func(last time.Duration, length int) bool { + // 10s needs 26 buckets, raise to 32 for cleaner subsetting + return last >= 10*time.Second && length == 32 + }) + + // Mid1To32k is a histogram for small counters, like "how many replication tasks did we receive". + // This targets 1 to 32k + Mid1To32k = IntSubsettableHistogram(makeSubsettableHistogram(1, 2, func(last time.Duration, length int) bool { + // 10k needs 56 buckets, raise to 64 for cleaner subsetting + return last >= 10000 && length == 64 + })) +) + +// SubsettableHistogram is a duration-based histogram that can be subset to a lower scale +// in a standardized, predictable way. It is intentionally compatible with Prometheus and OTEL's +// "exponential histograms": https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram +// +// These histograms MUST always have a "_ns" suffix in their name to avoid confusion with timers. +type SubsettableHistogram struct { + tallyBuckets tally.DurationBuckets + + scale int +} + +// IntSubsettableHistogram is a non-duration-based integer-distribution histogram, otherwise identical +// to SubsettableHistogram but built as a separate type so you cannot pass the wrong one. +// +// These histograms MUST always have a "_counts" suffix in their name to avoid confusion with timers, +// or modify the Emitter to allow different suffixes if something else reads better. +type IntSubsettableHistogram SubsettableHistogram + +// currently we have no apparent need for float-histograms, +// as all our value ranges go from >=1 to many thousands, where +// decimal precision is pointless and mostly just looks bad. 
+// +// if we ever need 0..1 precision in histograms, we can add them then. +// type FloatSubsettableHistogram struct { +// tally.ValueBuckets +// scale int +// } + +func (s SubsettableHistogram) subsetTo(newScale int) SubsettableHistogram { + if newScale >= s.scale { + panic(fmt.Sprintf("scale %v is not less than the current scale %v", newScale, s.scale)) + } + if newScale < 0 { + panic(fmt.Sprintf("negative scales (%v == greater than *2 per step) are possible, but do not have tests yet", newScale)) + } + dup := SubsettableHistogram{ + tallyBuckets: slices.Clone(s.tallyBuckets), + scale: s.scale, + } + // remove every other bucket per -1 scale + for dup.scale > newScale { + if (len(dup.tallyBuckets)-1)%2 != 0 { + panic(fmt.Sprintf("cannot subset from scale %v to %v, %v-buckets is not divisible by 2", dup.scale, dup.scale-1, len(dup.tallyBuckets)-1)) + } + half := make(tally.DurationBuckets, 0, ((len(dup.tallyBuckets)-1)/2)+1) + half = append(half, dup.tallyBuckets[0]) // keep the zero value + for i := 1; i < len(dup.tallyBuckets); i += 2 { + half = append(half, dup.tallyBuckets[i]) // add first, third, etc + } + dup.tallyBuckets = half + dup.scale-- + } + return dup +} + +func (i IntSubsettableHistogram) subsetTo(newScale int) IntSubsettableHistogram { + return IntSubsettableHistogram(SubsettableHistogram(i).subsetTo(newScale)) +} + +// makeSubsettableHistogram is a replacement for tally.MustMakeExponentialDurationBuckets, +// tailored to make "construct a range" or "ensure N buckets" simpler for OTEL-compatible exponential histograms +// (i.e. https://opentelemetry.io/docs/specs/otel/metrics/data-model/#exponentialhistogram), +// ensures values are as precise as possible (preventing display-space-wasteful numbers like 3.99996ms), +// and that all histograms start with a 0 value (else erroneous negative values are impossible to notice). 
+// +// Start time must be positive, scale must be 0..3 (inclusive), and the loop MUST stop within 320 or fewer steps, +// to prevent extremely-costly histograms. +// Even 320 is far too many tbh, target 160 for very-high-value data, and generally around 80 or less (ideally a value +// that is divisible by 2 many times). +// +// The stop callback will be given the current largest value and the number of values generated so far. +// This excludes the zero value (you should ignore it anyway), so you can stop at your target value, +// or == a highly-divisible-by-2 number (do not go over). +// The buckets produced may be padded further to reach a "full" power-of-2 row, as this simplifies math elsewhere +// and costs very little compared to the rest of the histogram. +// +// For all values produced, please add a test to print the concrete values, and record the length and maximum value +// so they can be quickly checked when reading. +func makeSubsettableHistogram(start time.Duration, scale int, stop func(last time.Duration, length int) bool) SubsettableHistogram { + if start <= 0 { + panic(fmt.Sprintf("start must be greater than 0 or it will not grow exponentially, got %v", start)) + } + if scale < 0 || scale > 3 { + // anything outside this range is currently not expected and probably a mistake + panic(fmt.Sprintf("scale must be between 0 (grows by *2) and 3 (grows by *2^1/8), got scale: %v", scale)) + } + buckets := tally.DurationBuckets{ + time.Duration(0), // else "too low" and "negative" are impossible to tell apart. + // ^ note this must be excluded from calculations below, hence -1 everywhere. + } + for { + if len(buckets) > 320 { + panic(fmt.Sprintf("over 320 buckets is too many, choose a smaller range or smaller scale. "+ + "started at: %v, scale: %v, last value: %v", + start, scale, buckets[len(buckets)-1])) + } + buckets = append(buckets, nextBucket(start, len(buckets)-1, scale)) + + // stop when requested. 
+ if stop(buckets[len(buckets)-1], len(buckets)-1) { + break + } + } + + // fill in as many buckets as are necessary to make a full "row", i.e. just + // before the next power of 2 from the original value. + // this ensures subsetting keeps "round" numbers as long as possible. + powerOfTwoWidth := int(math.Pow(2, float64(scale))) // num of buckets needed to double a value + for (len(buckets)-1)%powerOfTwoWidth != 0 { + buckets = append(buckets, nextBucket(start, len(buckets)-1, scale)) + } + return SubsettableHistogram{ + tallyBuckets: buckets, + scale: scale, + } +} + +func nextBucket(start time.Duration, num int, scale int) time.Duration { + // calculating it from `start` each time reduces floating point error, ensuring "clean" multiples + // at every power of 2 (and possibly others), instead of e.g. "1ms ... 1.9999994ms" which occurs + // if you try to build from the previous value each time. + return time.Duration( + float64(start) * + math.Pow(2, float64(num)/math.Pow(2, float64(scale)))) +} + +func (s SubsettableHistogram) histScale() int { return s.scale } +func (s SubsettableHistogram) width() int { return int(math.Pow(2, float64(s.scale))) } +func (s SubsettableHistogram) len() int { return len(s.tallyBuckets) } +func (s SubsettableHistogram) start() time.Duration { return s.tallyBuckets[1] } +func (s SubsettableHistogram) end() time.Duration { return s.tallyBuckets[len(s.tallyBuckets)-1] } +func (s SubsettableHistogram) buckets() tally.DurationBuckets { return s.tallyBuckets } +func (s SubsettableHistogram) print(to func(string, ...any)) { + to("%v\n", s.tallyBuckets[0:1]) // zero value on its own row + for rowStart := 1; rowStart < s.len(); rowStart += s.width() { + to("%v\n", s.tallyBuckets[rowStart:rowStart+s.width()]) + } +} + +func (i IntSubsettableHistogram) histScale() int { return i.scale } +func (i IntSubsettableHistogram) width() int { return int(math.Pow(2, float64(i.scale))) } +func (i IntSubsettableHistogram) len() int { return 
len(i.tallyBuckets) } +func (i IntSubsettableHistogram) start() time.Duration { return i.tallyBuckets[1] } +func (i IntSubsettableHistogram) end() time.Duration { + return i.tallyBuckets[len(i.tallyBuckets)-1] +} +func (i IntSubsettableHistogram) buckets() tally.DurationBuckets { return i.tallyBuckets } +func (i IntSubsettableHistogram) print(to func(string, ...any)) { + // fairly unreadable as duration-strings, so convert to int by hand + to("[%d]\n", int(i.tallyBuckets[0])) // zero value on its own row + for rowStart := 1; rowStart < i.len(); rowStart += i.width() { + ints := make([]int, 0, i.width()) + for _, d := range i.tallyBuckets[rowStart : rowStart+i.width()] { + ints = append(ints, int(d)) + } + to("%v\n", ints) + } +} + +var _ histogrammy[SubsettableHistogram] = SubsettableHistogram{} +var _ histogrammy[IntSubsettableHistogram] = IntSubsettableHistogram{} + +// internal utility/test methods, but could be exposed if there's a use for it +type histogrammy[T any] interface { + histScale() int // exponential scale value. 0..3 inclusive. + width() int // number of values per power of 2 == how wide to print each row. 1, 2, 4, or 8. 
+ len() int // number of buckets + start() time.Duration // first non-zero bucket + end() time.Duration // last bucket + buckets() tally.DurationBuckets // access to all buckets + subsetTo(newScale int) T // generic so specific types can be returned + + print(to func(string, ...any)) // test-oriented printer +} diff --git a/common/metrics/histograms_test.go b/common/metrics/histograms_test.go new file mode 100644 index 00000000000..dddb12093f9 --- /dev/null +++ b/common/metrics/histograms_test.go @@ -0,0 +1,191 @@ +package metrics + +import ( + "fmt" + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestHistogramValues(t *testing.T) { + t.Run("default_1ms_to_10m", func(t *testing.T) { + checkHistogram(t, Default1ms10m, 81, + ` +[0s] +[1ms 1.189207ms 1.414213ms 1.681792ms] +[2ms 2.378414ms 2.828427ms 3.363585ms] +[4ms 4.756828ms 5.656854ms 6.727171ms] +[8ms 9.513656ms 11.313708ms 13.454342ms] +[16ms 19.027313ms 22.627416ms 26.908685ms] +[32ms 38.054627ms 45.254833ms 53.81737ms] +[64ms 76.109255ms 90.509667ms 107.634741ms] +[128ms 152.21851ms 181.019335ms 215.269482ms] +[256ms 304.437021ms 362.038671ms 430.538964ms] +[512ms 608.874042ms 724.077343ms 861.077929ms] +[1.024s 1.217748085s 1.448154687s 1.722155858s] +[2.048s 2.435496171s 2.896309375s 3.444311716s] +[4.096s 4.870992343s 5.792618751s 6.888623433s] +[8.192s 9.741984686s 11.585237502s 13.777246867s] +[16.384s 19.483969372s 23.170475005s 27.554493735s] +[32.768s 38.967938744s 46.340950011s 55.10898747s] +[1m5.536s 1m17.935877488s 1m32.681900023s 1m50.21797494s] +[2m11.072s 2m35.871754977s 3m5.363800047s 3m40.43594988s] +[4m22.144s 5m11.743509955s 6m10.727600094s 7m20.87189976s] +[8m44.288s 10m23.48701991s 12m21.455200189s 14m41.743799521s] +`) + }) + t.Run("high_1ms_to_24h", func(t *testing.T) { + checkHistogram(t, High1ms24h, 113, + ` +[0s] +[1ms 1.189207ms 1.414213ms 1.681792ms] +[2ms 2.378414ms 2.828427ms 3.363585ms] +[4ms 4.756828ms 5.656854ms 6.727171ms] +[8ms 9.513656ms 
11.313708ms 13.454342ms] +[16ms 19.027313ms 22.627416ms 26.908685ms] +[32ms 38.054627ms 45.254833ms 53.81737ms] +[64ms 76.109255ms 90.509667ms 107.634741ms] +[128ms 152.21851ms 181.019335ms 215.269482ms] +[256ms 304.437021ms 362.038671ms 430.538964ms] +[512ms 608.874042ms 724.077343ms 861.077929ms] +[1.024s 1.217748085s 1.448154687s 1.722155858s] +[2.048s 2.435496171s 2.896309375s 3.444311716s] +[4.096s 4.870992343s 5.792618751s 6.888623433s] +[8.192s 9.741984686s 11.585237502s 13.777246867s] +[16.384s 19.483969372s 23.170475005s 27.554493735s] +[32.768s 38.967938744s 46.340950011s 55.10898747s] +[1m5.536s 1m17.935877488s 1m32.681900023s 1m50.21797494s] +[2m11.072s 2m35.871754977s 3m5.363800047s 3m40.43594988s] +[4m22.144s 5m11.743509955s 6m10.727600094s 7m20.87189976s] +[8m44.288s 10m23.48701991s 12m21.455200189s 14m41.743799521s] +[17m28.576s 20m46.974039821s 24m42.910400378s 29m23.487599042s] +[34m57.152s 41m33.948079642s 49m25.820800757s 58m46.975198084s] +[1h9m54.304s 1h23m7.896159284s 1h38m51.641601515s 1h57m33.950396168s] +[2h19m48.608s 2h46m15.792318568s 3h17m43.283203031s 3h55m7.900792337s] +[4h39m37.216s 5h32m31.584637137s 6h35m26.566406062s 7h50m15.801584674s] +[9h19m14.432s 11h5m3.169274274s 13h10m53.132812125s 15h40m31.603169349s] +[18h38m28.864s 22h10m6.338548549s 26h21m46.265624251s 31h21m3.206338698s] +[37h16m57.728s 44h20m12.677097099s 52h43m32.531248503s 62h42m6.412677396s] +`) + }) + t.Run("mid_1ms_24h", func(t *testing.T) { + checkHistogram(t, Mid1ms24h, 57, ` +[0s] +[1ms 1.414213ms] +[2ms 2.828427ms] +[4ms 5.656854ms] +[8ms 11.313708ms] +[16ms 22.627416ms] +[32ms 45.254833ms] +[64ms 90.509667ms] +[128ms 181.019335ms] +[256ms 362.038671ms] +[512ms 724.077343ms] +[1.024s 1.448154687s] +[2.048s 2.896309375s] +[4.096s 5.792618751s] +[8.192s 11.585237502s] +[16.384s 23.170475005s] +[32.768s 46.340950011s] +[1m5.536s 1m32.681900023s] +[2m11.072s 3m5.363800047s] +[4m22.144s 6m10.727600094s] +[8m44.288s 12m21.455200189s] +[17m28.576s 24m42.910400378s] 
+[34m57.152s 49m25.820800757s] +[1h9m54.304s 1h38m51.641601515s] +[2h19m48.608s 3h17m43.283203031s] +[4h39m37.216s 6h35m26.566406062s] +[9h19m14.432s 13h10m53.132812125s] +[18h38m28.864s 26h21m46.265624251s] +[37h16m57.728s 52h43m32.531248503s] +`) + }) + t.Run("mid_to_32k_ints", func(t *testing.T) { + // note: this histogram has some duplicates. + // + // this wastes a bit of memory, but Tally will choose the same index each + // time for a specific value, so it does not cause extra non-zero series + // to be stored or emitted. + // + // if this turns out to be expensive in Prometheus / etc, it's easy + // enough to start the histogram at 8, and dual-emit a non-exponential + // histogram for lower values. + checkHistogram(t, Mid1To32k, 65, ` +[0] +[1 1 1 1] +[2 2 2 3] +[4 4 5 6] +[8 9 11 13] +[16 19 22 26] +[32 38 45 53] +[64 76 90 107] +[128 152 181 215] +[256 304 362 430] +[512 608 724 861] +[1024 1217 1448 1722] +[2048 2435 2896 3444] +[4096 4870 5792 6888] +[8192 9741 11585 13777] +[16384 19483 23170 27554] +[32768 38967 46340 55108] +`) + }) + t.Run("low_cardinality_1ms_10s", func(t *testing.T) { + checkHistogram(t, Low1ms10s, 33, ` +[0s] +[1ms 1.414213ms] +[2ms 2.828427ms] +[4ms 5.656854ms] +[8ms 11.313708ms] +[16ms 22.627416ms] +[32ms 45.254833ms] +[64ms 90.509667ms] +[128ms 181.019335ms] +[256ms 362.038671ms] +[512ms 724.077343ms] +[1.024s 1.448154687s] +[2.048s 2.896309375s] +[4.096s 5.792618751s] +[8.192s 11.585237502s] +[16.384s 23.170475005s] +[32.768s 46.340950011s] +`) + }) +} + +// most histograms should pass this check, but fuzzy comparison is fine if needed for extreme cases. +func checkHistogram[T any](t *testing.T, h histogrammy[T], length int, expected string) { + var buf strings.Builder + h.print(func(s string, a ...any) { + str := fmt.Sprintf(s, a...) 
+ t.Logf("%s", str) + buf.WriteString(str) + } + if strings.TrimSpace(expected) != strings.TrimSpace(buf.String()) { + t.Error("histogram definition changed, update the test if this is intended") + } + + assert.Equal(t, length, h.len(), "wrong number of buckets") + + buckets := h.buckets() + assert.EqualValues(t, 0, buckets[0], "first bucket should always be zero") + for i := 1; i < len(buckets); i += h.width() { + if i > 1 { + // ensure good float math. + // + // this is *intentionally* doing exact comparisons, as floating point math with + // human-friendly integers have precise power-of-2 multiples for a very long time. + // + // note that the equivalent tally buckets, e.g.: + // tally.MustMakeExponentialDurationBuckets(time.Millisecond, math.Pow(2, 1.0/4.0)) + // fails this test, and the logs produced show ugly e.g. 31.999942ms values. + // tally also produces incorrect results if you start at e.g. 1, + // as it just produces endless `1` values. + assert.Equalf(t, buckets[i-h.width()]*2, buckets[i], + "current row's value (%v) is not a power of 2 greater than previous (%v), skewed / bad math?", + buckets[i], buckets[i-h.width()]) + } + } +} diff --git a/common/metrics/interfaces.go b/common/metrics/interfaces.go index f36993e5f4a..695565a5b38 100644 --- a/common/metrics/interfaces.go +++ b/common/metrics/interfaces.go @@ -65,6 +65,10 @@ type ( RecordHistogramDuration(timer int, d time.Duration) // RecordHistogramValue records a histogram value for the given metric name RecordHistogramValue(timer int, value float64) + // ExponentialHistogram records a subsettable exponential histogram value for the given metric name + ExponentialHistogram(hist int, d time.Duration) + // IntExponentialHistogram records a subsettable exponential histogram value for the given metric name + IntExponentialHistogram(hist int, value int) // UpdateGauge reports Gauge type absolute value metric UpdateGauge(gauge int, value float64) // Tagged return an internal scope that can be used to add
additional diff --git a/common/metrics/metricsfx/metricsfx.go b/common/metrics/metricsfx/metricsfx.go index 56702a26107..98aa7d10151 100644 --- a/common/metrics/metricsfx/metricsfx.go +++ b/common/metrics/metricsfx/metricsfx.go @@ -34,7 +34,8 @@ import ( // Module provides metrics client for fx application. var Module = fx.Module("metricsfx", - fx.Provide(buildClient)) + fx.Provide(buildClient), +) // ModuleForExternalScope provides metrics client for fx application when tally.Scope is created outside. var ModuleForExternalScope = fx.Module("metricsfx", diff --git a/common/metrics/mocks/Scope.go b/common/metrics/mocks/Scope.go index 8456725404c..e8f95750f22 100644 --- a/common/metrics/mocks/Scope.go +++ b/common/metrics/mocks/Scope.go @@ -54,6 +54,14 @@ func (_m *Scope) RecordHistogramValue(timer int, value float64) { _m.Called(timer, value) } +func (_m *Scope) ExponentialHistogram(hist int, d time.Duration) { + _m.Called(hist, d) +} + +func (_m *Scope) IntExponentialHistogram(hist int, value int) { + _m.Called(hist, value) +} + // RecordTimer provides a mock function with given fields: timer, d func (_m *Scope) RecordTimer(timer int, d time.Duration) { _m.Called(timer, d) diff --git a/common/metrics/nop.go b/common/metrics/nop.go index 5ccc9d15de5..fccbdb082ee 100644 --- a/common/metrics/nop.go +++ b/common/metrics/nop.go @@ -27,9 +27,9 @@ import ( ) var ( - NoopClient = &noopClientImpl{} - NoopScope = &noopScopeImpl{} - NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{}) + NoopClient Client = &noopClientImpl{} + NoopScope Scope = &noopScopeImpl{} + NoopStopwatch = tally.NewStopwatch(time.Now(), &nopStopwatchRecorder{}) ) type nopStopwatchRecorder struct{} @@ -74,6 +74,10 @@ func NewNoopMetricsClient() Client { type noopScopeImpl struct{} +func (n *noopScopeImpl) ExponentialHistogram(hist int, d time.Duration) {} + +func (n *noopScopeImpl) IntExponentialHistogram(hist int, value int) {} + func (n *noopScopeImpl) IncCounter(counter int) { } diff 
--git a/common/metrics/scope.go b/common/metrics/scope.go
index 13e1a351795..4002651a63b 100644
--- a/common/metrics/scope.go
+++ b/common/metrics/scope.go
@@ -21,6 +21,7 @@ package metrics
 
 import (
+	"strconv"
 	"time"
 
 	"github.com/uber-go/tally"
@@ -110,6 +111,28 @@ func (m *metricsScope) RecordHistogramValue(id int, value float64) {
 	}
 }
 
+func (m *metricsScope) ExponentialHistogram(id int, value time.Duration) {
+	def := m.defs[id]
+	m.scope.Tagged(
+		map[string]string{
+			"histogram_start": def.exponentialBuckets.start().String(),
+			"histogram_end":   def.exponentialBuckets.end().String(),
+			"histogram_scale": strconv.Itoa(def.exponentialBuckets.histScale()),
+		},
+	).Histogram(def.metricName.String(), def.exponentialBuckets.buckets()).RecordDuration(value)
+}
+
+func (m *metricsScope) IntExponentialHistogram(id int, value int) {
+	def := m.defs[id]
+	m.scope.Tagged(
+		map[string]string{
+			"histogram_start": def.intExponentialBuckets.start().String(),
+			"histogram_end":   def.intExponentialBuckets.end().String(),
+			"histogram_scale": strconv.Itoa(def.intExponentialBuckets.histScale()),
+		},
+	).Histogram(def.metricName.String(), def.intExponentialBuckets.buckets()).RecordValue(float64(value))
+}
+
 func (m *metricsScope) Tagged(tags ...Tag) Scope {
 	domainTagged := m.isDomainTagged
 	tagMap := make(map[string]string, len(tags))
diff --git a/internal/tools/metricslint/analyzer.go b/internal/tools/metricslint/analyzer.go
new file mode 100644
index 00000000000..e5cb925140d
--- /dev/null
+++ b/internal/tools/metricslint/analyzer.go
@@ -0,0 +1,263 @@
+package metricslint
+
+import (
+	"go/ast"
+	"go/token"
+	"go/types"
+	"strings"
+
+	"golang.org/x/tools/go/analysis"
+	"golang.org/x/tools/go/analysis/passes/inspect"
+	"golang.org/x/tools/go/ast/inspector"
+)
+
+const (
+	scopePkg      = "github.com/uber/cadence/common/metrics"
+	scopeTypeName = scopePkg + ".Scope"
+)
+
+var scopeMethods = map[string]bool{
+	"ExponentialHistogram":    true,
+	"IntExponentialHistogram": true,
+
"IncCounter": true, + "AddCounter": true, + "UpdateGauge": true, + "StartTimer": true, + "RecordTimer": true, + "RecordHistogramDuration": true, + "RecordHistogramValue": true, +} + +// Analyzer reports all (inline string) metric names passed to Emitter methods or defined in metrics/defs.go. +// +// This is NOT intended to be used directly, it's just using the analysis framework to simplify type checking. +// The output needs to be checked for duplicates or error messages by a separate process (see cmd/main.go). +var Analyzer = &analysis.Analyzer{ + Name: "metricnames", + Doc: "finds metricDefinition defs and uses, and reports them so they can be post-processed for uniqueness", + Requires: []*analysis.Analyzer{inspect.Analyzer}, + Run: run, +} + +func run(pass *analysis.Pass) (interface{}, error) { + i := pass.ResultOf[inspect.Analyzer].(*inspector.Inspector) + + if pass.Pkg.Path() == scopePkg { + // report all hard-coded metricDefinition names, so we can make sure + // there are no duplicates between the integers and the const strings + // passed to the emitter. + // + // if you want a duplicate: no, you really don't. + // use a different name, and once verified just swap to the old name and + // delete the integer def. otherwise you risk double-counting events if + // both are emitted, and can't tell if your tags are wrong. + // + // if you're swapping calls in one step, also delete the old definition. + // + // if you really do have an exception, modify this linter to allow it. + reportMetricDefinitionNames(pass, i) + // and DO NOT return, in case there are emitter calls in this package too, + // though currently that would imply an import loop. 
+ } + + // print calls to metrics-emitting methods on Emitter, regardless of location + reportScopeCalls(pass, i) + + return nil, nil +} + +func reportMetricDefinitionNames(pass *analysis.Pass, i *inspector.Inspector) { + nodeFilter := []ast.Node{ + (*ast.CompositeLit)(nil), + } + i.Preorder(nodeFilter, func(n ast.Node) { + decl := n.(*ast.CompositeLit) + var metricNameField *ast.KeyValueExpr + var histogramType string + for _, el := range decl.Elts { + kv, ok := el.(*ast.KeyValueExpr) + if !ok { + return // not a struct initializer + } + ident, ok := kv.Key.(*ast.Ident) + if !ok { + return // dynamic key in a dict or something, ignore + } + switch ident.Name { + case "metricName": + metricNameField = kv // currently this is exclusively used by metricDefinition + case "exponentialHistogram": + if histogramType != "" { + pass.Reportf(kv.Pos(), "only one of exponentialHistogram or intExponentialHistogram can be set") + } + histogramType = "duration" + case "intExponentialHistogram": + if histogramType != "" { + pass.Reportf(kv.Pos(), "only one of exponentialHistogram or intExponentialHistogram can be set") + } + histogramType = "int" + } + } + if metricNameField == nil { + return + } + str, ok := getConstantString(metricNameField.Value) + if !ok { + pass.Reportf(metricNameField.Pos(), "metricDefinition{metricName: ...} value must be an inline string literal") + return + } + switch histogramType { + case "duration": + if !strings.HasSuffix(str, "_ns") { + pass.Reportf(metricNameField.Pos(), "exponential histogram metric names must end with _ns: %q", str) + return + } + case "int": + if !strings.HasSuffix(str, "_counts") { + pass.Reportf(metricNameField.Pos(), "int-exponential histogram metric names must end with _counts: %q", str) + return + } + } + pass.Reportf(metricNameField.Pos(), "success: %v", str) + }) +} + +func reportScopeCalls(pass *analysis.Pass, i *inspector.Inspector) { + nodeFilter := []ast.Node{ + (*ast.File)(nil), // to skip test files + 
(*ast.FuncDecl)(nil), // to skip test helper funcs + (*ast.CallExpr)(nil), + } + i.Nodes(nodeFilter, func(n ast.Node, push bool) (proceed bool) { + // always descend by default, in case the call contains a closure that emits metrics. + // this covers the sync.Once case in the testdata, for example. + proceed = true + if !push { + return // do nothing when ascending the ast tree + } + + // check for test files, ignore their content if found + file, ok := n.(*ast.File) + if ok { + filename := pass.Fset.Position(file.Pos()).Filename + if strings.HasSuffix(filename, "_test.go") { + return false // don't inspect test files + } + return + } + + // check for test helper funcs anywhere, ignore their content if found. + // these are identified by a *testing.T as their first arg. + if fn, ok := n.(*ast.FuncDecl); ok { + if len(fn.Type.Params.List) > 0 { + firstArgType := pass.TypesInfo.TypeOf(fn.Type.Params.List[0].Type) + asStr := types.TypeString(firstArgType, nil) // "full/path/to.Type" + if asStr == "*testing.T" { + return false // don't inspect test helpers + } + } + return + } + + call := n.(*ast.CallExpr) // only other possibility due to nodeFilter + + // check if this is a method call (receiver.Method() == X.Sel) + // for one of the names we care about (as an early / efficient check) + sel, ok := call.Fun.(*ast.SelectorExpr) + if !ok { + return + } + methodName := sel.Sel.Name + if !scopeMethods[methodName] { + return + } + + // check if the receiver type is one we care about + receiver := pass.TypesInfo.TypeOf(sel.X) + // pointer receivers are not allowed, complain if found. + // if we just ignore them here, we might miss a metric call with a duplicate name. + named, ok := receiver.(*types.Named) + if !ok { + if _, isPtr := receiver.(*types.Pointer); isPtr { + // ignore pointer use, e.g. 
this occurs on the impl of scope itself + return + } + // not named and not pointer == allowed by the type system, but should be impossible in a CallExpr + pass.Reportf(sel.Pos(), "anonymous receiver of a method call should be impossible?") + return + } + fullName := getPkgAndName(named) + if fullName != scopeTypeName { + return // apparently not the scope, it just has similarly-named methods. ignore it. + } + + // at this point we know that it's "a metrics method", make sure it's valid. + + if len(call.Args) == 0 { + // 0 args == not currently in use + pass.Reportf(call.Pos(), "method call %v looks like a metrics method, but has too few args", call) + return + } + + filename := pass.Fset.Position(call.Pos()).Filename + if strings.HasSuffix(filename, "common/util.go") { + // util function that checks key lengths + return + } else if strings.HasSuffix(filename, "service/worker/archiver/replay_metrics_client.go") { + // a mock, ignore the metric-ID variable use + return + } else if strings.HasSuffix(filename, "service/history/workflowcache/cache.go") { + // reasonable usage, internal+external passed in. maybe worth changing later. + return + } else if strings.HasSuffix(filename, "service/history/workflowcache/metrics.go") { + // reasonable usage, internal+external passed in. maybe worth changing later. + return + } + + // pull out the first arg's text and report it for duplicate-checking + nameArg := call.Args[0] + nameArgText := types.ExprString(nameArg) // reconstructs the expression, but should be fine + if !strings.HasPrefix(nameArgText, "metrics.") { + pass.Reportf(nameArg.Pos(), "first argument to %v must be a metrics constant, not %v", methodName, nameArgText) + return + } + + // valid call! + // + // "report" the data we've found so it can be checked with a separate process (see cmd/main.go). + // that will error if any non-"success" lines were found, else check that the list is unique. 
+ // + // tests may lead to the same package being processed more than once, but + // the reported lines for "path/to/file:line success: {name}" are still unique + // and that won't count as a duplicate. + pass.Reportf(call.Pos(), "success: %s", nameArgText) + return + }) +} + +func getPkgAndName(named *types.Named) (fullName string) { + obj := named.Obj() + if obj == nil { + // afaik not possible, this would be "a named type without a type" + return "" + } + pkg := obj.Pkg() + if pkg == nil { + // given where this is used, I believe this would currently imply + // "a builtin type with a method", which does not exist. + // but it's fine to return partial values, in case it's used in more places. + return obj.Name() + } + return pkg.Path() + "." + obj.Name() +} + +func getConstantString(expr ast.Expr) (str string, ok bool) { + lit, ok := expr.(*ast.BasicLit) + if !ok { + return "", false + } + if lit.Kind != token.STRING { + return "", false + } + return strings.Trim(lit.Value, `"`), true +} diff --git a/internal/tools/metricslint/analyzer_test.go b/internal/tools/metricslint/analyzer_test.go new file mode 100644 index 00000000000..22d1c9c1696 --- /dev/null +++ b/internal/tools/metricslint/analyzer_test.go @@ -0,0 +1,12 @@ +package metricslint + +import ( + "testing" + + "golang.org/x/tools/go/analysis/analysistest" +) + +func TestAnalyzer(t *testing.T) { + testdata := analysistest.TestData() + analysistest.Run(t, testdata, Analyzer, "github.com/uber/cadence/common/metrics") +} diff --git a/internal/tools/metricslint/cmd/main.go b/internal/tools/metricslint/cmd/main.go new file mode 100644 index 00000000000..9547c4f7516 --- /dev/null +++ b/internal/tools/metricslint/cmd/main.go @@ -0,0 +1,117 @@ +package main + +import ( + "flag" + "fmt" + "os" + "os/exec" + "slices" + "strconv" + "strings" + + "golang.org/x/tools/go/analysis/singlechecker" + + "github.com/uber/cadence/internal/tools/metricslint" +) + +func main() { + var analyze bool + flag.BoolVar(&analyze, 
"analyze", false, "if true, run the analyzer normally (e.g. for debugging). if false, run the analyzer internally and interpret the results as a linter") + skip := map[string]int{} + flag.Func("skip", "metric name to ignore, comma, followed by the number of times it is duplicated. repeat to skip more things.", func(s string) error { + parts := strings.SplitN(s, ",", 2) + if len(parts) != 2 { + return fmt.Errorf("skip argument %q is not in the form `name,count`", s) + } + count, err := strconv.Atoi(parts[1]) + if err != nil || count < 2 { + return fmt.Errorf("skip argument %q has invalid count %q: %v", s, parts[1], err) + } + skip[parts[0]] = count + return nil + }) + flag.Parse() + + if analyze { + singlechecker.Main(metricslint.Analyzer) + } else { + os.Exit(checkAnalyzerOutput(skip)) + } +} + +func checkAnalyzerOutput(skip map[string]int) (exitCode int) { + cmd := exec.Command(os.Args[0], append([]string{"-analyze"}, os.Args[1:]...)...) + out, _ := cmd.CombinedOutput() + + defer func() { + // in case of crashes, print the output we got so far as it likely narrows down the cause. 
+ if r := recover(); r != nil { + _, _ = fmt.Fprint(os.Stderr, "\n\n"+string(out)) + panic(r) + } + }() + + lines := strings.Split(string(out), "\n") + // map of metric name to set of lines using it (to deduplicate) + names := make(map[string]map[string]struct{}, len(lines)) + var failures []string + for _, line := range lines { + if line == "" { + continue // empty lines are fine + } + words := strings.Fields(line) + if len(words) == 3 && words[1] == "success:" { + prev, ok := names[words[2]] + if !ok { + prev = make(map[string]struct{}) + } + prev[line] = struct{}{} + names[words[2]] = prev + } else { + failures = append(failures, line) + } + } + + if len(failures) > 0 { + _, _ = fmt.Fprintln(os.Stderr, strings.Join(failures, "\n")+"\n") + return 1 + } + + exitCode = 0 + sortedNames := make([]string, 0, len(names)) + for name := range names { + sortedNames = append(sortedNames, name) + } + slices.Sort(sortedNames) + for _, name := range sortedNames { + usages := names[name] + var complain bool + expected, ok := skip[name] + if ok { + complain = len(usages) != expected // expectation mismatch + } else { + complain = len(usages) > 1 // must not have duplicates + } + if complain { + if expected > 0 { + _, _ = fmt.Fprintf(os.Stderr, "Metric name %q used in an unexpected number of places:\n", name) + } else { + _, _ = fmt.Fprintf(os.Stderr, "Metric name %q used in multiple places:\n", name) + } + // frustratingly: this can't use `maps.Keys` because it + // returns an iterator, not a slice, so it can't be sorted or `...`-spread. + // bleh. 
+			sorted := make([]string, 0, len(usages))
+			for usage := range usages {
+				sorted = append(sorted, usage)
+			}
+			slices.Sort(sorted)
+			for _, usage := range sorted {
+				_, _ = fmt.Fprintf(os.Stderr, "\t%s\n", usage)
+			}
+			exitCode = 1
+		}
+	}
+
+	return exitCode
+}
diff --git a/internal/tools/metricslint/testdata/go.mod b/internal/tools/metricslint/testdata/go.mod
new file mode 100644
index 00000000000..216898fe865
--- /dev/null
+++ b/internal/tools/metricslint/testdata/go.mod
@@ -0,0 +1,4 @@
+module github.com/uber/cadence/common/metrics
+// ^ must be the same package as the scope and metricDefinition's definition for the hardcoded path to work right
+
+go 1.24
\ No newline at end of file
diff --git a/internal/tools/metricslint/testdata/metrics.go b/internal/tools/metricslint/testdata/metrics.go
new file mode 100644
index 00000000000..02384bf0fce
--- /dev/null
+++ b/internal/tools/metricslint/testdata/metrics.go
@@ -0,0 +1,28 @@
+package metrics
+
+import (
+	"time"
+)
+
+type Scope interface {
+	IncCounter(counter int)
+	AddCounter(counter int, delta int64)
+	StartTimer(timer int) Stopwatch
+	RecordTimer(timer int, d time.Duration)
+	RecordHistogramDuration(timer int, d time.Duration)
+	RecordHistogramValue(timer int, value float64)
+	ExponentialHistogram(hist int, d time.Duration)
+	IntExponentialHistogram(hist int, value int)
+	UpdateGauge(gauge int, value float64)
+	Tagged(tags ...Tag) Scope
+}
+type Tag struct{}
+type Stopwatch interface{}
+
+func do() {
+	var scope Scope
+
+	scope.IncCounter(123)
+	scope.IncCounter(123)
+	scope.ExponentialHistogram(123, 0)
+}
diff --git a/internal/tools/metricslint/testdata/metrics_gen.go b/internal/tools/metricslint/testdata/metrics_gen.go
new file mode 100644
index 00000000000..647411c8d4a
--- /dev/null
+++ b/internal/tools/metricslint/testdata/metrics_gen.go
@@ -0,0 +1,11 @@
+package metrics
+
+// Code generated by test_shenanigans; DO NOT EDIT
+
+func GeneratedFunction() {
+	var scope Scope
+
+	scope.IncCounter(123)
+
scope.IncCounter(123) + scope.ExponentialHistogram(123, 0) +} diff --git a/internal/tools/metricslint/testdata/metrics_test.go b/internal/tools/metricslint/testdata/metrics_test.go new file mode 100644 index 00000000000..a367d366dd6 --- /dev/null +++ b/internal/tools/metricslint/testdata/metrics_test.go @@ -0,0 +1,11 @@ +package metrics + +func ignoredInTestFiles() { + var scope Scope + + // test code is allowed to break the rules because it does not run in production + + scope.IncCounter(123) + scope.IncCounter(123) + scope.ExponentialHistogram(123, 0) +} diff --git a/revive.toml b/revive.toml index 2238d4dd4c9..d1b939df7af 100644 --- a/revive.toml +++ b/revive.toml @@ -20,7 +20,7 @@ warningCode = 0 [rule.errorf] # [rule.exported] # disabled due to lack of value / encouraging bad habits [rule.if-return] -[rule.increment-decrement] +# [rule.increment-decrement] # noisy on generated code, otherwise seems fine - if there's a way to fix that, probably re-enable [rule.indent-error-flow] # Disabled because we have 158 packages that need package comments; we could instead add ignore # directives for existing packages and require it for new packages. @@ -62,7 +62,7 @@ arguments=[["loop","method-call","recover","return", "immediate-recover"]] [rule.range-val-address] # beneficial [rule.range-val-in-closure] # beneficial [rule.unconditional-recursion] # probably a good idea -[rule.unreachable-code] # code simplifier +# [rule.unreachable-code] # reasonable code simplifier, but does not match compiler behavior (e.g. 
log.Fatal still needs panic or return) [rule.waitgroup-by-value] # correct use of sync code, important #### unused utilities diff --git a/service/history/replication/task_processor.go b/service/history/replication/task_processor.go index 2e0c9416950..cc67eaaae42 100644 --- a/service/history/replication/task_processor.go +++ b/service/history/replication/task_processor.go @@ -477,6 +477,14 @@ func (p *taskProcessorImpl) processTaskOnce(replicationTask *types.ReplicationTa } mScope = mScope.Tagged(metrics.DomainTag(domainName)) } + // emit single task processing latency + mScope.ExponentialHistogram(metrics.ExponentialTaskProcessingLatency, now.Sub(startTime)) + // emit latency from task generated to task received + mScope.ExponentialHistogram( + metrics.ExponentialReplicationTaskLatency, + now.Sub(time.Unix(0, replicationTask.GetCreationTime())), + ) + // emit single task processing latency mScope.RecordTimer(metrics.TaskProcessingLatency, now.Sub(startTime)) // emit latency from task generated to task received diff --git a/service/worker/archiver/replay_metrics_client.go b/service/worker/archiver/replay_metrics_client.go index 6ec7e1c94ce..11c42ad3924 100644 --- a/service/worker/archiver/replay_metrics_client.go +++ b/service/worker/archiver/replay_metrics_client.go @@ -158,6 +158,20 @@ func (r *replayMetricsScope) RecordHistogramValue(timer int, value float64) { r.scope.RecordHistogramValue(timer, value) } +func (r *replayMetricsScope) ExponentialHistogram(hist int, d time.Duration) { + if workflow.IsReplaying(r.ctx) { + return + } + r.scope.ExponentialHistogram(hist, d) +} + +func (r *replayMetricsScope) IntExponentialHistogram(hist int, value int) { + if workflow.IsReplaying(r.ctx) { + return + } + r.scope.IntExponentialHistogram(hist, value) +} + // UpdateGauge reports Gauge type absolute value metric func (r *replayMetricsScope) UpdateGauge(gauge int, value float64) { if workflow.IsReplaying(r.ctx) {