Skip to content

Commit 7861bc3

Browse files
authored
Prune two workflow update related metrics in favor of logs w/ namespace (#9269)
## What changed? Remove `invalid_state_transition_workflow_update_message` and `workflow_update_registry_size_limited` metrics. Add the `namespace` tag to the logs/softasserts in the `instrumentation` methods the metrics are used for. ## Why? The logs will give us the same information and the namespace without cardinality concerns. These metrics are not expected to fire frequently. ## How did you test it? - [ ] built - [ ] run locally and tested manually - [ ] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s) ## Potential risks Minimal, metrics changes only.
1 parent 4ad1daa commit 7861bc3

File tree

4 files changed

+44
-35
lines changed

4 files changed

+44
-35
lines changed

common/metrics/metric_defs.go

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -882,35 +882,34 @@ var (
882882
"activity_schedule_to_close_latency",
883883
WithDescription("Duration of activity execution from scheduled time to terminal state. Includes retries and backoffs."),
884884
)
885-
ActivitySuccess = NewCounterDef("activity_success", WithDescription("Number of activities that succeeded (doesn't include retries)."))
886-
ActivityFail = NewCounterDef("activity_fail", WithDescription("Number of activities that failed and won't be retried anymore."))
887-
ActivityTaskFail = NewCounterDef("activity_task_fail", WithDescription("Number of activity task failures (includes retries)."))
888-
ActivityCancel = NewCounterDef("activity_cancel", WithDescription("Number of activities that are cancelled."))
889-
ActivityTerminate = NewCounterDef("activity_terminate", WithDescription("Number of activities that are terminated."))
890-
ActivityTaskTimeout = NewCounterDef("activity_task_timeout", WithDescription("Number of activity task timeouts (including retries)."))
891-
ActivityTimeout = NewCounterDef("activity_timeout", WithDescription("Number of terminal activity timeouts."))
892-
ActivityPayloadSize = NewCounterDef("activity_payload_size", WithDescription("Size of activity payloads in bytes."))
893-
AckLevelUpdateCounter = NewCounterDef("ack_level_update")
894-
AckLevelUpdateFailedCounter = NewCounterDef("ack_level_update_failed")
895-
CommandCounter = NewCounterDef("command")
896-
MessageTypeRequestWorkflowExecutionUpdateCounter = NewCounterDef("request_workflow_update_message")
897-
MessageTypeAcceptWorkflowExecutionUpdateCounter = NewCounterDef("accept_workflow_update_message")
898-
MessageTypeRespondWorkflowExecutionUpdateCounter = NewCounterDef("respond_workflow_update_message")
899-
MessageTypeRejectWorkflowExecutionUpdateCounter = NewCounterDef("reject_workflow_update_message")
900-
InvalidStateTransitionWorkflowExecutionUpdateCounter = NewCounterDef("invalid_state_transition_workflow_update_message")
901-
WorkflowExecutionUpdateRegistrySize = NewBytesHistogramDef("workflow_update_registry_size")
902-
WorkflowExecutionUpdateRegistrySizeLimited = NewCounterDef("workflow_update_registry_size_limited")
903-
WorkflowExecutionUpdateRequestRateLimited = NewCounterDef("workflow_update_request_rate_limited")
904-
WorkflowExecutionUpdateTooMany = NewCounterDef("workflow_update_request_too_many")
905-
WorkflowExecutionUpdateAborted = NewCounterDef("workflow_update_aborted")
906-
WorkflowExecutionUpdateSentToWorker = NewCounterDef("workflow_update_sent_to_worker")
907-
WorkflowExecutionUpdateSentToWorkerAgain = NewCounterDef("workflow_update_sent_to_worker_again")
908-
WorkflowExecutionUpdateWaitStageAccepted = NewCounterDef("workflow_update_wait_stage_accepted")
909-
WorkflowExecutionUpdateWaitStageCompleted = NewCounterDef("workflow_update_wait_stage_completed")
910-
WorkflowExecutionUpdateClientTimeout = NewCounterDef("workflow_update_client_timeout")
911-
WorkflowExecutionUpdateServerTimeout = NewCounterDef("workflow_update_server_timeout")
912-
SpeculativeWorkflowTaskCommits = NewCounterDef("speculative_workflow_task_commits")
913-
SpeculativeWorkflowTaskRollbacks = NewCounterDef("speculative_workflow_task_rollbacks")
885+
ActivitySuccess = NewCounterDef("activity_success", WithDescription("Number of activities that succeeded (doesn't include retries)."))
886+
ActivityFail = NewCounterDef("activity_fail", WithDescription("Number of activities that failed and won't be retried anymore."))
887+
ActivityTaskFail = NewCounterDef("activity_task_fail", WithDescription("Number of activity task failures (includes retries)."))
888+
ActivityCancel = NewCounterDef("activity_cancel", WithDescription("Number of activities that are cancelled."))
889+
ActivityTerminate = NewCounterDef("activity_terminate", WithDescription("Number of activities that are terminated."))
890+
ActivityTaskTimeout = NewCounterDef("activity_task_timeout", WithDescription("Number of activity task timeouts (including retries)."))
891+
ActivityTimeout = NewCounterDef("activity_timeout", WithDescription("Number of terminal activity timeouts."))
892+
ActivityPayloadSize = NewCounterDef("activity_payload_size", WithDescription("Size of activity payloads in bytes."))
893+
AckLevelUpdateCounter = NewCounterDef("ack_level_update")
894+
AckLevelUpdateFailedCounter = NewCounterDef("ack_level_update_failed")
895+
CommandCounter = NewCounterDef("command")
896+
MessageTypeRequestWorkflowExecutionUpdateCounter = NewCounterDef("request_workflow_update_message")
897+
MessageTypeAcceptWorkflowExecutionUpdateCounter = NewCounterDef("accept_workflow_update_message")
898+
MessageTypeRespondWorkflowExecutionUpdateCounter = NewCounterDef("respond_workflow_update_message")
899+
MessageTypeRejectWorkflowExecutionUpdateCounter = NewCounterDef("reject_workflow_update_message")
900+
WorkflowExecutionUpdateRegistrySize = NewBytesHistogramDef("workflow_update_registry_size")
901+
WorkflowExecutionUpdateRegistrySizeLimited = NewCounterDef("workflow_update_registry_size_limited")
902+
WorkflowExecutionUpdateRequestRateLimited = NewCounterDef("workflow_update_request_rate_limited")
903+
WorkflowExecutionUpdateTooMany = NewCounterDef("workflow_update_request_too_many")
904+
WorkflowExecutionUpdateAborted = NewCounterDef("workflow_update_aborted")
905+
WorkflowExecutionUpdateSentToWorker = NewCounterDef("workflow_update_sent_to_worker")
906+
WorkflowExecutionUpdateSentToWorkerAgain = NewCounterDef("workflow_update_sent_to_worker_again")
907+
WorkflowExecutionUpdateWaitStageAccepted = NewCounterDef("workflow_update_wait_stage_accepted")
908+
WorkflowExecutionUpdateWaitStageCompleted = NewCounterDef("workflow_update_wait_stage_completed")
909+
WorkflowExecutionUpdateClientTimeout = NewCounterDef("workflow_update_client_timeout")
910+
WorkflowExecutionUpdateServerTimeout = NewCounterDef("workflow_update_server_timeout")
911+
SpeculativeWorkflowTaskCommits = NewCounterDef("speculative_workflow_task_commits")
912+
SpeculativeWorkflowTaskRollbacks = NewCounterDef("speculative_workflow_task_rollbacks")
914913

915914
ActivityEagerExecutionCounter = NewCounterDef("activity_eager_execution")
916915
// WorkflowEagerExecutionCounter is emitted any time eager workflow start is requested.

service/history/workflow/context.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,7 @@ func (c *ContextImpl) UpdateRegistry(ctx context.Context) update.Registry {
968968

969969
c.updateRegistry = update.NewRegistry(
970970
c.MutableState,
971+
update.WithNamespace(nsName),
971972
update.WithLogger(c.logger),
972973
update.WithMetrics(c.metricsHandler),
973974
update.WithTracerProvider(trace.SpanFromContext(ctx).TracerProvider()),

service/history/workflow/update/registry.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,13 @@ func WithTotalLimitSuggestCAN(f func() float64) Option {
136136
}
137137
}
138138

139+
// WithNamespace sets the namespace name to be used in Registry metrics and logs.
140+
func WithNamespace(ns string) Option {
141+
return func(r *registry) {
142+
r.instrumentation.namespace = ns
143+
}
144+
}
145+
139146
// WithLogger sets the log.Logger to be used by Registry and its Updates.
140147
func WithLogger(l log.Logger) Option {
141148
return func(r *registry) {

service/history/workflow/update/util.go

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@ import (
1414

1515
type (
1616
instrumentation struct {
17-
log log.Logger
18-
metrics metrics.Handler
19-
tracer trace.Tracer
17+
log log.Logger
18+
metrics metrics.Handler
19+
tracer trace.Tracer
20+
namespace string
2021
}
2122
)
2223

@@ -50,11 +51,12 @@ func (i *instrumentation) countRateLimited() {
5051

5152
func (i *instrumentation) countRegistrySizeLimited(updateCount, registrySize, payloadSize int) {
5253
i.oneOf(metrics.WorkflowExecutionUpdateRegistrySizeLimited.Name())
53-
// TODO: remove log once limit is enforced everywhere
5454
i.log.Warn("update registry size limit reached",
5555
tag.Int("registry-size", registrySize),
5656
tag.Int("payload-size", payloadSize),
57-
tag.Int("update-count", updateCount))
57+
tag.Int("update-count", updateCount),
58+
tag.String("namespace", i.namespace),
59+
)
5860
}
5961

6062
func (i *instrumentation) countTooMany() {
@@ -79,14 +81,14 @@ func (i *instrumentation) countSentAgain() {
7981
}
8082

8183
func (i *instrumentation) invalidStateTransition(updateID string, msg proto.Message, state state) {
82-
i.oneOf(metrics.InvalidStateTransitionWorkflowExecutionUpdateCounter.Name())
8384
softassert.Fail(
8485
i.log,
8586
"invalid state transition attempted",
8687
tag.ComponentWorkflowUpdate,
8788
tag.String("update-id", updateID),
8889
tag.String("message", fmt.Sprintf("%T", msg)),
8990
tag.Stringer("state", state),
91+
tag.String("namespace", i.namespace),
9092
)
9193
}
9294

0 commit comments

Comments
 (0)