@@ -30,6 +30,7 @@ import (
3030 "go.temporal.io/server/common/persistence"
3131 "go.temporal.io/server/common/persistence/visibility/manager"
3232 "go.temporal.io/server/common/searchattribute"
33+ "go.temporal.io/server/common/softassert"
3334 "go.temporal.io/server/common/tasktoken"
3435 "go.temporal.io/server/common/worker_versioning"
3536 "go.temporal.io/server/service/history/api"
@@ -129,6 +130,12 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
129130 metrics .StaleMutableStateCounter .With (handler .metricsHandler ).Record (
130131 1 ,
131132 metrics .OperationTag (metrics .HistoryRespondWorkflowTaskCompletedScope ))
133+ softassert .Sometimes (handler .logger ).Debug ("stale mutable state detected" ,
134+ tag .WorkflowID (token .GetWorkflowId ()),
135+ tag .WorkflowRunID (token .GetRunId ()),
136+ tag .WorkflowScheduledEventID (token .GetScheduledEventId ()),
137+ tag .NewInt64 ("mutable-state-next-event-id" , mutableState .GetNextEventID ()),
138+ )
132139 return false
133140 }
134141 return true
@@ -166,7 +173,7 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
166173 // This is NOT a 100% bulletproof solution because this write operation may also fail.
167174 // TODO: remove this call when GetWorkflowExecutionHistory includes speculative WFT events.
168175 if clearStickyErr := handler .clearStickyTaskQueue (ctx , workflowLease .GetContext ()); clearStickyErr != nil {
169- handler .logger .Error ("Failed to clear stickiness after speculative workflow task failed to complete." ,
176+ softassert . Sometimes ( handler .logger ) .Error ("Failed to clear stickiness after speculative workflow task failed to complete." ,
170177 tag .NewErrorTag ("clear-sticky-error" , clearStickyErr ),
171178 tag .Error (retError ),
172179 tag .WorkflowID (token .GetWorkflowId ()),
@@ -225,7 +232,7 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
225232 if retError != nil {
226233 cancelled := effects .Cancel (ctx )
227234 if cancelled {
228- handler .logger .Info ("Canceled effects due to error. " ,
235+ softassert . Sometimes ( handler .logger ) .Info ("Canceled effects due to error" ,
229236 tag .Error (retError ),
230237 tag .WorkflowID (token .GetWorkflowId ()),
231238 tag .WorkflowRunID (token .GetRunId ()),
@@ -284,6 +291,11 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
284291 metrics .NamespaceTag (nsName ),
285292 )
286293 metrics .WorkflowTaskHeartbeatTimeoutCounter .With (scope ).Record (1 )
294+ softassert .Sometimes (handler .logger ).Debug ("workflow task heartbeat timed out" ,
295+ tag .WorkflowNamespaceID (nsName ),
296+ tag .WorkflowID (token .GetWorkflowId ()),
297+ tag .WorkflowRunID (token .GetRunId ()),
298+ )
287299 completedEvent , err = ms .AddWorkflowTaskTimedOutEvent (currentWorkflowTask )
288300 if err != nil {
289301 return nil , err
@@ -338,6 +350,12 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
338350 // and admitted updates are lost. Uncomment this check when durable admitted is implemented
339351 // or updates stay in the registry after WFT is failed.
340352 hasBufferedEventsOrMessages := ms .HasBufferedEvents () // || updateRegistry.HasOutgoingMessages(false)
353+ if hasBufferedEventsOrMessages {
354+ softassert .Sometimes (handler .logger ).Debug ("workflow has buffered events/messages" ,
355+ tag .WorkflowID (token .GetWorkflowId ()),
356+ tag .WorkflowRunID (token .GetRunId ()),
357+ )
358+ }
341359 if err := namespaceEntry .VerifyBinaryChecksum (request .GetBinaryChecksum ()); err != nil {
342360 wtFailedCause = newWorkflowTaskFailedCause (
343361 enumspb .WORKFLOW_TASK_FAILED_CAUSE_BAD_BINARY ,
@@ -447,11 +465,14 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
447465 metrics .FailureTag (wtFailedCause .failedCause .String ()),
448466 metrics .FirstAttemptTag (currentWorkflowTask .Attempt ),
449467 )
450- handler .logger .Info ("Failing the workflow task." ,
468+ softassert . Sometimes ( handler .logger ) .Info ("Failing the workflow task." ,
451469 tag .Value (wtFailedCause .Message ()),
452470 tag .WorkflowID (token .GetWorkflowId ()),
453471 tag .WorkflowRunID (token .GetRunId ()),
454- tag .WorkflowNamespaceID (namespaceEntry .ID ().String ()))
472+ tag .WorkflowNamespaceID (namespaceEntry .ID ().String ()),
473+ tag .Attempt (currentWorkflowTask .Attempt ),
474+ tag .Cause (wtFailedCause .failedCause .String ()),
475+ )
455476 if currentWorkflowTask .Attempt > 1 && wtFailedCause .failedCause != enumspb .WORKFLOW_TASK_FAILED_CAUSE_UNHANDLED_COMMAND {
456477 // Drop this workflow task if it keeps failing. This will cause the workflow task to time out and get retried after the timeout.
457478 return nil , serviceerror .NewInvalidArgument (wtFailedCause .Message ())
@@ -620,6 +641,12 @@ func (handler *WorkflowTaskCompletedHandler) Invoke(
620641 // if updateErr resulted in TransactionSizeLimitError then fail workflow
621642 switch updateErr .(type ) {
622643 case * persistence.TransactionSizeLimitError :
644+ softassert .Sometimes (handler .logger ).Debug ("workflow terminated due to size limit" ,
645+ tag .WorkflowID (token .GetWorkflowId ()),
646+ tag .WorkflowRunID (token .GetRunId ()),
647+ tag .Error (updateErr ),
648+ )
649+
623650 // must reload mutable state because the first call to updateWorkflowExecutionWithContext or continueAsNewWorkflowExecution
624651 // clears mutable state if error is returned
625652 ms , err = weContext .LoadMutableState (ctx , handler .shardContext )
0 commit comments