@@ -719,6 +719,14 @@ ProcessEvents:
719719 }
720720 }
721721
722+ // A non-deterministic error can happen in 2 different places:
723+ // 1) the replay decisions do not match the history events. This is usually due to a non-backwards-compatible code
724+ // change to the decider logic, for example, changing a call from one activity to a different activity.
725+ // 2) the decision state machine tries to make an illegal state transition while replaying a history event (like
726+ // activity task completed), but the corresponding decider code that started the event has been removed. In that case
727+ // the replay of that event will panic in the decision state machine and the workflow will be marked as completed
728+ // with the panic error.
729+ var nonDeterministicErr error
722730 if ! skipReplayCheck && ! w .isWorkflowCompleted {
723731 // check if the decisions from replay match the history events
724732 if err := matchReplayWithHistory (replayDecisions , respondEvents ); err != nil {
@@ -728,29 +736,32 @@ ProcessEvents:
728736 zap .String (tagWorkflowID , task .WorkflowExecution .GetWorkflowId ()),
729737 zap .String (tagRunID , task .WorkflowExecution .GetRunId ()),
730738 zap .Error (err ))
731-
732- // Whether or not we store the error in workflowContext.err makes
733- // a significant difference, to the point that it affects client's observable
734- // behavior as far as handling non-deterministic workflows.
735- //
736- // If we store it in workflowContext.err, the decision task completion code
737- // will pick up the error and correctly wrap it in the response request we sent back
738- // to the server, which in this case will contain a request to fail the workflow.
739- //
740- // If we simply return the error, the decision task completion code path will not
741- // execute at all, therefore, no response is sent back to the server and we will
742- // look like a decision task time out.
743- switch w .wth .nonDeterministicWorkflowPolicy {
744- case NonDeterministicWorkflowPolicyFailWorkflow :
745- eventHandler .Complete (nil , NewCustomError ("nondeterministic workflow" , err .Error ()))
746- case NonDeterministicWorkflowPolicyBlockWorkflow :
747- return nil , err
748- default :
749- panic (fmt .Sprintf ("unknown mismatched workflow history policy." ))
739+ nonDeterministicErr = err
740+ }
741+ }
742+ if nonDeterministicErr == nil && w .err != nil {
743+ if panicErr , ok := w .err .(* PanicError ); ok && panicErr .value != nil {
744+ if _ , isStateMachinePanic := panicErr .value .(stateMachineIllegalStatePanic ); isStateMachinePanic {
745+ nonDeterministicErr = panicErr
750746 }
751747 }
752748 }
753749
750+ if nonDeterministicErr != nil {
751+ switch w .wth .nonDeterministicWorkflowPolicy {
752+ case NonDeterministicWorkflowPolicyFailWorkflow :
753+ // completing the workflow with a custom error will fail the workflow
754+ eventHandler .Complete (nil , NewCustomError ("NonDeterministicWorkflowPolicyFailWorkflow" , nonDeterministicErr .Error ()))
755+ case NonDeterministicWorkflowPolicyBlockWorkflow :
756+ // returning the error here will be converted to DecisionTaskFailed the first time, and ignored on subsequent
757+ // attempts, which will cause DecisionTaskTimeout, and the server will retry forever until the issue is fixed or
758+ // the workflow times out.
759+ return nil , nonDeterministicErr
760+ default :
761+ panic (fmt .Sprintf ("unknown mismatched workflow history policy." ))
762+ }
763+ }
764+
754765 return w .CompleteDecisionTask (workflowTask , true ), nil
755766}
756767
0 commit comments