Skip to content

Commit ad91fc7

Browse files
authored
Better logging when getting some nondeterministic behaviours (#1245)
* Added a better log message for nondeterministic errors * Upped log level to Error as this is an error * Created test for the logging update * Fixed imports
1 parent 18c143a commit ad91fc7

File tree

2 files changed

+71
-1
lines changed

2 files changed

+71
-1
lines changed

internal/internal_task_handlers.go

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -944,13 +944,18 @@ ProcessEvents:
944944
if isReplayTest {
945945
// NOTE: we should do this regardless if it's in replay test or not
946946
// but since previously we checked the wrong error type, it may break existing customers workflow
947+
// the issue is that we change the error type and that we change the error message, the customers
948+
// are checking the error string - we plan to wrap all errors to avoid this issue in client v2
947949
nonDeterministicErr = panicErr
948950
} else {
949-
w.wth.logger.Warn("Ignored workflow panic error",
951+
// Since we know there is an error, we do the replay check to give more context in the log
952+
replayErr := matchReplayWithHistory(replayDecisions, respondEvents)
953+
w.wth.logger.Error("Ignored workflow panic error",
950954
zap.String(tagWorkflowType, task.WorkflowType.GetName()),
951955
zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
952956
zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
953957
zap.Error(nonDeterministicErr),
958+
zap.NamedError("ReplayError", replayErr),
954959
)
955960
}
956961
}

internal/internal_task_handlers_test.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ import (
3939
"github.com/stretchr/testify/suite"
4040
"go.uber.org/goleak"
4141
"go.uber.org/zap"
42+
"go.uber.org/zap/zapcore"
4243
"go.uber.org/zap/zaptest"
44+
"go.uber.org/zap/zaptest/observer"
4345

4446
"go.uber.org/cadence/.gen/go/cadence/workflowservicetest"
4547
s "go.uber.org/cadence/.gen/go/shared"
@@ -826,6 +828,69 @@ func (t *TaskHandlersTestSuite) TestWorkflowTask_NondeterministicDetection() {
826828
t.NotNil(request)
827829
}
828830

831+
func (t *TaskHandlersTestSuite) TestWorkflowTask_NondeterministicLogNonexistingID() {
832+
taskList := "taskList"
833+
testEvents := []*s.HistoryEvent{
834+
createTestEventWorkflowExecutionStarted(1, &s.WorkflowExecutionStartedEventAttributes{TaskList: &s.TaskList{Name: &taskList}}),
835+
createTestEventDecisionTaskScheduled(2, &s.DecisionTaskScheduledEventAttributes{TaskList: &s.TaskList{Name: &taskList}}),
836+
createTestEventDecisionTaskStarted(3),
837+
createTestEventDecisionTaskCompleted(4, &s.DecisionTaskCompletedEventAttributes{ScheduledEventId: common.Int64Ptr(2)}),
838+
createTestEventActivityTaskScheduled(5, &s.ActivityTaskScheduledEventAttributes{
839+
// Insert an ID which does not exist
840+
ActivityId: common.StringPtr("NotAnActivityID"),
841+
ActivityType: &s.ActivityType{Name: common.StringPtr("pkg.Greeter_Activity")},
842+
TaskList: &s.TaskList{Name: &taskList},
843+
}),
844+
}
845+
846+
obs, logs := observer.New(zap.ErrorLevel)
847+
logger := zap.New(obs)
848+
849+
task := createWorkflowTask(testEvents, 3, "HelloWorld_Workflow")
850+
stopC := make(chan struct{})
851+
params := workerExecutionParameters{
852+
TaskList: taskList,
853+
WorkerOptions: WorkerOptions{
854+
Identity: "test-id-1",
855+
Logger: logger,
856+
},
857+
WorkerStopChannel: stopC,
858+
}
859+
860+
taskHandler := newWorkflowTaskHandler(testDomain, params, nil, t.registry)
861+
request, err := taskHandler.ProcessWorkflowTask(&workflowTask{task: task}, nil)
862+
863+
t.NotNil(request)
864+
response := request.(*s.RespondDecisionTaskFailedRequest)
865+
866+
// NOTE: we might acctually want to return an error
867+
// but since previously we checked the wrong error type, it may break existing customers workflow
868+
// The issue is that we change the error type and that we change the error message, the customers
869+
// are checking the error string - we plan to wrap all errors to avoid this issue in client v2
870+
t.NoError(err)
871+
t.NotNil(response)
872+
873+
// Check that the error was logged
874+
ignoredWorkflowLogs := logs.FilterMessage("Ignored workflow panic error")
875+
require.Len(t.T(), ignoredWorkflowLogs.All(), 1)
876+
877+
// Find the ReplayError field
878+
withField := ignoredWorkflowLogs.All()[0]
879+
var replayErrorField *zapcore.Field
880+
for _, field := range withField.Context {
881+
if field.Key == "ReplayError" {
882+
replayErrorField = &field
883+
}
884+
}
885+
require.NotNil(t.T(), replayErrorField)
886+
require.Equal(t.T(), zapcore.ErrorType, replayErrorField.Type)
887+
require.ErrorContains(t.T(), replayErrorField.Interface.(error),
888+
"nondeterministic workflow: "+
889+
"history event is ActivityTaskScheduled: (ActivityId:NotAnActivityID, ActivityType:(Name:pkg.Greeter_Activity), TaskList:(Name:taskList), Input:[]), "+
890+
"replay decision is ScheduleActivityTask: (ActivityId:0, ActivityType:(Name:Greeter_Activity), TaskList:(Name:taskList)")
891+
892+
}
893+
829894
func (t *TaskHandlersTestSuite) TestWorkflowTask_WorkflowReturnsPanicError() {
830895
taskList := "taskList"
831896
testEvents := []*s.HistoryEvent{

0 commit comments

Comments
 (0)