Skip to content

Commit 9fdcd4f

Browse files
authored
Add a structured error for non-determinism failures (#1269)
This is being added for a few reasons: 1. it's useful information, and an unstructured error has no way to safely communicate it 2. previously these were un-typed errors, so this custom type can be added without breaking backwards compatibility 3. we want to use it to expose more information about shadow-test failures, which are currently far more opaque than they need to be Shadow tests are opaque for a lot of reasons that need to be improved, but enhancing the error is a pretty straightforward first step. And it seems useful in general. This intentionally does *not* change the `.Error()` text, largely just because I didn't see any reason to do so. The error strings are reasonably well-known and there doesn't seem to be any major benefit that can be gained by improving them... currently. That said, these strings *must not* be considered stable in general, and we should consider changing them in the future.
1 parent c0f5bb0 commit 9fdcd4f

File tree

6 files changed

+112
-15
lines changed

6 files changed

+112
-15
lines changed

error.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ type (
3232

3333
// CanceledError returned when operation was canceled.
3434
CanceledError = internal.CanceledError
35+
36+
// NonDeterministicError is returned when a workflow's replay was non-deterministic, and it could not be resumed safely.
37+
NonDeterministicError = internal.NonDeterministicError
3538
)
3639

3740
// ErrNoData is returned when trying to extract strong typed data while there is no data available.

internal/client.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,14 @@ import (
2626
"fmt"
2727
"time"
2828

29-
"go.uber.org/cadence/internal/common/isolationgroup"
30-
3129
"github.com/opentracing/opentracing-go"
3230
"github.com/uber-go/tally"
3331
"go.uber.org/zap"
3432

3533
"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
3634
s "go.uber.org/cadence/.gen/go/shared"
3735
"go.uber.org/cadence/internal/common/auth"
36+
"go.uber.org/cadence/internal/common/isolationgroup"
3837
"go.uber.org/cadence/internal/common/metrics"
3938
)
4039

internal/common/isolationgroup/service_wrapper_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,14 +25,12 @@ import (
2525
"testing"
2626
"time"
2727

28-
"github.com/stretchr/testify/assert"
29-
30-
"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
31-
3228
"github.com/golang/mock/gomock"
29+
"github.com/stretchr/testify/assert"
3330
"github.com/stretchr/testify/suite"
3431
"github.com/uber/tchannel-go/thrift"
3532

33+
"go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
3634
"go.uber.org/cadence/.gen/go/cadence/workflowservicetest"
3735
"go.uber.org/cadence/.gen/go/shared"
3836
)

internal/error.go

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"strings"
2929

3030
"go.uber.org/cadence/.gen/go/shared"
31+
"go.uber.org/cadence/internal/common/util"
3132
)
3233

3334
/*
@@ -125,6 +126,51 @@ type (
125126
stackTrace string
126127
}
127128

129+
// NonDeterministicError contains some structured data related to a non-deterministic
130+
// replay failure, and is primarily intended for allowing richer error reporting.
131+
//
132+
// WorkflowType, WorkflowID, RunID, TaskList, and DomainName will likely be long-term stable
133+
// and included in some form in future library versions, but the rest of these fields may
134+
// change at any time, or be removed in a future major version change.
135+
NonDeterministicError struct {
136+
137+
// Reason is a relatively free-form description of what kind of non-determinism
138+
// was detected.
139+
//
140+
// You are *strongly* encouraged to not rely on these strings for behavior, only
141+
// explanation, for a few reasons. More will likely appear in the future, they may
142+
// change, and there is little that can be safely decided on in an automated way.
143+
//
144+
// Currently, values roughly match the historical error strings, and are:
145+
// - "missing replay decision" (The error will contain HistoryEventText, as there
146+
// is at least one history event that has no matching replayed decision)
147+
// - "extra replay decision" (The error will contain DecisionText, as there is
148+
// at least one decision from replay that has no matching history event)
149+
// - "mismatch" (Both HistoryEventText and DecisionText will exist, as there
150+
// are issues with both. This was previously shown as "history event is ...,
151+
// replay decision is ..." error text.)
152+
Reason string
153+
154+
WorkflowType string
155+
WorkflowID string
156+
RunID string
157+
TaskList string
158+
DomainName string
159+
160+
// intentionally avoiding "history event" and "decision" names
161+
// because we *do* have types for them, but they are in thrift and should
162+
// not be exposed directly.
163+
// we should consider doing that eventually though, or providing a
164+
// simplified object for richer failure information.
165+
166+
// HistoryEventText contains a String() representation of a history
167+
// event (i.e. previously recorded) that is related to the problem.
168+
HistoryEventText string
169+
// DecisionText contains a String() representation of a replay decision
170+
// event (i.e. created during replay) that is related to the problem.
171+
DecisionText string
172+
}
173+
128174
// ContinueAsNewError contains information about how to continue the workflow as new.
129175
ContinueAsNewError struct {
130176
wfn interface{}
@@ -419,3 +465,57 @@ func (b ErrorDetailsValues) Get(valuePtr ...interface{}) error {
419465
}
420466
return nil
421467
}
468+
469+
// NewNonDeterminsticError constructs a new *NonDeterministicError.
470+
//
471+
// - reason should be a documented NonDeterminsticError.Reason value
472+
// - info is always required. only a portion of it is used, but it is a convenient
473+
// and currently always-available object.
474+
// - history and decision may each be present or nil at any time
475+
func NewNonDeterminsticError(reason string, info *WorkflowInfo, history *shared.HistoryEvent, decision *shared.Decision) error {
476+
var historyText string
477+
if history != nil {
478+
historyText = util.HistoryEventToString(history)
479+
}
480+
var decisionText string
481+
if decision != nil {
482+
decisionText = util.DecisionToString(decision)
483+
}
484+
return &NonDeterministicError{
485+
Reason: reason,
486+
487+
WorkflowType: info.WorkflowType.Name,
488+
WorkflowID: info.WorkflowExecution.ID,
489+
RunID: info.WorkflowExecution.RunID,
490+
TaskList: info.TaskListName,
491+
DomainName: info.Domain,
492+
493+
HistoryEventText: historyText,
494+
DecisionText: decisionText,
495+
}
496+
}
497+
498+
func (e *NonDeterministicError) Error() string {
499+
switch e.Reason {
500+
case "missing replay decision":
501+
// historical text
502+
return "nondeterministic workflow: " +
503+
"missing replay decision for " + e.HistoryEventText
504+
case "extra replay decision":
505+
// historical text
506+
return "nondeterministic workflow: " +
507+
"extra replay decision for " + e.DecisionText
508+
case "mismatch":
509+
// historical text
510+
return "nondeterministic workflow: " +
511+
"history event is " + e.HistoryEventText + ", " +
512+
"replay decision is " + e.DecisionText
513+
default:
514+
// should not occur in practice, but it's basically fine if it does.
515+
// ideally this should crash in internal builds / tests, to prevent mismatched values.
516+
return fmt.Sprintf(
517+
"unknown reason %q, history event is: %s, replay decision is: %s",
518+
e.Reason, e.HistoryEventText, e.DecisionText,
519+
)
520+
}
521+
}

internal/internal_task_handlers.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -932,7 +932,7 @@ ProcessEvents:
932932
var nonDeterministicErr error
933933
if !skipReplayCheck && !w.isWorkflowCompleted || isReplayTest {
934934
// check if decisions from reply matches to the history events
935-
if err := matchReplayWithHistory(replayDecisions, respondEvents); err != nil {
935+
if err := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents); err != nil {
936936
nonDeterministicErr = err
937937
}
938938
}
@@ -947,7 +947,7 @@ ProcessEvents:
947947
nonDeterministicErr = panicErr
948948
} else {
949949
// Since we know there is an error, we do the replay check to give more context in the log
950-
replayErr := matchReplayWithHistory(replayDecisions, respondEvents)
950+
replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents)
951951
w.wth.logger.Error("Ignored workflow panic error",
952952
zap.String(tagWorkflowType, task.WorkflowType.GetName()),
953953
zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),

internal/workflow_replayer_utils.go

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,13 @@ package internal
2222

2323
import (
2424
"bytes"
25-
"fmt"
2625
"reflect"
2726
"strings"
2827

2928
s "go.uber.org/cadence/.gen/go/shared"
30-
"go.uber.org/cadence/internal/common/util"
3129
)
3230

33-
func matchReplayWithHistory(replayDecisions []*s.Decision, historyEvents []*s.HistoryEvent) error {
31+
func matchReplayWithHistory(info *WorkflowInfo, replayDecisions []*s.Decision, historyEvents []*s.HistoryEvent) error {
3432
di := 0
3533
hi := 0
3634
hSize := len(historyEvents)
@@ -60,16 +58,15 @@ matchLoop:
6058
}
6159

6260
if d == nil {
63-
return fmt.Errorf("nondeterministic workflow: missing replay decision for %s", util.HistoryEventToString(e))
61+
return NewNonDeterminsticError("missing replay decision", info, e, nil)
6462
}
6563

6664
if e == nil {
67-
return fmt.Errorf("nondeterministic workflow: extra replay decision for %s", util.DecisionToString(d))
65+
return NewNonDeterminsticError("extra replay decision", info, nil, d)
6866
}
6967

7068
if !isDecisionMatchEvent(d, e, false) {
71-
return fmt.Errorf("nondeterministic workflow: history event is %s, replay decision is %s",
72-
util.HistoryEventToString(e), util.DecisionToString(d))
69+
return NewNonDeterminsticError("mismatch", info, e, d)
7370
}
7471

7572
di++

0 commit comments

Comments
 (0)