@@ -35,6 +35,7 @@ import (
    "time"

    "github.com/opentracing/opentracing-go"
+   "github.com/uber-go/tally"
    "go.uber.org/zap"

    "go.uber.org/cadence/.gen/go/cadence/workflowserviceclient"
@@ -674,12 +675,12 @@ func (wth *workflowTaskHandlerImpl) getOrCreateWorkflowContext(
    historyIterator HistoryIterator,
) (workflowContext *workflowExecutionContextImpl, err error) {
    metricsScope := wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName())
-   defer func() {
+   defer func(metricsScope tally.Scope) {
        if err == nil && workflowContext != nil && workflowContext.laTunnel == nil {
            workflowContext.laTunnel = wth.laTunnel
        }
        metricsScope.Gauge(metrics.StickyCacheSize).Update(float64(getWorkflowCache().Size()))
-   }()
+   }(metricsScope)

    runID := task.WorkflowExecution.GetRunId()

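The deferred cleanup above now takes `metricsScope` as an explicit argument instead of capturing it from the enclosing function. In Go, arguments to a deferred call are evaluated at the `defer` statement, while free variables in a closure are read only when the deferred call runs, so passing the scope in pins the per-workflow-type scope no matter how later code derives or reassigns scopes. A minimal, self-contained sketch of that distinction (names here are illustrative, not from this file):

package main

import "fmt"

func main() {
    label := "original"

    // The argument is evaluated here, at the point of the defer statement.
    defer func(l string) { fmt.Println("by argument:", l) }(label)

    // The free variable is read later, when the deferred call actually runs.
    defer func() { fmt.Println("by closure: ", label) }()

    label = "reassigned"
    // Deferred calls run last-in-first-out, so this prints:
    //   by closure:  reassigned
    //   by argument: original
}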
@@ -694,14 +695,13 @@ func (wth *workflowTaskHandlerImpl) getOrCreateWorkflowContext(
    if workflowContext != nil {
        workflowContext.Lock()
        // add new tag on metrics scope with workflow runtime length category
-       executionRuntimeType := workflowCategorizedByTimeout(workflowContext.workflowInfo.ExecutionStartToCloseTimeoutSeconds)
-       metricsScope = metricsScope.Tagged(map[string]string{tagworkflowruntimelength: executionRuntimeType})
+       scope := metricsScope.Tagged(map[string]string{tagWorkflowRuntimeLength: workflowCategorizedByTimeout(workflowContext)})
        if task.Query != nil && !isFullHistory {
            // query task and we have a valid cached state
-           metricsScope.Counter(metrics.StickyCacheHit).Inc(1)
+           scope.Counter(metrics.StickyCacheHit).Inc(1)
        } else if history.Events[0].GetEventId() == workflowContext.previousStartedEventID+1 {
            // non query task and we have a valid cached state
-           metricsScope.Counter(metrics.StickyCacheHit).Inc(1)
+           scope.Counter(metrics.StickyCacheHit).Inc(1)
        } else {
            // non query task and cached state is missing events, we need to discard the cached state and rebuild one.
            workflowContext.ResetIfStale(task, historyIterator)
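`Tagged` on a tally scope returns a new child scope rather than mutating its receiver, which is why the result is bound to a separate `scope` variable here: the original `metricsScope` stays untagged for the deferred sticky-cache gauge above. A small sketch using tally's in-memory test scope, with placeholder metric and tag names rather than the ones in this file:

package main

import (
    "fmt"

    "github.com/uber-go/tally"
)

func main() {
    base := tally.NewTestScope("demo", nil)

    // Tagged returns a child scope; base itself keeps its original tags.
    scope := base.Tagged(map[string]string{"workflow-runtime-length": "instant"})
    scope.Counter("sticky-cache-hit").Inc(1)
    base.Gauge("sticky-cache-size").Update(42)

    // Snapshot keys encode name and tags, so the counter shows up with the
    // child scope's tag while the gauge does not.
    for key := range base.Snapshot().Counters() {
        fmt.Println("counter:", key)
    }
    for key := range base.Snapshot().Gauges() {
        fmt.Println("gauge:  ", key)
    }
}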
@@ -949,39 +949,31 @@ ProcessEvents:
    // the replay of that event will panic on the decision state machine and the workflow will be marked as completed
    // with the panic error.
    var nonDeterministicErr error
+   var nonDeterminismType nonDeterminismDetectionType
    if !skipReplayCheck && !w.isWorkflowCompleted || isReplayTest {
        // check if decisions from reply matches to the history events
        if err := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents); err != nil {
            nonDeterministicErr = err
+           nonDeterminismType = nonDeterminismDetectionTypeReplayComparison
        }
-   }
-   if nonDeterministicErr == nil && w.err != nil {
-       if panicErr, ok := w.err.(*workflowPanicError); ok && panicErr.value != nil {
-           if _, isStateMachinePanic := panicErr.value.(stateMachineIllegalStatePanic); isStateMachinePanic {
-               if isReplayTest {
-                   // NOTE: we should do this regardless if it's in replay test or not
-                   // but since previously we checked the wrong error type, it may break existing customers workflow
-                   // the issue is that we change the error type and that we change the error message, the customers
-                   // are checking the error string - we plan to wrap all errors to avoid this issue in client v2
-                   nonDeterministicErr = panicErr
-               } else {
-                   // Since we know there is an error, we do the replay check to give more context in the log
-                   replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents)
-                   w.wth.logger.Error("Ignored workflow panic error",
-                       zap.String(tagWorkflowType, task.WorkflowType.GetName()),
-                       zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
-                       zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
-                       zap.Error(nonDeterministicErr),
-                       zap.NamedError("ReplayError", replayErr),
-                   )
-               }
-           }
-       }
+   } else if panicErr, ok := w.getWorkflowPanicIfIllegaleStatePanic(); ok {
+       // This is a nondeterministic execution which ended up panicking
+       nonDeterministicErr = panicErr
+       nonDeterminismType = nonDeterminismDetectionTypeIllegalStatePanic
+       // Since we know there is an error, we do the replay check to give more context in the log
+       replayErr := matchReplayWithHistory(w.workflowInfo, replayDecisions, respondEvents)
+       w.wth.logger.Error("Illegal state caused panic",
+           zap.String(tagWorkflowType, task.WorkflowType.GetName()),
+           zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
+           zap.String(tagRunID, task.WorkflowExecution.GetRunId()),
+           zap.Error(nonDeterministicErr),
+           zap.NamedError("ReplayError", replayErr),
+       )
    }

    if nonDeterministicErr != nil {
-
-       w.wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName()).Counter(metrics.NonDeterministicError).Inc(1)
+       scope := w.wth.metricsScope.GetTaggedScope(tagWorkflowType, task.WorkflowType.GetName(), tagNonDeterminismDetectionType, string(nonDeterminismType))
+       scope.Counter(metrics.NonDeterministicError).Inc(1)
        w.wth.logger.Error("non-deterministic-error",
            zap.String(tagWorkflowType, task.WorkflowType.GetName()),
            zap.String(tagWorkflowID, task.WorkflowExecution.GetWorkflowId()),
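The non-determinism counter is now tagged with `string(nonDeterminismType)`, which implies the detection type is a string-backed type with one constant per detection path (replay comparison vs. illegal-state panic). Those declarations live elsewhere in this change; a hypothetical sketch of their shape, for illustration only:

// Hypothetical; the real declarations are outside this hunk and may differ.
type nonDeterminismDetectionType string

const (
    nonDeterminismDetectionTypeReplayComparison  nonDeterminismDetectionType = "ReplayComparison"
    nonDeterminismDetectionTypeIllegalStatePanic nonDeterminismDetectionType = "IllegalStatePanic"
)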
@@ -998,7 +990,7 @@ ProcessEvents:
            // workflow timeout.
            return nil, nonDeterministicErr
        default:
-           panic(fmt.Sprintf("unknown mismatched workflow history policy."))
+           panic("unknown mismatched workflow history policy.")
        }
    }

@@ -1205,6 +1197,24 @@ func (w *workflowExecutionContextImpl) GetDecisionTimeout() time.Duration {
    return time.Second * time.Duration(w.workflowInfo.TaskStartToCloseTimeoutSeconds)
}

+func (w *workflowExecutionContextImpl) getWorkflowPanicIfIllegaleStatePanic() (*workflowPanicError, bool) {
+    if !w.isWorkflowCompleted || w.err == nil {
+        return nil, false
+    }
+
+    panicErr, ok := w.err.(*workflowPanicError)
+    if !ok || panicErr.value == nil {
+        return nil, false
+    }
+
+    _, ok = panicErr.value.(stateMachineIllegalStatePanic)
+    if !ok {
+        return nil, false
+    }
+
+    return panicErr, true
+}
+
func (wth *workflowTaskHandlerImpl) completeWorkflow(
    eventHandler *workflowExecutionEventHandlerImpl,
    task *s.PollForDecisionTaskResponse,
@@ -1312,7 +1322,7 @@ func (wth *workflowTaskHandlerImpl) completeWorkflow(

    if closeDecision != nil {
        decisions = append(decisions, closeDecision)
-       elapsed := time.Now().Sub(workflowContext.workflowStartTime)
+       elapsed := time.Since(workflowContext.workflowStartTime)
        metricsScope.Timer(metrics.WorkflowEndToEndLatency).Record(elapsed)
        forceNewDecision = false
    }
@@ -1845,7 +1855,8 @@ func traceLog(fn func()) {
    }
}

-func workflowCategorizedByTimeout(executionTimeout int32) string {
+func workflowCategorizedByTimeout(wfContext *workflowExecutionContextImpl) string {
+    executionTimeout := wfContext.workflowInfo.ExecutionStartToCloseTimeoutSeconds
    if executionTimeout <= defaultInstantLivedWorkflowTimeoutUpperLimitInSec {
        return "instant"
    } else if executionTimeout <= defaultShortLivedWorkflowTimeoutUpperLimitInSec {
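`workflowCategorizedByTimeout` now takes the whole execution context and reads the execution timeout itself; its result feeds the `tagWorkflowRuntimeLength` tag added earlier in this diff. Only the "instant" bucket is visible in this hunk; the threshold constants and the remaining categories are defined elsewhere. A standalone sketch of this kind of bucketing, with assumed limits and category names beyond the one shown above:

package main

import "fmt"

// Assumed thresholds for illustration; the real constants
// (defaultInstantLivedWorkflowTimeoutUpperLimitInSec, ...) are not shown in this diff.
const (
    instantUpperLimitSec int32 = 60           // up to 1 minute -> "instant"
    shortUpperLimitSec   int32 = 60 * 60 * 24 // up to 1 day    -> "short"
)

func categorizeByTimeout(executionTimeoutSec int32) string {
    switch {
    case executionTimeoutSec <= instantUpperLimitSec:
        return "instant"
    case executionTimeoutSec <= shortUpperLimitSec:
        return "short"
    default:
        return "long"
    }
}

func main() {
    for _, timeout := range []int32{30, 3600, 864000} {
        fmt.Printf("timeout %7ds -> %s\n", timeout, categorizeByTimeout(timeout))
    }
}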