Skip to content

Commit ee150af

Browse files
tczhaoagilgur5
authored andcommitted
fix: only evaluate retry expression for fail/error node. Fixes argoproj#13058 (argoproj#13165)
Signed-off-by: Tianchu Zhao <[email protected]> (cherry picked from commit b28486c)
1 parent 028f9ec commit ee150af

File tree

2 files changed

+127
-12
lines changed

2 files changed

+127
-12
lines changed

workflow/controller/operator.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -961,18 +961,6 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate
961961
return node, true, nil
962962
}
963963

964-
if retryStrategy.Expression != "" && len(childNodeIds) > 0 {
965-
localScope := buildRetryStrategyLocalScope(node, woc.wf.Status.Nodes)
966-
scope := env.GetFuncMap(localScope)
967-
shouldContinue, err := argoexpr.EvalBool(retryStrategy.Expression, scope)
968-
if err != nil {
969-
return nil, false, err
970-
}
971-
if !shouldContinue && lastChildNode.Fulfilled() {
972-
return woc.markNodePhase(node.Name, lastChildNode.Phase, "retryStrategy.expression evaluated to false"), true, nil
973-
}
974-
}
975-
976964
if lastChildNode == nil {
977965
return node, true, nil
978966
}
@@ -1100,6 +1088,18 @@ func (woc *wfOperationCtx) processNodeRetries(node *wfv1.NodeStatus, retryStrate
11001088
return woc.markNodePhase(node.Name, lastChildNode.Phase, "No more retries left"), true, nil
11011089
}
11021090

1091+
if retryStrategy.Expression != "" && len(childNodeIds) > 0 {
1092+
localScope := buildRetryStrategyLocalScope(node, woc.wf.Status.Nodes)
1093+
scope := env.GetFuncMap(localScope)
1094+
shouldContinue, err := argoexpr.EvalBool(retryStrategy.Expression, scope)
1095+
if err != nil {
1096+
return nil, false, err
1097+
}
1098+
if !shouldContinue && lastChildNode.Fulfilled() {
1099+
return woc.markNodePhase(node.Name, lastChildNode.Phase, "retryStrategy.expression evaluated to false"), true, nil
1100+
}
1101+
}
1102+
11031103
woc.log.Infof("%d child nodes of %s failed. Trying again...", len(childNodeIds), node.Name)
11041104
return node, true, nil
11051105
}

workflow/controller/operator_test.go

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -934,6 +934,7 @@ func TestProcessNodeRetriesWithExpression(t *testing.T) {
934934
assert.Nil(t, err)
935935
// The parent node also gets marked as Succeeded.
936936
assert.Equal(t, n.Phase, wfv1.NodeSucceeded)
937+
assert.Equal(t, "", n.Message)
937938

938939
// Mark the parent node as running again and the lastChild as errored.
939940
n = woc.markNodePhase(n.Name, wfv1.NodeRunning)
@@ -943,6 +944,7 @@ func TestProcessNodeRetriesWithExpression(t *testing.T) {
943944
n, err = woc.wf.GetNodeByName(nodeName)
944945
assert.NoError(t, err)
945946
assert.Equal(t, n.Phase, wfv1.NodeError)
947+
assert.Equal(t, "retryStrategy.expression evaluated to false", n.Message)
946948

947949
// Add a third node that has failed.
948950
woc.markNodePhase(n.Name, wfv1.NodeRunning)
@@ -954,6 +956,119 @@ func TestProcessNodeRetriesWithExpression(t *testing.T) {
954956
n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
955957
assert.NoError(t, err)
956958
assert.Equal(t, n.Phase, wfv1.NodeFailed)
959+
assert.Equal(t, "No more retries left", n.Message)
960+
}
961+
962+
func TestProcessNodeRetriesMessageOrder(t *testing.T) {
963+
cancel, controller := newController()
964+
defer cancel()
965+
assert.NotNil(t, controller)
966+
wf := wfv1.MustUnmarshalWorkflow(helloWorldWf)
967+
assert.NotNil(t, wf)
968+
woc := newWorkflowOperationCtx(wf, controller)
969+
assert.NotNil(t, woc)
970+
// Verify that there are no nodes in the wf status.
971+
assert.Zero(t, len(woc.wf.Status.Nodes))
972+
973+
// Add the parent node for retries.
974+
nodeName := "test-node"
975+
nodeID := woc.wf.NodeID(nodeName)
976+
node := woc.initializeNode(nodeName, wfv1.NodeTypeRetry, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{})
977+
retries := wfv1.RetryStrategy{}
978+
retries.Expression = "false"
979+
retries.Limit = intstrutil.ParsePtr("1")
980+
retries.RetryPolicy = wfv1.RetryPolicyAlways
981+
woc.wf.Status.Nodes[nodeID] = *node
982+
983+
assert.Equal(t, node.Phase, wfv1.NodeRunning)
984+
985+
// Ensure there are no child nodes yet.
986+
lastChild := getChildNodeIndex(node, woc.wf.Status.Nodes, -1)
987+
assert.Nil(t, lastChild)
988+
989+
// Add child nodes.
990+
for i := 0; i < 1; i++ {
991+
childNode := fmt.Sprintf("%s(%d)", nodeName, i)
992+
woc.initializeNode(childNode, wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeRunning, &wfv1.NodeFlag{Retried: true})
993+
woc.addChildNode(nodeName, childNode)
994+
}
995+
996+
n, err := woc.wf.GetNodeByName(nodeName)
997+
assert.NoError(t, err)
998+
lastChild = getChildNodeIndex(n, woc.wf.Status.Nodes, -1)
999+
assert.NotNil(t, lastChild)
1000+
1001+
// No retry related message for running node
1002+
n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1003+
assert.NoError(t, err)
1004+
assert.Equal(t, n.Phase, wfv1.NodeRunning)
1005+
1006+
// No retry related message for pending node
1007+
woc.markNodePhase(lastChild.Name, wfv1.NodePending)
1008+
n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1009+
assert.Nil(t, err)
1010+
assert.Equal(t, n.Phase, wfv1.NodeRunning)
1011+
assert.Equal(t, "", n.Message)
1012+
1013+
// No retry related message for succeeded node
1014+
woc.markNodePhase(lastChild.Name, wfv1.NodeSucceeded)
1015+
n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1016+
assert.Nil(t, err)
1017+
assert.Equal(t, n.Phase, wfv1.NodeSucceeded)
1018+
assert.Equal(t, "", n.Message)
1019+
1020+
// workflow mark shutdown, no retry is evaluated
1021+
woc.wf.Spec.Shutdown = wfv1.ShutdownStrategyStop
1022+
n = woc.markNodePhase(n.Name, wfv1.NodeRunning)
1023+
woc.markNodePhase(lastChild.Name, wfv1.NodeError)
1024+
_, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1025+
assert.NoError(t, err)
1026+
n, err = woc.wf.GetNodeByName(nodeName)
1027+
assert.NoError(t, err)
1028+
assert.Equal(t, n.Phase, wfv1.NodeError)
1029+
assert.Equal(t, "Stopped with strategy 'Stop'", n.Message)
1030+
woc.wf.Spec.Shutdown = ""
1031+
1032+
// Invalid retry policy, shouldn't evaluate expression
1033+
retries.RetryPolicy = "noExist"
1034+
n = woc.markNodePhase(n.Name, wfv1.NodeRunning)
1035+
woc.markNodePhase(lastChild.Name, wfv1.NodeError)
1036+
_, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1037+
assert.Equal(t, err.Error(), "noExist is not a valid RetryPolicy")
1038+
1039+
// Node status doesn't with retrypolicy, shouldn't evaluate expression
1040+
retries.RetryPolicy = wfv1.RetryPolicyOnFailure
1041+
n = woc.markNodePhase(n.Name, wfv1.NodeRunning)
1042+
woc.markNodePhase(lastChild.Name, wfv1.NodeError)
1043+
_, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1044+
assert.NoError(t, err)
1045+
n, err = woc.wf.GetNodeByName(nodeName)
1046+
assert.NoError(t, err)
1047+
assert.Equal(t, n.Phase, wfv1.NodeError)
1048+
assert.Equal(t, "", n.Message)
1049+
1050+
// Node status aligns with retrypolicy, should evaluate expression
1051+
retries.RetryPolicy = wfv1.RetryPolicyOnFailure
1052+
n = woc.markNodePhase(n.Name, wfv1.NodeRunning)
1053+
woc.markNodePhase(lastChild.Name, wfv1.NodeFailed)
1054+
_, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1055+
assert.NoError(t, err)
1056+
n, err = woc.wf.GetNodeByName(nodeName)
1057+
assert.NoError(t, err)
1058+
assert.Equal(t, n.Phase, wfv1.NodeFailed)
1059+
assert.Equal(t, "retryStrategy.expression evaluated to false", n.Message)
1060+
1061+
// Node status aligns with retrypolicy but reach max retry limit, shouldn't evaluate expression
1062+
woc.markNodePhase(n.Name, wfv1.NodeRunning)
1063+
childNode := fmt.Sprintf("%s(%d)", nodeName, 1)
1064+
woc.initializeNode(childNode, wfv1.NodeTypePod, "", &wfv1.WorkflowStep{}, "", wfv1.NodeFailed, &wfv1.NodeFlag{Retried: true})
1065+
woc.addChildNode(nodeName, childNode)
1066+
n, err = woc.wf.GetNodeByName(nodeName)
1067+
assert.NoError(t, err)
1068+
n, _, err = woc.processNodeRetries(n, retries, &executeTemplateOpts{})
1069+
assert.NoError(t, err)
1070+
assert.Equal(t, n.Phase, wfv1.NodeFailed)
1071+
assert.Equal(t, "No more retries left", n.Message)
9571072
}
9581073

9591074
func parseRetryMessage(message string) (int, error) {

0 commit comments

Comments
 (0)