Skip to content

Commit 659dce0

Browse files
eleanorprattEleanor Pratt
andauthored
Allow pods to be retried when failed containers match regex (#4483)
This adds functionality to the executor to allow Armada to be configured such that pods that have failed and have entered "user" code, i.e any initContainers or Containers in non-pending state, can be retried if they match configured regex. This regex is for the container name and the container message. For any retrying of this type to occur, both the container name regex and the message regex must match. Given the nature of regex, not setting the regex value results in that regex matching all string values. For example, this configuration would retry any container where 'test' is in the termination message, regardless of the container name. ``` failedPodChecks: containerStatuses: messageRegexp: "test" ``` whereas this configuration would match any container, regardless of what the termination message is or if there is a termination message value at all. (The message string can be empty.) ``` failedPodChecks: containerStatuses: - containerNameRegexp: ".*" ``` --------- Co-authored-by: Eleanor Pratt <Eleanor.Pratt@gresearch.co.uk>
1 parent 9c8611c commit 659dce0

File tree

5 files changed

+240
-10
lines changed

5 files changed

+240
-10
lines changed

internal/executor/configuration/podchecks/types.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ type ContainerStatusCheck struct {
4949
}
5050

5151
type FailedChecks struct {
52-
Events []PodEventCheck
53-
PodStatuses []PodStatusCheck
52+
Events []PodEventCheck
53+
PodStatuses []PodStatusCheck
54+
ContainerStatuses []FailedContainerStatusCheck
5455
}
5556

5657
type PodEventCheck struct {
@@ -63,3 +64,8 @@ type PodStatusCheck struct {
6364
Regexp string
6465
Reason string
6566
}
67+
68+
type FailedContainerStatusCheck struct {
69+
ContainerNameRegexp string
70+
MessageRegexp string
71+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package failedpodchecks
2+
3+
import (
4+
"fmt"
5+
"regexp"
6+
7+
v1 "k8s.io/api/core/v1"
8+
9+
"github.com/armadaproject/armada/internal/executor/configuration/podchecks"
10+
)
11+
12+
type failedContainerStatusRetryChecker interface {
13+
IsRetryable(pod *v1.Pod) (bool, string)
14+
}
15+
16+
type failedContainerStatusCheck struct {
17+
containerNameRegexp *regexp.Regexp
18+
messageRegexp *regexp.Regexp
19+
}
20+
21+
type FailedContainerStatusChecker struct {
22+
checks []failedContainerStatusCheck
23+
}
24+
25+
func NewFailedContainerStatusChecker(checks []podchecks.FailedContainerStatusCheck) (*FailedContainerStatusChecker, error) {
26+
podContainerStatusChecks := make([]failedContainerStatusCheck, 0, len(checks))
27+
28+
for _, check := range checks {
29+
containerNameRegex, err := regexp.Compile(check.ContainerNameRegexp)
30+
if err != nil {
31+
return nil, fmt.Errorf("cannot parse regexp \"%s\": %+v", check.ContainerNameRegexp, err)
32+
}
33+
messageRegex, err := regexp.Compile(check.MessageRegexp)
34+
if err != nil {
35+
return nil, fmt.Errorf("cannot parse regexp \"%s\": %+v", check.MessageRegexp, err)
36+
}
37+
38+
podContainerStatusChecks = append(podContainerStatusChecks, failedContainerStatusCheck{
39+
containerNameRegexp: containerNameRegex,
40+
messageRegexp: messageRegex,
41+
})
42+
}
43+
return &FailedContainerStatusChecker{
44+
checks: podContainerStatusChecks,
45+
}, nil
46+
}
47+
48+
func (f *FailedContainerStatusChecker) IsRetryable(pod *v1.Pod) (bool, string) {
49+
containers := pod.Status.ContainerStatuses
50+
containers = append(containers, pod.Status.InitContainerStatuses...)
51+
52+
for _, check := range f.checks {
53+
for _, container := range containers {
54+
terminatedContainer := container.State.Terminated
55+
if terminatedContainer != nil {
56+
if check.containerNameRegexp.MatchString(container.Name) {
57+
if check.messageRegexp.MatchString(terminatedContainer.Message) {
58+
return true, terminatedContainer.Message
59+
}
60+
}
61+
}
62+
63+
}
64+
}
65+
66+
return false, ""
67+
}
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package failedpodchecks
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
"github.com/stretchr/testify/require"
8+
v1 "k8s.io/api/core/v1"
9+
10+
"github.com/armadaproject/armada/internal/executor/configuration/podchecks"
11+
)
12+
13+
func TestFailedContainerStatusChecker_IsRetryable(t *testing.T) {
14+
tests := map[string]struct {
15+
input *v1.Pod
16+
checks []podchecks.FailedContainerStatusCheck
17+
expectedResult bool
18+
expectMessage bool
19+
}{
20+
"empty checks": {
21+
input: makePodInitContainerWithNameAndMessage("orange", "message"),
22+
expectedResult: false,
23+
expectMessage: false,
24+
},
25+
"matches on name regex": {
26+
input: makePodContainerWithNameAndMessage("apple", ""),
27+
checks: []podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: "app.*"}},
28+
expectedResult: true,
29+
expectMessage: false,
30+
},
31+
"matches on message regex": {
32+
input: makePodInitContainerWithNameAndMessage("apple", "message"),
33+
checks: []podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: ".*", MessageRegexp: "mess.*"}},
34+
expectedResult: true,
35+
expectMessage: true,
36+
},
37+
"matches on name and message if supplied": {
38+
input: makePodContainerWithNameAndMessage("banana", "message"),
39+
checks: []podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: "ban.*", MessageRegexp: "mess.*"}},
40+
expectedResult: true,
41+
expectMessage: true,
42+
},
43+
"matches on name - no match": {
44+
input: makePodInitContainerWithNameAndMessage("plum", "message"),
45+
checks: []podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: "app.*", MessageRegexp: ""}},
46+
expectedResult: false,
47+
expectMessage: false,
48+
},
49+
"matches on message- no match": {
50+
input: makePodContainerWithNameAndMessage("plum", "message"),
51+
checks: []podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: "plum", MessageRegexp: "memo.*"}},
52+
expectedResult: false,
53+
expectMessage: false,
54+
},
55+
"multiple checks - no match": {
56+
input: makePodInitContainerWithNameAndMessage("name", "message"),
57+
checks: []podchecks.FailedContainerStatusCheck{
58+
{ContainerNameRegexp: "nomenclature.*", MessageRegexp: ""},
59+
{ContainerNameRegexp: "name", MessageRegexp: "memo.*"},
60+
{ContainerNameRegexp: "nomenclature.*", MessageRegexp: "message2"},
61+
},
62+
expectedResult: false,
63+
expectMessage: false,
64+
},
65+
"multiple checks - match": {
66+
input: makePodContainerWithNameAndMessage("name", "message"),
67+
checks: []podchecks.FailedContainerStatusCheck{
68+
{ContainerNameRegexp: ".*", MessageRegexp: ""},
69+
{ContainerNameRegexp: "na.*", MessageRegexp: "message"},
70+
{ContainerNameRegexp: "mess.*", MessageRegexp: ""},
71+
},
72+
expectedResult: true,
73+
expectMessage: true,
74+
},
75+
}
76+
77+
for name, tc := range tests {
78+
t.Run(name, func(t *testing.T) {
79+
checker, err := NewFailedContainerStatusChecker(tc.checks)
80+
require.NoError(t, err)
81+
82+
isRetryable, message := checker.IsRetryable(tc.input)
83+
84+
assert.Equal(t, tc.expectedResult, isRetryable)
85+
if tc.expectMessage {
86+
assert.NotEmpty(t, message)
87+
} else {
88+
assert.Empty(t, message)
89+
}
90+
})
91+
}
92+
}
93+
94+
func TestFailedContainerStatusChecker_Initialisation(t *testing.T) {
95+
// Empty
96+
_, err := NewFailedContainerStatusChecker([]podchecks.FailedContainerStatusCheck{})
97+
assert.NoError(t, err)
98+
// Valid
99+
_, err = NewFailedContainerStatusChecker([]podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: ".*"}})
100+
assert.NoError(t, err)
101+
// Invalid regex
102+
_, err = NewFailedContainerStatusChecker([]podchecks.FailedContainerStatusCheck{{ContainerNameRegexp: ".*", MessageRegexp: "["}})
103+
assert.Error(t, err)
104+
}
105+
106+
func makePodInitContainerWithNameAndMessage(name string, message string) *v1.Pod {
107+
return &v1.Pod{
108+
Status: v1.PodStatus{
109+
InitContainerStatuses: []v1.ContainerStatus{
110+
{
111+
Name: name,
112+
State: v1.ContainerState{
113+
Terminated: &v1.ContainerStateTerminated{
114+
Message: message,
115+
},
116+
},
117+
},
118+
},
119+
},
120+
}
121+
}
122+
123+
func makePodContainerWithNameAndMessage(name string, message string) *v1.Pod {
124+
return &v1.Pod{
125+
Status: v1.PodStatus{
126+
ContainerStatuses: []v1.ContainerStatus{
127+
{
128+
Name: name,
129+
State: v1.ContainerState{
130+
Terminated: &v1.ContainerStateTerminated{
131+
Message: message,
132+
},
133+
},
134+
},
135+
},
136+
},
137+
}
138+
}

internal/executor/podchecks/failedpodchecks/pod_checks.go

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@ type RetryChecker interface {
1111
}
1212

1313
type PodRetryChecker struct {
14-
podStatusChecker podStatusRetryChecker
15-
podEventChecker eventRetryChecker
14+
podStatusChecker podStatusRetryChecker
15+
podEventChecker eventRetryChecker
16+
failedContainerStatusChecker failedContainerStatusRetryChecker
1617
}
1718

1819
func NewPodRetryChecker(config podchecks.FailedChecks) (*PodRetryChecker, error) {
@@ -24,16 +25,21 @@ func NewPodRetryChecker(config podchecks.FailedChecks) (*PodRetryChecker, error)
2425
if err != nil {
2526
return nil, err
2627
}
28+
failedContainerStatusChecker, err := NewFailedContainerStatusChecker(config.ContainerStatuses)
29+
if err != nil {
30+
return nil, err
31+
}
2732

2833
return &PodRetryChecker{
29-
podEventChecker: podEventChecker,
30-
podStatusChecker: podStatusChecker,
34+
podEventChecker: podEventChecker,
35+
podStatusChecker: podStatusChecker,
36+
failedContainerStatusChecker: failedContainerStatusChecker,
3137
}, nil
3238
}
3339

3440
func (f *PodRetryChecker) IsRetryable(pod *v1.Pod, podEvents []*v1.Event) (bool, string) {
3541
if hasStartedContainers(pod) {
36-
return false, ""
42+
return f.failedContainerStatusChecker.IsRetryable(pod)
3743
}
3844

3945
isRetryable, message := f.podEventChecker.IsRetryable(podEvents)
@@ -49,7 +55,10 @@ func hasStartedContainers(pod *v1.Pod) bool {
4955
containers := pod.Status.ContainerStatuses
5056
containers = append(containers, pod.Status.InitContainerStatuses...)
5157
for _, container := range containers {
52-
if container.LastTerminationState.Terminated != nil || container.LastTerminationState.Running != nil {
58+
if container.State.Running != nil ||
59+
container.State.Terminated != nil ||
60+
container.LastTerminationState.Running != nil ||
61+
container.LastTerminationState.Terminated != nil {
5362
return true
5463
}
5564
}

internal/executor/podchecks/failedpodchecks/pod_checks_test.go

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,9 @@ func TestPodRetryCheckerIsRetryable_ChecksPodHasNotStartedAnyContainers(t *testi
142142
for name, tc := range tests {
143143
t.Run(name, func(t *testing.T) {
144144
checker := PodRetryChecker{
145-
podEventChecker: &stubEventRetryChecker{isRetryable: true, message: "fallthrough"},
146-
podStatusChecker: &stubPodStatusRetryChecker{isRetryable: true, message: "fallthrough"},
145+
podEventChecker: &stubEventRetryChecker{isRetryable: true, message: "fallthrough"},
146+
podStatusChecker: &stubPodStatusRetryChecker{isRetryable: true, message: "fallthrough"},
147+
failedContainerStatusChecker: &stubFailedContainerStatusChecker{isRetryable: false, message: ""}, // if this reaches evaluation, then failure results in a non-retryable state overall
147148
}
148149

149150
isRetryable, message := checker.IsRetryable(tc.pod, []*v1.Event{})
@@ -207,3 +208,12 @@ type stubPodStatusRetryChecker struct {
207208
func (s *stubPodStatusRetryChecker) IsRetryable(pod *v1.Pod) (bool, string) {
208209
return s.isRetryable, s.message
209210
}
211+
212+
type stubFailedContainerStatusChecker struct {
213+
isRetryable bool
214+
message string
215+
}
216+
217+
func (s *stubFailedContainerStatusChecker) IsRetryable(pod *v1.Pod) (bool, string) {
218+
return s.isRetryable, s.message
219+
}

0 commit comments

Comments
 (0)