Commit a51d4cf

OCPBUGS-59750: Move the legacy container failures
Move the legacy container failures monitor tests to a dedicated MonitorTest implementation and remove the test from the default suites for disruptive test runs.
1 parent b392d63 commit a51d4cf

File tree

6 files changed: +182 -133 lines changed

pkg/defaultmonitortests/types.go

Lines changed: 2 additions & 0 deletions

@@ -23,6 +23,7 @@ import (
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/generationanalyzer"
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/legacykubeapiservermonitortests"
 	"github.com/openshift/origin/pkg/monitortests/kubeapiserver/staticpodinstall"
+	"github.com/openshift/origin/pkg/monitortests/kubelet/containerfailures"
 	"github.com/openshift/origin/pkg/monitortests/machines/watchmachines"
 	"github.com/openshift/origin/pkg/monitortests/monitoring/disruptionmetricsapi"
 	"github.com/openshift/origin/pkg/monitortests/monitoring/statefulsetsrecreation"
@@ -140,6 +141,7 @@ func newDefaultMonitorTests(info monitortestframework.MonitorTestInitializationI
 	monitorTestRegistry.AddMonitorTestOrDie(apiunreachablefromclientmetrics.MonitorName, "kube-apiserver", apiunreachablefromclientmetrics.NewMonitorTest())
 	monitorTestRegistry.AddMonitorTestOrDie(faultyloadbalancer.MonitorName, "kube-apiserver", faultyloadbalancer.NewMonitorTest())
 	monitorTestRegistry.AddMonitorTestOrDie(staticpodinstall.MonitorName, "kube-apiserver", staticpodinstall.NewStaticPodInstallMonitorTest())
+	monitorTestRegistry.AddMonitorTestOrDie(containerfailures.MonitorName, "Node / Kubelet", containerfailures.NewContainerFailuresTests())

 	return monitorTestRegistry
 }
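
For orientation, everything passed to AddMonitorTestOrDie must satisfy origin's MonitorTest contract. The sketch below is inferred purely from the methods the new containerfailures type implements in this commit; the package placement and parameter names are assumptions, and the authoritative definition lives in pkg/monitortestframework.

package monitortestframework // assumed location of the real definition

import (
	"context"
	"time"

	"github.com/openshift/origin/pkg/monitor/monitorapi"
	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
	"k8s.io/client-go/rest"
)

// MonitorTest is an inferred sketch of the interface a monitor test
// implements; parameter names here are guesses, not the real ones.
type MonitorTest interface {
	PrepareCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error
	StartCollection(ctx context.Context, adminRESTConfig *rest.Config, recorder monitorapi.RecorderWriter) error
	CollectData(ctx context.Context, storageDir string, beginning, end time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error)
	ConstructComputedIntervals(ctx context.Context, startingIntervals monitorapi.Intervals, recordedResources monitorapi.ResourcesMap, beginning, end time.Time) (monitorapi.Intervals, error)
	EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error)
	WriteContentToStorage(ctx context.Context, storageDir, timeSuffix string, finalIntervals monitorapi.Intervals, finalResources monitorapi.ResourcesMap) error
	Cleanup(ctx context.Context) error
}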
Lines changed: 177 additions & 0 deletions

@@ -0,0 +1,177 @@
+package containerfailures
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/openshift/origin/pkg/monitortestframework"
+	"github.com/openshift/origin/pkg/monitortests/testframework/watchnamespaces"
+
+	"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
+
+	"github.com/openshift/origin/pkg/monitor/monitorapi"
+	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"k8s.io/client-go/rest"
+)
+
+const (
+	MonitorName = "kubelet-container-restarts"
+)
+
+type containerFailuresTests struct {
+	adminRESTConfig *rest.Config
+}
+
+func NewContainerFailuresTests() monitortestframework.MonitorTest {
+	return &containerFailuresTests{}
+}
+
+func (w *containerFailuresTests) PrepareCollection(context.Context, *rest.Config, monitorapi.RecorderWriter) error {
+	return nil
+}
+
+func (w *containerFailuresTests) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, _ monitorapi.RecorderWriter) error {
+	w.adminRESTConfig = adminRESTConfig
+	return nil
+}
+
+func (w *containerFailuresTests) CollectData(context.Context, string, time.Time, time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
+	return nil, nil, nil
+}
+
+func (*containerFailuresTests) ConstructComputedIntervals(context.Context, monitorapi.Intervals, monitorapi.ResourcesMap, time.Time, time.Time) (monitorapi.Intervals, error) {
+	return nil, nil
+}
+
+func (w *containerFailuresTests) EvaluateTestsFromConstructedIntervals(_ context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
+	openshiftNamespaces, err := watchnamespaces.GetAllPlatformNamespaces()
+	if err != nil {
+		// Should not happen
+		return nil, fmt.Errorf("unable to get platform namespaces: %w", err)
+	}
+	containerExitsByNamespace := map[string]map[string][]string{}
+	failuresByNamespace := map[string][]string{}
+	for _, event := range finalIntervals {
+		namespace := event.Locator.Keys[monitorapi.LocatorNamespaceKey]
+
+		reason := event.Message.Reason
+		code := event.Message.Annotations[monitorapi.AnnotationContainerExitCode]
+		switch {
+		// errors during container start should be highlighted because they are unexpected
+		case reason == monitorapi.ContainerReasonContainerWait:
+			if event.Message.Annotations[monitorapi.AnnotationCause] == "ContainerCreating" {
+				continue
+			}
+			failuresByNamespace[namespace] = append(failuresByNamespace[namespace], fmt.Sprintf("container failed to start at %v: %v - %v", event.From, event.Locator.OldLocator(), event.Message.OldMessage()))

+		// workload containers should never exit non-zero during normal operations
+		case reason == monitorapi.ContainerReasonContainerExit && code != "0":
+			containerExits, ok := containerExitsByNamespace[namespace]
+			if !ok {
+				containerExits = map[string][]string{}
+			}
+			containerExits[event.Locator.OldLocator()] = append(containerExits[event.Locator.OldLocator()], fmt.Sprintf("non-zero exit at %v: %v", event.From, event.Message.OldMessage()))
+			containerExitsByNamespace[namespace] = containerExits
+		}
+	}
+	// This is a map of the tests we want to fail on.
+	// In this case, that is any container that restarts more than 3 times.
+	excessiveExitsByNamespaceForFailedTests := map[string][]string{}
+	// We want to report restarts of openshift containers as flakes.
+	excessiveExitsByNamespaceForFlakeTests := map[string][]string{}
+
+	maxRestartCountForFailures := 3
+	maxRestartCountForFlakes := 2
+
+	clusterDataPlatform, _ := platformidentification.BuildClusterData(context.Background(), w.adminRESTConfig)
+
+	exclusions := Exclusion{clusterData: clusterDataPlatform}
+	for namespace, containerExits := range containerExitsByNamespace {
+		for locator, messages := range containerExits {
+			if len(messages) > 0 {
+				messageSet := sets.NewString(messages...)
+				// Blanket fail for restarts over maxRestartCount
+				if !isThisContainerRestartExcluded(locator, exclusions) && len(messages) > maxRestartCountForFailures {
+					excessiveExitsByNamespaceForFailedTests[namespace] = append(excessiveExitsByNamespaceForFailedTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n")))
+				} else if len(messages) >= maxRestartCountForFlakes {
+					excessiveExitsByNamespaceForFlakeTests[namespace] = append(excessiveExitsByNamespaceForFlakeTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n")))
+				}
+			}
+		}
+	}
+	for namespace, excessiveExitsFails := range excessiveExitsByNamespaceForFailedTests {
+		sort.Strings(excessiveExitsFails)
+		excessiveExitsByNamespaceForFailedTests[namespace] = excessiveExitsFails
+	}
+	for namespace, excessiveExitsFlakes := range excessiveExitsByNamespaceForFlakeTests {
+		sort.Strings(excessiveExitsFlakes)
+		excessiveExitsByNamespaceForFlakeTests[namespace] = excessiveExitsFlakes
+	}
+
+	var testCases []*junitapi.JUnitTestCase
+
+	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
+		failures := failuresByNamespace[namespace]
+		failToStartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not fail to start", namespace)
+		if len(failures) > 0 {
+			testCases = append(testCases, &junitapi.JUnitTestCase{
+				Name:      failToStartTestName,
+				SystemOut: strings.Join(failures, "\n"),
+				FailureOutput: &junitapi.FailureOutput{
+					Output: fmt.Sprintf("%d container starts had issues\n\n%s", len(failures), strings.Join(failures, "\n")),
+				},
+			})
+		}
+		// mark flaky for now while we debug
+		testCases = append(testCases, &junitapi.JUnitTestCase{Name: failToStartTestName})
+	}
+
+	// We have identified more than 3 restarts as an excessive amount.
+	// This will not be tolerated anymore, so the test will fail in this case.
+	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
+		excessiveExits := excessiveExitsByNamespaceForFailedTests[namespace]
+		excessiveRestartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit an excessive amount of times", namespace)
+		if len(excessiveExits) > 0 {
+			testCases = append(testCases, &junitapi.JUnitTestCase{
+				Name:      excessiveRestartTestName,
+				SystemOut: strings.Join(excessiveExits, "\n"),
+				FailureOutput: &junitapi.FailureOutput{
+					Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")),
+				},
+			})
+		} else {
+			testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestName})
+		}
+	}
+
+	// We have identified more than 2 restarts to be considered moderate.
+	// We will investigate these as flakes and potentially bring these up as bugs to fix.
+	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
+		excessiveExits := excessiveExitsByNamespaceForFlakeTests[namespace]
+		excessiveRestartTestNameForFlakes := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit a moderate amount of times", namespace)
+		if len(excessiveExits) > 0 {
+			testCases = append(testCases, &junitapi.JUnitTestCase{
+				Name:      excessiveRestartTestNameForFlakes,
+				SystemOut: strings.Join(excessiveExits, "\n"),
+				FailureOutput: &junitapi.FailureOutput{
+					Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")),
+				},
+			})
+		}
+		testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestNameForFlakes})
+	}
+
+	return testCases, nil
+}
+
+func (*containerFailuresTests) WriteContentToStorage(context.Context, string, string, monitorapi.Intervals, monitorapi.ResourcesMap) error {
+	return nil
+}
+
+func (*containerFailuresTests) Cleanup(context.Context) error {
+	return nil
+}
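
The "mark flaky for now while we debug" branch relies on a junitapi convention: emitting two test cases with the same name, one failing and one passing, records the result as a flake rather than a hard failure. A minimal sketch of that pattern, assuming only that duplicate-name pass/fail pairs are aggregated as flakes; flakeInsteadOfFail is a hypothetical helper, not part of this commit:

package containerfailures

import "github.com/openshift/origin/pkg/test/ginkgo/junitapi"

// flakeInsteadOfFail reports a failure as a flake by pairing the failing
// JUnit entry with a passing entry of the same name (hypothetical helper).
func flakeInsteadOfFail(name, output string) []*junitapi.JUnitTestCase {
	return []*junitapi.JUnitTestCase{
		// the failing entry carries the diagnostic output
		{Name: name, SystemOut: output, FailureOutput: &junitapi.FailureOutput{Output: output}},
		// the passing entry with the same name downgrades the result to a flake
		{Name: name},
	}
}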

pkg/monitortests/node/legacynodemonitortests/exclusions.go renamed to pkg/monitortests/kubelet/containerfailures/exclusions.go

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-package legacynodemonitortests
+package containerfailures

 import (
 	"regexp"

pkg/monitortests/node/legacynodemonitortests/exclusions_test.go renamed to pkg/monitortests/kubelet/containerfailures/exclusions_test.go

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-package legacynodemonitortests
+package containerfailures

 import (
 	"testing"

pkg/monitortests/node/legacynodemonitortests/kubelet.go

Lines changed: 0 additions & 124 deletions

@@ -2,21 +2,18 @@ package legacynodemonitortests

 import (
 	"bytes"
-	"context"
 	"fmt"
 	"regexp"
 	"sort"
 	"strings"
 	"time"

 	"github.com/openshift/origin/pkg/monitortestlibrary/platformidentification"
-	"github.com/openshift/origin/pkg/monitortests/testframework/watchnamespaces"
 	"github.com/openshift/origin/pkg/test/ginkgo/junitapi"

 	"github.com/openshift/origin/pkg/monitor/monitorapi"

 	"k8s.io/apimachinery/pkg/util/sets"
-	"k8s.io/client-go/rest"
 )

 func testKubeletToAPIServerGracefulTermination(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
@@ -224,127 +221,6 @@ func testKubeAPIServerGracefulTermination(events monitorapi.Intervals) []*junita
 	return tests
 }

-func testContainerFailures(adminRestConfig *rest.Config, events monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {
-	openshiftNamespaces, err := watchnamespaces.GetAllPlatformNamespaces()
-	if err != nil {
-		// Should not happen
-		return nil, fmt.Errorf("unable to get platform namespaces: %w", err)
-	}
-	containerExitsByNamespace := map[string]map[string][]string{}
-	failuresByNamespace := map[string][]string{}
-	for _, event := range events {
-		namespace := event.Locator.Keys[monitorapi.LocatorNamespaceKey]
-
-		reason := event.Message.Reason
-		code := event.Message.Annotations[monitorapi.AnnotationContainerExitCode]
-		switch {
-		// errors during container start should be highlighted because they are unexpected
-		case reason == monitorapi.ContainerReasonContainerWait:
-			if event.Message.Annotations[monitorapi.AnnotationCause] == "ContainerCreating" {
-				continue
-			}
-			failuresByNamespace[namespace] = append(failuresByNamespace[namespace], fmt.Sprintf("container failed to start at %v: %v - %v", event.From, event.Locator.OldLocator(), event.Message.OldMessage()))
-
-		// workload containers should never exit non-zero during normal operations
-		case reason == monitorapi.ContainerReasonContainerExit && code != "0":
-			containerExits, ok := containerExitsByNamespace[namespace]
-			if !ok {
-				containerExits = map[string][]string{}
-			}
-			containerExits[event.Locator.OldLocator()] = append(containerExits[event.Locator.OldLocator()], fmt.Sprintf("non-zero exit at %v: %v", event.From, event.Message.OldMessage()))
-			containerExitsByNamespace[namespace] = containerExits
-		}
-	}
-	// This is a map of the tests we want to fail on.
-	// In this case, that is any container that restarts more than 3 times.
-	excessiveExitsByNamespaceForFailedTests := map[string][]string{}
-	// We want to report restarts of openshift containers as flakes.
-	excessiveExitsByNamespaceForFlakeTests := map[string][]string{}
-
-	maxRestartCountForFailures := 3
-	maxRestartCountForFlakes := 2
-
-	clusterDataPlatform, _ := platformidentification.BuildClusterData(context.Background(), adminRestConfig)
-
-	exclusions := Exclusion{clusterData: clusterDataPlatform}
-	for namespace, containerExits := range containerExitsByNamespace {
-		for locator, messages := range containerExits {
-			if len(messages) > 0 {
-				messageSet := sets.NewString(messages...)
-				// Blanket fail for restarts over maxRestartCount
-				if !isThisContainerRestartExcluded(locator, exclusions) && len(messages) > maxRestartCountForFailures {
-					excessiveExitsByNamespaceForFailedTests[namespace] = append(excessiveExitsByNamespaceForFailedTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n")))
-				} else if len(messages) >= maxRestartCountForFlakes {
-					excessiveExitsByNamespaceForFlakeTests[namespace] = append(excessiveExitsByNamespaceForFlakeTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n")))
-				}
-			}
-		}
-	}
-	for namespace, excessiveExitsFails := range excessiveExitsByNamespaceForFailedTests {
-		sort.Strings(excessiveExitsFails)
-		excessiveExitsByNamespaceForFailedTests[namespace] = excessiveExitsFails
-	}
-	for namespace, excessiveExitsFlakes := range excessiveExitsByNamespaceForFlakeTests {
-		sort.Strings(excessiveExitsFlakes)
-		excessiveExitsByNamespaceForFlakeTests[namespace] = excessiveExitsFlakes
-	}
-
-	var testCases []*junitapi.JUnitTestCase
-
-	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
-		failures := failuresByNamespace[namespace]
-		failToStartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not fail to start", namespace)
-		if len(failures) > 0 {
-			testCases = append(testCases, &junitapi.JUnitTestCase{
-				Name:      failToStartTestName,
-				SystemOut: strings.Join(failures, "\n"),
-				FailureOutput: &junitapi.FailureOutput{
-					Output: fmt.Sprintf("%d container starts had issues\n\n%s", len(failures), strings.Join(failures, "\n")),
-				},
-			})
-		}
-		// mark flaky for now while we debug
-		testCases = append(testCases, &junitapi.JUnitTestCase{Name: failToStartTestName})
-	}
-
-	// We have identified more than 3 restarts as an excessive amount.
-	// This will not be tolerated anymore, so the test will fail in this case.
-	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
-		excessiveExits := excessiveExitsByNamespaceForFailedTests[namespace]
-		excessiveRestartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit an excessive amount of times", namespace)
-		if len(excessiveExits) > 0 {
-			testCases = append(testCases, &junitapi.JUnitTestCase{
-				Name:      excessiveRestartTestName,
-				SystemOut: strings.Join(excessiveExits, "\n"),
-				FailureOutput: &junitapi.FailureOutput{
-					Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")),
-				},
-			})
-		} else {
-			testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestName})
-		}
-	}
-
-	// We have identified more than 2 restarts to be considered moderate.
-	// We will investigate these as flakes and potentially bring these up as bugs to fix.
-	for _, namespace := range openshiftNamespaces { // this ensures we create a test case for every namespace, even in success cases
-		excessiveExits := excessiveExitsByNamespaceForFlakeTests[namespace]
-		excessiveRestartTestNameForFlakes := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit a moderate amount of times", namespace)
-		if len(excessiveExits) > 0 {
-			testCases = append(testCases, &junitapi.JUnitTestCase{
-				Name:      excessiveRestartTestNameForFlakes,
-				SystemOut: strings.Join(excessiveExits, "\n"),
-				FailureOutput: &junitapi.FailureOutput{
-					Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")),
-				},
-			})
-		}
-		testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestNameForFlakes})
-	}
-
-	return testCases, nil
-}
-
 func testKubeApiserverProcessOverlap(events monitorapi.Intervals) []*junitapi.JUnitTestCase {
 	const testName = "[sig-node] overlapping apiserver process detected during kube-apiserver rollout"
 	success := &junitapi.JUnitTestCase{Name: testName}
pkg/monitortests/node/legacynodemonitortests/monitortest.go

Lines changed: 1 addition & 7 deletions

@@ -41,13 +41,7 @@ func (*legacyMonitorTests) ConstructComputedIntervals(ctx context.Context, start
 func (w *legacyMonitorTests) EvaluateTestsFromConstructedIntervals(ctx context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) {

 	clusterData, _ := platformidentification.BuildClusterData(context.Background(), w.adminRESTConfig)
-
-	containerFailures, err := testContainerFailures(w.adminRESTConfig, finalIntervals)
-	if err != nil {
-		return nil, err
-	}
-	junits := []*junitapi.JUnitTestCase{}
-	junits = append(junits, containerFailures...)
+	var junits []*junitapi.JUnitTestCase
 	junits = append(junits, testDeleteGracePeriodZero(finalIntervals)...)
 	junits = append(junits, testKubeApiserverProcessOverlap(finalIntervals)...)
 	junits = append(junits, testKubeAPIServerGracefulTermination(finalIntervals)...)
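
The swap from an empty-slice literal to a nil declaration is behavior-preserving: in Go, append works on a nil slice and allocates backing storage on first use, so junits accumulates the same results either way. A quick self-contained illustration:

package main

import "fmt"

func main() {
	var junits []string // nil slice; append allocates on first use
	junits = append(junits, "testA", "testB")
	fmt.Println(len(junits), junits) // prints: 2 [testA testB]
}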
