|
| 1 | +package containerfailures |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "fmt" |
| 6 | + "sort" |
| 7 | + "strings" |
| 8 | + "time" |
| 9 | + |
| 10 | + "github.com/openshift/origin/pkg/monitortestframework" |
| 11 | + "github.com/openshift/origin/pkg/monitortests/testframework/watchnamespaces" |
| 12 | + |
| 13 | + "github.com/openshift/origin/pkg/monitortestlibrary/platformidentification" |
| 14 | + |
| 15 | + "github.com/openshift/origin/pkg/monitor/monitorapi" |
| 16 | + "github.com/openshift/origin/pkg/test/ginkgo/junitapi" |
| 17 | + "k8s.io/apimachinery/pkg/util/sets" |
| 18 | + "k8s.io/client-go/rest" |
| 19 | +) |
| 20 | + |
const (
	// MonitorName is the identifier under which this monitor test is
	// registered with the monitor test framework.
	MonitorName = "kubelet-container-restarts"
)
| 24 | + |
// containerFailuresTests is a monitor test that turns container start
// failures and non-zero container exits observed in platform namespaces
// into per-namespace JUnit test cases.
type containerFailuresTests struct {
	// adminRESTConfig is captured in StartCollection and later used to
	// build cluster data for platform-specific restart exclusions.
	adminRESTConfig *rest.Config
}
| 28 | + |
// NewContainerFailuresTests returns a monitor test that reports container
// start failures and excessive container restarts in platform namespaces.
func NewContainerFailuresTests() monitortestframework.MonitorTest {
	return &containerFailuresTests{}
}
| 32 | + |
// PrepareCollection is part of the monitortestframework.MonitorTest
// lifecycle. This test needs no preparation, so it is a no-op.
func (w *containerFailuresTests) PrepareCollection(context.Context, *rest.Config, monitorapi.RecorderWriter) error {
	return nil
}
| 36 | + |
// StartCollection stores the admin REST config for later use when building
// cluster data during test evaluation. No active collection is started; the
// test works entirely from intervals recorded elsewhere.
func (w *containerFailuresTests) StartCollection(ctx context.Context, adminRESTConfig *rest.Config, _ monitorapi.RecorderWriter) error {
	w.adminRESTConfig = adminRESTConfig
	return nil
}
| 40 | +} |
| 41 | + |
// CollectData is part of the monitortestframework.MonitorTest lifecycle.
// This test produces no intervals of its own, so it is a no-op.
func (w *containerFailuresTests) CollectData(context.Context, string, time.Time, time.Time) (monitorapi.Intervals, []*junitapi.JUnitTestCase, error) {
	return nil, nil, nil
}
| 45 | + |
// ConstructComputedIntervals is part of the monitortestframework.MonitorTest
// lifecycle. This test computes no additional intervals, so it is a no-op.
func (*containerFailuresTests) ConstructComputedIntervals(context.Context, monitorapi.Intervals, monitorapi.ResourcesMap, time.Time, time.Time) (monitorapi.Intervals, error) {
	return nil, nil
}
| 49 | + |
| 50 | +func (w *containerFailuresTests) EvaluateTestsFromConstructedIntervals(_ context.Context, finalIntervals monitorapi.Intervals) ([]*junitapi.JUnitTestCase, error) { |
| 51 | + openshiftNamespaces, err := watchnamespaces.GetAllPlatformNamespaces() |
| 52 | + if err != nil { |
| 53 | + // Should not happen |
| 54 | + return nil, fmt.Errorf("unable to get platform namespaces %w", err) |
| 55 | + } |
| 56 | + containerExitsByNamespace := map[string]map[string][]string{} |
| 57 | + failuresByNamespace := map[string][]string{} |
| 58 | + for _, event := range finalIntervals { |
| 59 | + namespace := event.Locator.Keys[monitorapi.LocatorNamespaceKey] |
| 60 | + |
| 61 | + reason := event.Message.Reason |
| 62 | + code := event.Message.Annotations[monitorapi.AnnotationContainerExitCode] |
| 63 | + switch { |
| 64 | + // errors during container start should be highlighted because they are unexpected |
| 65 | + case reason == monitorapi.ContainerReasonContainerWait: |
| 66 | + if event.Message.Annotations[monitorapi.AnnotationCause] == "ContainerCreating" { |
| 67 | + continue |
| 68 | + } |
| 69 | + failuresByNamespace[namespace] = append(failuresByNamespace[namespace], fmt.Sprintf("container failed to start at %v: %v - %v", event.From, event.Locator.OldLocator(), event.Message.OldMessage())) |
| 70 | + |
| 71 | + // workload containers should never exit non-zero during normal operations |
| 72 | + case reason == monitorapi.ContainerReasonContainerExit && code != "0": |
| 73 | + containerExits, ok := containerExitsByNamespace[namespace] |
| 74 | + if !ok { |
| 75 | + containerExits = map[string][]string{} |
| 76 | + } |
| 77 | + containerExits[event.Locator.OldLocator()] = append(containerExits[event.Locator.OldLocator()], fmt.Sprintf("non-zero exit at %v: %v", event.From, event.Message.OldMessage())) |
| 78 | + containerExitsByNamespace[namespace] = containerExits |
| 79 | + } |
| 80 | + } |
| 81 | + // This is a map of the tests we want to fail on |
| 82 | + // In this case, this is any container that restarts more than 3 times |
| 83 | + excessiveExitsByNamespaceForFailedTests := map[string][]string{} |
| 84 | + // We want to report restarts of openshift containers as flakes |
| 85 | + excessiveExitsByNamespaceForFlakeTests := map[string][]string{} |
| 86 | + |
| 87 | + maxRestartCountForFailures := 3 |
| 88 | + maxRestartCountForFlakes := 2 |
| 89 | + |
| 90 | + clusterDataPlatform, _ := platformidentification.BuildClusterData(context.Background(), w.adminRESTConfig) |
| 91 | + |
| 92 | + exclusions := Exclusion{clusterData: clusterDataPlatform} |
| 93 | + for namespace, containerExits := range containerExitsByNamespace { |
| 94 | + for locator, messages := range containerExits { |
| 95 | + if len(messages) > 0 { |
| 96 | + messageSet := sets.NewString(messages...) |
| 97 | + // Blanket fail for restarts over maxRestartCount |
| 98 | + if !isThisContainerRestartExcluded(locator, exclusions) && len(messages) > maxRestartCountForFailures { |
| 99 | + excessiveExitsByNamespaceForFailedTests[namespace] = append(excessiveExitsByNamespaceForFailedTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n"))) |
| 100 | + } else if len(messages) >= maxRestartCountForFlakes { |
| 101 | + excessiveExitsByNamespaceForFlakeTests[namespace] = append(excessiveExitsByNamespaceForFlakeTests[namespace], fmt.Sprintf("%s restarted %d times at:\n%s", locator, len(messages), strings.Join(messageSet.List(), "\n"))) |
| 102 | + } |
| 103 | + } |
| 104 | + } |
| 105 | + } |
| 106 | + for namespace, excessiveExitsFails := range excessiveExitsByNamespaceForFailedTests { |
| 107 | + sort.Strings(excessiveExitsFails) |
| 108 | + excessiveExitsByNamespaceForFailedTests[namespace] = excessiveExitsFails |
| 109 | + } |
| 110 | + for namespace, excessiveExitsFlakes := range excessiveExitsByNamespaceForFlakeTests { |
| 111 | + sort.Strings(excessiveExitsFlakes) |
| 112 | + excessiveExitsByNamespaceForFlakeTests[namespace] = excessiveExitsFlakes |
| 113 | + } |
| 114 | + |
| 115 | + var testCases []*junitapi.JUnitTestCase |
| 116 | + |
| 117 | + for _, namespace := range openshiftNamespaces { // this ensures we create test case for every namespace, even in success cases |
| 118 | + failures := failuresByNamespace[namespace] |
| 119 | + failToStartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not fail to start", namespace) |
| 120 | + if len(failures) > 0 { |
| 121 | + testCases = append(testCases, &junitapi.JUnitTestCase{ |
| 122 | + Name: failToStartTestName, |
| 123 | + SystemOut: strings.Join(failures, "\n"), |
| 124 | + FailureOutput: &junitapi.FailureOutput{ |
| 125 | + Output: fmt.Sprintf("%d container starts had issues\n\n%s", len(failures), strings.Join(failures, "\n")), |
| 126 | + }, |
| 127 | + }) |
| 128 | + } |
| 129 | + // mark flaky for now while we debug |
| 130 | + testCases = append(testCases, &junitapi.JUnitTestCase{Name: failToStartTestName}) |
| 131 | + } |
| 132 | + |
| 133 | + // We have identified more than 3 restarts as an excessive amount |
| 134 | + // This will not be tolerated anymore so the test will fail in this case. |
| 135 | + for _, namespace := range openshiftNamespaces { // this ensures we create test case for every namespace, even in success cases |
| 136 | + excessiveExits := excessiveExitsByNamespaceForFailedTests[namespace] |
| 137 | + excessiveRestartTestName := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit an excessive amount of times", namespace) |
| 138 | + if len(excessiveExits) > 0 { |
| 139 | + testCases = append(testCases, &junitapi.JUnitTestCase{ |
| 140 | + Name: excessiveRestartTestName, |
| 141 | + SystemOut: strings.Join(excessiveExits, "\n"), |
| 142 | + FailureOutput: &junitapi.FailureOutput{ |
| 143 | + Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")), |
| 144 | + }, |
| 145 | + }) |
| 146 | + } else { |
| 147 | + testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestName}) |
| 148 | + } |
| 149 | + } |
| 150 | + |
| 151 | + // We have indentified more than 2 restarts to be considered moderate. |
| 152 | + // We will investigate these as flakes and potentially bring these up as bugs to fix. |
| 153 | + for _, namespace := range openshiftNamespaces { // this ensures we create test case for every namespace, even in success cases |
| 154 | + excessiveExits := excessiveExitsByNamespaceForFlakeTests[namespace] |
| 155 | + excessiveRestartTestNameForFlakes := fmt.Sprintf("[sig-architecture] platform pods in ns/%s should not exit a moderate amount of times", namespace) |
| 156 | + if len(excessiveExits) > 0 { |
| 157 | + testCases = append(testCases, &junitapi.JUnitTestCase{ |
| 158 | + Name: excessiveRestartTestNameForFlakes, |
| 159 | + SystemOut: strings.Join(excessiveExits, "\n"), |
| 160 | + FailureOutput: &junitapi.FailureOutput{ |
| 161 | + Output: fmt.Sprintf("%d containers with multiple restarts\n\n%s", len(excessiveExits), strings.Join(excessiveExits, "\n\n")), |
| 162 | + }, |
| 163 | + }) |
| 164 | + } |
| 165 | + testCases = append(testCases, &junitapi.JUnitTestCase{Name: excessiveRestartTestNameForFlakes}) |
| 166 | + } |
| 167 | + |
| 168 | + return testCases, nil |
| 169 | +} |
| 170 | + |
// WriteContentToStorage is part of the monitortestframework.MonitorTest
// lifecycle. This test writes no artifacts, so it is a no-op.
func (*containerFailuresTests) WriteContentToStorage(context.Context, string, string, monitorapi.Intervals, monitorapi.ResourcesMap) error {
	return nil
}
| 174 | + |
// Cleanup is part of the monitortestframework.MonitorTest lifecycle.
// Nothing was allocated by this test, so it is a no-op.
func (*containerFailuresTests) Cleanup(context.Context) error {
	return nil
}
0 commit comments