Skip to content

Commit daf778c

Browse files
sebsotoopenshift-cherrypick-robot
authored andcommitted
Fix job success logic
We are reporting jobs as failed if a pod within the job failed. This is incorrect logic, as a single pod failing is not a failure state for the entire job. The job will keep creating pods until a success is seen.
1 parent 4a8da4c commit daf778c

File tree

2 files changed

+29
-19
lines changed

2 files changed

+29
-19
lines changed

test/e2e/network_test.go

Lines changed: 28 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"net/http"
99
"os"
1010
"path/filepath"
11+
"slices"
1112
"strings"
1213
"testing"
1314
"time"
@@ -238,12 +239,12 @@ func (tc *testContext) collectDeploymentLogs(deployment *appsv1.Deployment) erro
238239
keyValPairs = append(keyValPairs, key+"="+value)
239240
}
240241
labelSelector := strings.Join(keyValPairs, ",")
241-
_, err := tc.gatherPodLogs(labelSelector)
242+
_, err := tc.gatherPodLogs(labelSelector, false)
242243
return err
243244
}
244245

245246
// getLogs uses a label selector and returns the logs associated with each pod
246-
func (tc *testContext) getLogs(podLabelSelector string) (string, error) {
247+
func (tc *testContext) getLogs(podLabelSelector string, latestOnly bool) (string, error) {
247248
if podLabelSelector == "" {
248249
return "", fmt.Errorf("pod label selector is empty")
249250
}
@@ -256,6 +257,12 @@ func (tc *testContext) getLogs(podLabelSelector string) (string, error) {
256257
return "", fmt.Errorf("expected at least 1 pod and found 0")
257258
}
258259
var logs string
260+
if latestOnly {
261+
latestPod := slices.MaxFunc(pods.Items, func(a, b v1.Pod) int {
262+
return a.Status.StartTime.Time.Compare(b.Status.StartTime.Time)
263+
})
264+
pods.Items = []v1.Pod{latestPod}
265+
}
259266
for _, pod := range pods.Items {
260267
logStream, err := tc.client.K8s.CoreV1().Pods(tc.workloadNamespace).GetLogs(pod.Name,
261268
&v1.PodLogOptions{}).Stream(context.TODO())
@@ -921,25 +928,28 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) {
921928
if err != nil {
922929
return "", err
923930
}
931+
if !slices.ContainsFunc(job.Status.Conditions, func(condition batchv1.JobCondition) bool {
932+
return condition.Type == batchv1.JobComplete && condition.Status == v1.ConditionTrue
933+
}) {
934+
// job not yet complete, keep waiting
935+
time.Sleep(retryInterval)
936+
continue
937+
}
924938
labelSelector = "job-name=" + job.Name
925-
if job.Status.Succeeded > 0 {
926-
logs, err := tc.gatherPodLogs(labelSelector)
927-
if err != nil {
928-
log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err)
929-
}
930-
return logs, nil
939+
logs, err := tc.gatherPodLogs(labelSelector, true)
940+
if err != nil {
941+
log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err)
931942
}
932-
if job.Status.Failed > 0 {
933-
_, err = tc.gatherPodLogs(labelSelector)
934-
if err != nil {
935-
log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err)
936-
}
943+
if !slices.ContainsFunc(job.Status.Conditions, func(condition batchv1.JobCondition) bool {
944+
return condition.Type == batchv1.JobSuccessCriteriaMet && condition.Status == v1.ConditionTrue
945+
}) {
946+
// Job did not succeed, return error
937947
events, _ := tc.getPodEvents(name)
938-
return "", fmt.Errorf("job %v failed: %v", job, events)
948+
return logs, fmt.Errorf("job %v failed: %v", job, events)
939949
}
940-
time.Sleep(retryInterval)
950+
return logs, nil
941951
}
942-
_, err = tc.gatherPodLogs(labelSelector)
952+
_, err = tc.gatherPodLogs(labelSelector, true)
943953
if err != nil {
944954
log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err)
945955
}
@@ -949,15 +959,15 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) {
949959

950960
// gatherPodLogs writes the logs associated with the label selector of a given pod job or deployment to the Artifacts
951961
// dir. Returns the written logs.
952-
func (tc *testContext) gatherPodLogs(labelSelector string) (string, error) {
962+
func (tc *testContext) gatherPodLogs(labelSelector string, latestOnly bool) (string, error) {
953963
podArtifacts := filepath.Join(os.Getenv("ARTIFACT_DIR"), "pods")
954964
podDir := filepath.Join(podArtifacts, labelSelector)
955965
err := os.MkdirAll(podDir, os.ModePerm)
956966
if err != nil {
957967
return "", fmt.Errorf("error creating pod log collection directory %s: %w", podDir, err)
958968
}
959969

960-
logs, err := tc.getLogs(labelSelector)
970+
logs, err := tc.getLogs(labelSelector, latestOnly)
961971
if err != nil {
962972
return "", fmt.Errorf("unable to get logs for pod %s: %w", labelSelector, err)
963973
}

test/e2e/upgrade_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ func (tc *testContext) testParallelUpgradesChecker(t *testing.T) {
110110
failedPods, err := tc.client.K8s.CoreV1().Pods(tc.workloadNamespace).List(context.TODO(), metav1.ListOptions{
111111
LabelSelector: "job-name=" + parallelUpgradesCheckerJobName, FieldSelector: "status.phase=Failed"})
112112
require.NoError(t, err)
113-
_, err = tc.gatherPodLogs("job-name=" + parallelUpgradesCheckerJobName)
113+
_, err = tc.gatherPodLogs("job-name="+parallelUpgradesCheckerJobName, false)
114114
if err != nil {
115115
log.Printf("unable to gather logs: %v", err)
116116
}

0 commit comments

Comments
 (0)