From b2d07066da7636826e2657b3617d4f405a03c476 Mon Sep 17 00:00:00 2001 From: Sebastian Soto Date: Tue, 30 Sep 2025 10:28:34 -0400 Subject: [PATCH 1/2] Add timeout to metrics query Prevents client.Do from blocking indefinitely. --- test/e2e/metrics_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 2c7fc6a35a..249451f647 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -12,6 +12,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -267,7 +268,7 @@ func makePrometheusQuery(address, query, token string) ([]Result, error) { // InsecureSkipVerify is required to avoid errors due to bad certificate TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, } - client := &http.Client{Transport: tr} + client := &http.Client{Transport: tr, Timeout: 15 * time.Second} resp, err := client.Do(req) if err != nil { return nil, fmt.Errorf("error making GET request: %w", err) From 9e735d6d468feba7e8dca80ad86a51b0a425ce12 Mon Sep 17 00:00:00 2001 From: Sebastian Soto Date: Tue, 30 Sep 2025 11:00:47 -0400 Subject: [PATCH 2/2] Clean up job failure logs When a job fails, a wall of text is dumped in the test job logs, including the full job spec as well as all events associated with it. This hasn't proven to be useful for debugging and makes the errors harder to read. All of this information can be found elsewhere in the artifact directory. 
--- test/e2e/network_test.go | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/test/e2e/network_test.go b/test/e2e/network_test.go index a6b204214d..b5927fd4a6 100644 --- a/test/e2e/network_test.go +++ b/test/e2e/network_test.go @@ -979,28 +979,11 @@ func (tc *testContext) waitUntilDeploymentScaled(name string) error { return false, nil }) if err != nil { - events, _ := tc.getPodEvents(name) - return fmt.Errorf("error waiting for deployment %v to scale: %v: %w", deployment, events, err) + return fmt.Errorf("error waiting for deployment %s to scale: %w", deployment.Name, err) } return nil } -// getPodEvents gets all events for any pod with the input in its name. Used for debugging purposes -func (tc *testContext) getPodEvents(name string) ([]v1.Event, error) { - eventList, err := tc.client.K8s.CoreV1().Events(tc.workloadNamespace).List(context.TODO(), metav1.ListOptions{ - FieldSelector: "involvedObject.kind=Pod"}) - if err != nil { - return []v1.Event{}, err - } - var podEvents []v1.Event - for _, event := range eventList.Items { - if strings.Contains(event.InvolvedObject.Name, name) { - podEvents = append(podEvents, event) - } - } - return podEvents, nil -} - // createLinuxCurlerJob creates a linux job to curl a specific endpoint. curl must be present in the container image. 
func (tc *testContext) createLinuxCurlerJob(jobSuffix, endpoint string, continuous bool) (*batchv1.Job, error) { // Retries a failed curl attempt once to avoid flakes @@ -1125,7 +1108,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { for i := 0; i < 60; i++ { job, err = tc.client.K8s.BatchV1().Jobs(tc.workloadNamespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { - return "", err + return "", fmt.Errorf("error getting job with name %s: %w", name, err) } if !slices.ContainsFunc(job.Status.Conditions, func(condition batchv1.JobCondition) bool { return condition.Type == batchv1.JobComplete && condition.Status == v1.ConditionTrue @@ -1143,8 +1126,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { return condition.Type == batchv1.JobSuccessCriteriaMet && condition.Status == v1.ConditionTrue }) { // Job did not succeed, return error - events, _ := tc.getPodEvents(name) - return logs, fmt.Errorf("job %v failed: %v", job, events) + return logs, fmt.Errorf("job %s failed", name) } return logs, nil } @@ -1152,8 +1134,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { if err != nil { log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err) } - events, _ := tc.getPodEvents(name) - return "", fmt.Errorf("job %v timed out: %v", job, events) + return "", fmt.Errorf("job %s timed out", name) } // gatherPodLogs writes the logs associated with the label selector of a given pod job or deployment to the Artifacts