From aa9e9a4b53cd7823c9667b263f52d4d34019149c Mon Sep 17 00:00:00 2001 From: Sebastian Soto Date: Tue, 30 Sep 2025 10:28:34 -0400 Subject: [PATCH 1/2] [test] Add timeout to metrics query Prevents client.Do from blocking indefinitely. --- test/e2e/metrics_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 2c7fc6a35a..249451f647 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -12,6 +12,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -267,7 +268,7 @@ func makePrometheusQuery(address, query, token string) ([]Result, error) { // InsecureSkipVerify is required to avoid errors due to bad certificate TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, } - client := &http.Client{Transport: tr} + client := &http.Client{Transport: tr, Timeout: 15 * time.Second} resp, err := client.Do(req) if err != nil { return nil, fmt.Errorf("error making GET request: %w", err) From 92ec5dc86674053da7fb0723b2b8db279bd020de Mon Sep 17 00:00:00 2001 From: Sebastian Soto Date: Tue, 30 Sep 2025 11:00:47 -0400 Subject: [PATCH 2/2] [test] Clean up job failure logs When a job fails, a wall of text is dumped in the test job logs, including the full job spec as well as all events associated with it. This hasn't proven to be useful for debugging, and makes the errors harder to read. All of this information can be found elsewhere in the artifact directory. 
--- test/e2e/network_test.go | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) diff --git a/test/e2e/network_test.go b/test/e2e/network_test.go index 5b1fa49068..bd83f06348 100644 --- a/test/e2e/network_test.go +++ b/test/e2e/network_test.go @@ -979,28 +979,11 @@ func (tc *testContext) waitUntilDeploymentScaled(name string) error { return false, nil }) if err != nil { - events, _ := tc.getPodEvents(name) - return fmt.Errorf("error waiting for deployment %v to scale: %v: %w", deployment, events, err) + return fmt.Errorf("error waiting for deployment %s to scale: %w", deployment.Name, err) } return nil } -// getPodEvents gets all events for any pod with the input in its name. Used for debugging purposes -func (tc *testContext) getPodEvents(name string) ([]v1.Event, error) { - eventList, err := tc.client.K8s.CoreV1().Events(tc.workloadNamespace).List(context.TODO(), metav1.ListOptions{ - FieldSelector: "involvedObject.kind=Pod"}) - if err != nil { - return []v1.Event{}, err - } - var podEvents []v1.Event - for _, event := range eventList.Items { - if strings.Contains(event.InvolvedObject.Name, name) { - podEvents = append(podEvents, event) - } - } - return podEvents, nil -} - // createLinuxCurlerJob creates a linux job to curl a specific endpoint. curl must be present in the container image. 
func (tc *testContext) createLinuxCurlerJob(jobSuffix, endpoint string, continuous bool) (*batchv1.Job, error) { // Retries a failed curl attempt once to avoid flakes @@ -1125,7 +1108,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { for i := 0; i < 60; i++ { job, err = tc.client.K8s.BatchV1().Jobs(tc.workloadNamespace).Get(context.TODO(), name, metav1.GetOptions{}) if err != nil { - return "", err + return "", fmt.Errorf("error getting job with name %s: %w", name, err) } if !slices.ContainsFunc(job.Status.Conditions, func(condition batchv1.JobCondition) bool { return condition.Type == batchv1.JobComplete && condition.Status == v1.ConditionTrue @@ -1143,8 +1126,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { return condition.Type == batchv1.JobSuccessCriteriaMet && condition.Status == v1.ConditionTrue }) { // Job did not succeed, return error - events, _ := tc.getPodEvents(name) - return logs, fmt.Errorf("job %v failed: %v", job, events) + return logs, fmt.Errorf("job %s failed", name) } return logs, nil } @@ -1152,8 +1134,7 @@ func (tc *testContext) waitUntilJobSucceeds(name string) (string, error) { if err != nil { log.Printf("Unable to get logs associated with pod %s: %v", labelSelector, err) } - events, _ := tc.getPodEvents(name) - return "", fmt.Errorf("job %v timed out: %v", job, events) + return "", fmt.Errorf("job %s timed out", name) } // gatherPodLogs writes the logs associated with the label selector of a given pod job or deployment to the Artifacts