Skip to content

Commit c35e929

Browse files
committed
test: integration: nrt: dump logs on failure
Some tests want to look into pod events to learn about the scheduler decision. Add dump of events on failure to help troubleshooting the flakes. Signed-off-by: Francesco Romani <[email protected]>
1 parent 1652401 commit c35e929

File tree

3 files changed

+81
-46
lines changed

3 files changed

+81
-46
lines changed

test/integration/noderesourcetopology_cache_test.go

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -338,8 +338,8 @@ func TestTopologyCachePluginWithoutUpdates(t *testing.T) {
338338
string(corev1.ResourceCPU): "16",
339339
string(corev1.ResourceMemory): "24Gi",
340340
},
341-
expectedNode: "fake-node-cache-1",
342341
schedulerName: discardReservedSchedulerName,
342+
expectedNode: "fake-node-cache-1",
343343
},
344344
{
345345
podName: "nrt-pod-2000",
@@ -508,7 +508,7 @@ func TestTopologyCachePluginWithoutUpdates(t *testing.T) {
508508

509509
defer func() {
510510
cleanupTest(t, testCtx)
511-
klog.Infof("test environment cleaned up")
511+
klog.Infof("test environment %q cleaned up", tt.name)
512512
}()
513513

514514
if err := createNodesFromNodeResourceTopologies(cs, testCtx.Ctx, tt.nodeResourceTopologies); err != nil {
@@ -554,8 +554,10 @@ func TestTopologyCachePluginWithoutUpdates(t *testing.T) {
554554
if p.isDelete {
555555
var err error
556556
klog.Infof("Waiting before to delete Pod %q", p.podName)
557-
err = podIsScheduled(1*time.Second, 20, cs, ns, p.podName)
557+
updatedPod, err := podIsScheduled(1*time.Second, 20, cs, ns, p.podName)
558558
if err != nil {
559+
// we need more context, but we don't want to clutter the logs
560+
t.Logf("%s: pod %s/%s to be scheduled, error: %v\nstatus=%s", tt.name, p.pod.Namespace, p.pod.Name, err, formatObject(updatedPod.Status))
559561
t.Errorf("Pod %q to be scheduled, error: %v", p.pod.Name, err)
560562
}
561563

@@ -579,15 +581,17 @@ func TestTopologyCachePluginWithoutUpdates(t *testing.T) {
579581
}
580582

581583
var action string = "scheduled"
582-
var checkPod func(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) error = podIsScheduled
584+
var checkPod func(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) (*corev1.Pod, error) = podIsScheduled
583585
if p.expectedNode == "" {
584586
action = "kept pending"
585587
checkPod = podIsPending
586588
}
587589

588-
err = checkPod(1*time.Second, 20, cs, p.pod.Namespace, p.pod.Name)
590+
updatedPod, err := checkPod(1*time.Second, 20, cs, p.pod.Namespace, p.pod.Name)
589591
if err != nil {
590-
t.Errorf("Pod %q to be %s, error: %v", p.pod.Name, action, err)
592+
// we need more context, but we don't want to clutter the logs
593+
t.Logf("%s: pod %s/%s to be %s, error: %v\nstatus=%s", tt.name, p.pod.Namespace, p.pod.Name, action, err, formatObject(updatedPod.Status))
594+
t.Errorf("Pod %s/%s to be %s, error: %v", p.pod.Namespace, p.pod.Name, action, err)
591595
}
592596
klog.Infof("Pod %v %s", p.pod.Name, action)
593597
}
@@ -604,7 +608,7 @@ func TestTopologyCachePluginWithoutUpdates(t *testing.T) {
604608
}
605609

606610
if !podMatchesExpectedNode(p.pod.Namespace, p.pod.Name, nodeName, p.expectedNode) {
607-
t.Errorf("misplaced pod: %q", p.pod.Name)
611+
t.Errorf("misplaced pod: %q got %q expected %q", p.pod.Name, nodeName, p.expectedNode)
608612
}
609613
}
610614

test/integration/noderesourcetopology_test.go

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,14 +18,14 @@ package integration
1818

1919
import (
2020
"context"
21-
"encoding/json"
2221
"fmt"
2322
"strings"
2423
"testing"
2524
"time"
2625

2726
v1 "k8s.io/api/core/v1"
2827
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
28+
"k8s.io/apimachinery/pkg/fields"
2929
"k8s.io/apimachinery/pkg/util/uuid"
3030
"k8s.io/apimachinery/pkg/util/wait"
3131
"k8s.io/client-go/dynamic"
@@ -1994,21 +1994,26 @@ func TestTopologyMatchPlugin(t *testing.T) {
19941994
// if tt.expectedNodes == 0 we don't expect the pod to get scheduled
19951995
} else {
19961996
// wait for the pod scheduling to failed.
1997+
var err error
1998+
var events []v1.Event
19971999
if err := wait.Poll(5*time.Second, 20*time.Second, func() (bool, error) {
1998-
events, err := podFailedScheduling(cs, ns, p.Name)
2000+
events, err = getPodEvents(cs, ns, p.Name)
19992001
if err != nil {
20002002
// This could be a connection error, so we want to retry.
20012003
klog.ErrorS(err, "Failed check pod scheduling status for pod", "pod", klog.KRef(ns, p.Name))
20022004
return false, nil
20032005
}
2004-
for _, e := range events {
2005-
if strings.Contains(e.Message, tt.errMsg) {
2006+
candidateEvents := filterPodFailedSchedulingEvents(events)
2007+
for _, ce := range candidateEvents {
2008+
if strings.Contains(ce.Message, tt.errMsg) {
20062009
return true, nil
20072010
}
2008-
klog.Warningf("Pod failed but error message does not contain substring: %q; got %q instead", tt.errMsg, e.Message)
2011+
klog.Warningf("Pod failed but error message does not contain substring: %q; got %q instead", tt.errMsg, ce.Message)
20092012
}
20102013
return false, nil
20112014
}); err != nil {
2015+
// we need more context to troubleshoot, but let's not clutter the actual error
2016+
t.Logf("pod %q scheduling should failed with error: %v got %v events:\n%s", p.Name, tt.errMsg, err, formatEvents(events))
20122017
t.Errorf("pod %q scheduling should failed, error: %v", p.Name, err)
20132018
}
20142019
}
@@ -2146,29 +2151,42 @@ func parseTestUserEntry(entries []nrtTestUserEntry, ns string) []nrtTestEntry {
21462151
return teList
21472152
}
21482153

2149-
func podFailedScheduling(c clientset.Interface, podNamespace, podName string) ([]v1.Event, error) {
2150-
var failedSchedulingEvents []v1.Event
2151-
opt := metav1.ListOptions{
2152-
FieldSelector: fmt.Sprintf("involvedObject.name=%s", podName),
2153-
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
2154+
func getPodEvents(c clientset.Interface, podNamespace, podName string) ([]v1.Event, error) {
2155+
opts := metav1.ListOptions{
2156+
FieldSelector: fields.SelectorFromSet(map[string]string{
2157+
"involvedObject.name": podName,
2158+
"involvedObject.namespace": podNamespace,
2159+
// TODO: use uid
2160+
}).String(),
2161+
TypeMeta: metav1.TypeMeta{Kind: "Pod"},
21542162
}
2155-
events, err := c.CoreV1().Events(podNamespace).List(context.TODO(), opt)
2163+
evs, err := c.CoreV1().Events(podNamespace).List(context.TODO(), opts)
21562164
if err != nil {
2157-
return failedSchedulingEvents, err
2165+
return nil, err
21582166
}
2167+
return evs.Items, nil
2168+
}
21592169

2160-
for _, e := range events.Items {
2161-
if e.Reason == "FailedScheduling" {
2162-
failedSchedulingEvents = append(failedSchedulingEvents, e)
2170+
func filterPodFailedSchedulingEvents(events []v1.Event) []v1.Event {
2171+
var failedSchedulingEvents []v1.Event
2172+
for _, ev := range events {
2173+
if ev.Reason == "FailedScheduling" {
2174+
failedSchedulingEvents = append(failedSchedulingEvents, ev)
21632175
}
21642176
}
2165-
return failedSchedulingEvents, nil
2177+
return failedSchedulingEvents
21662178
}
21672179

2168-
func formatObject(obj interface{}) string {
2169-
bytes, err := json.Marshal(obj)
2170-
if err != nil {
2171-
return fmt.Sprintf("<ERROR: %s>", err)
2180+
func formatEvents(events []v1.Event) string {
2181+
var sb strings.Builder
2182+
for idx, ev := range events {
2183+
fmt.Fprintf(&sb, "%02d - %s\n", idx, eventToString(ev))
21722184
}
2173-
return string(bytes)
2185+
return sb.String()
2186+
}
2187+
2188+
func eventToString(ev v1.Event) string {
2189+
return fmt.Sprintf("type=%q action=%q message=%q reason=%q reportedBy={%s/%s}",
2190+
ev.Type, ev.Action, ev.Message, ev.Reason, ev.ReportingController, ev.ReportingInstance,
2191+
)
21742192
}

test/integration/nrtutils.go

Lines changed: 31 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package integration
1818

1919
import (
2020
"context"
21+
"encoding/json"
2122
"fmt"
2223
"sort"
2324
"strings"
@@ -202,31 +203,35 @@ func (n *nrtWrapper) Obj() *topologyv1alpha2.NodeResourceTopology {
202203
return &n.nrt
203204
}
204205

205-
func podIsScheduled(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) error {
206-
return wait.Poll(interval, time.Duration(times)*interval, func() (bool, error) {
207-
return podScheduled(cs, podNamespace, podName), nil
206+
func podIsScheduled(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) (*corev1.Pod, error) {
207+
var err error
208+
var pod *corev1.Pod
209+
waitErr := wait.Poll(interval, time.Duration(times)*interval, func() (bool, error) {
210+
pod, err = cs.CoreV1().Pods(podNamespace).Get(context.TODO(), podName, metav1.GetOptions{})
211+
if err != nil {
212+
// This could be a connection error so we want to retry.
213+
klog.ErrorS(err, "Failed to get pod", "pod", klog.KRef(podNamespace, podName))
214+
return false, err
215+
}
216+
return pod.Spec.NodeName != "", nil
208217
})
218+
return pod, waitErr
209219
}
210220

211-
func podIsPending(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) error {
221+
func podIsPending(interval time.Duration, times int, cs clientset.Interface, podNamespace, podName string) (*corev1.Pod, error) {
222+
var err error
223+
var pod *corev1.Pod
212224
for attempt := 0; attempt < times; attempt++ {
213-
if !podPending(cs, podNamespace, podName) {
214-
return fmt.Errorf("pod %s/%s not pending", podNamespace, podName)
225+
pod, err = cs.CoreV1().Pods(podNamespace).Get(context.TODO(), podName, metav1.GetOptions{})
226+
if err != nil {
227+
return pod, fmt.Errorf("pod %s/%s not pending: %w", podNamespace, podName, err)
228+
}
229+
if pod.Spec.NodeName != "" {
230+
return pod, fmt.Errorf("pod %s/%s not pending: bound to %q", podNamespace, podName, pod.Spec.NodeName)
215231
}
216232
time.Sleep(interval)
217233
}
218-
return nil
219-
}
220-
221-
// podPending returns true if a node is not assigned to the given pod.
222-
func podPending(c clientset.Interface, podNamespace, podName string) bool {
223-
pod, err := c.CoreV1().Pods(podNamespace).Get(context.TODO(), podName, metav1.GetOptions{})
224-
if err != nil {
225-
// This could be a connection error so we want to retry.
226-
klog.ErrorS(err, "Failed to get pod", "pod", klog.KRef(podNamespace, podName))
227-
return false
228-
}
229-
return pod.Spec.NodeName == ""
234+
return pod, nil
230235
}
231236

232237
func accumulateResourcesToCapacity(nrt topologyv1alpha2.NodeResourceTopology) corev1.ResourceList {
@@ -301,3 +306,11 @@ func makeTestFullyAvailableNRTs() []*topologyv1alpha2.NodeResourceTopology {
301306
}).Obj(),
302307
)
303308
}
309+
310+
func formatObject(obj interface{}) string {
311+
bytes, err := json.Marshal(obj)
312+
if err != nil {
313+
return fmt.Sprintf("<ERROR: %s>", err)
314+
}
315+
return string(bytes)
316+
}

0 commit comments

Comments
 (0)