Skip to content

Commit 8f06197

Browse files
authored
[RayJob] Add Failure Feedback (log and event) for Failed k8s Creation Task (#2306)
1 parent 5d3bceb commit 8f06197

File tree

2 files changed

+62
-1
lines changed

2 files changed

+62
-1
lines changed

ray-operator/controllers/ray/rayjob_controller.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -528,10 +528,12 @@ func (r *RayJobReconciler) createNewK8sJob(ctx context.Context, rayJobInstance *
528528

529529
// Create the Kubernetes Job
530530
if err := r.Client.Create(ctx, job); err != nil {
531+
logger.Error(err, "Failed to create new Kubernetes Job")
532+
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeWarning, "k8sJobCreationFailed", "Failed to create new Kubernetes Job %s: %v", job.Name, err)
531533
return err
532534
}
533535
logger.Info("Kubernetes Job created", "RayJob", rayJobInstance.Name, "Kubernetes Job", job.Name)
534-
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "Created", "Created Kubernetes Job %s", job.Name)
536+
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, "k8sJobCreationCreated", "Created Kubernetes Job %s", job.Name)
535537
return nil
536538
}
537539

ray-operator/controllers/ray/rayjob_controller_unit_test.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package ray
22

33
import (
44
"context"
5+
"strings"
56
"testing"
67

78
"github.com/stretchr/testify/assert"
@@ -12,10 +13,13 @@ import (
1213
"k8s.io/apimachinery/pkg/types"
1314
"k8s.io/client-go/tools/record"
1415
"k8s.io/utils/ptr"
16+
"sigs.k8s.io/controller-runtime/pkg/client"
1517
clientFake "sigs.k8s.io/controller-runtime/pkg/client/fake"
18+
"sigs.k8s.io/controller-runtime/pkg/client/interceptor"
1619

1720
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
1821
utils "github.com/ray-project/kuberay/ray-operator/controllers/ray/utils"
22+
"github.com/ray-project/kuberay/ray-operator/pkg/client/clientset/versioned/scheme"
1923
)
2024

2125
func TestCreateK8sJobIfNeed(t *testing.T) {
@@ -370,3 +374,58 @@ func TestValidateRayJobSpec(t *testing.T) {
370374
})
371375
assert.Error(t, err, "The RayJob is invalid because the backoffLimit must be a positive integer.")
372376
}
377+
378+
// TestFailedCreatek8sJob checks the failure path of createNewK8sJob: when the
// client's Create call fails, the method must return a non-nil error and
// record an event whose text contains "Failed to create new Kubernetes Job".
//
// The failure is simulated with a fake client whose Create interceptor always
// returns an error; emitted events are captured with a fake recorder.
func TestFailedCreatek8sJob(t *testing.T) {
379+
// Minimal RayJob: only name and namespace are set.
rayJob := &rayv1.RayJob{
380+
ObjectMeta: metav1.ObjectMeta{
381+
Name: "test-rayjob",
382+
Namespace: "default",
383+
},
384+
}
385+
386+
// Pod template handed to createNewK8sJob as the submitter template; a single
// container is enough for this test.
submitterTemplate := corev1.PodTemplateSpec{
387+
ObjectMeta: metav1.ObjectMeta{
388+
Name: "test-submit-pod",
389+
Namespace: "default",
390+
},
391+
Spec: corev1.PodSpec{
392+
Containers: []corev1.Container{
393+
{
394+
Name: "ray-submit",
395+
Image: "rayproject/ray:latest",
396+
},
397+
},
398+
},
399+
}
400+
401+
// Fake client whose Create interceptor ignores its arguments and always
// returns utils.ErrFailedCreateWorkerPod.
// NOTE(review): the sentinel's name suggests it belongs to worker-pod
// creation; it appears to be reused here only as an arbitrary non-nil
// error — confirm that is intended.
fakeClient := clientFake.NewClientBuilder().WithInterceptorFuncs(interceptor.Funcs{
402+
Create: func(_ context.Context, _ client.WithWatch, _ client.Object, _ ...client.CreateOption) error {
403+
return utils.ErrFailedCreateWorkerPod
404+
},
405+
}).WithScheme(scheme.Scheme).Build()
406+
407+
// Fake recorder exposing recorded events on its Events channel.
// NOTE(review): 100 is presumably the channel buffer size — confirm against
// the record.NewFakeRecorder documentation.
recorder := record.NewFakeRecorder(100)
408+
409+
reconciler := &RayJobReconciler{
410+
Client: fakeClient,
411+
Recorder: recorder,
412+
Scheme: scheme.Scheme,
413+
}
414+
415+
err := reconciler.createNewK8sJob(context.Background(), rayJob, submitterTemplate)
416+
417+
// NOTE(review): only non-nil-ness is asserted; the test does not check that
// the returned error is the injected one.
assert.NotNil(t, err, "Expected error due to simulated job creation failure")
418+
419+
var foundFailureEvent bool
420+
// Non-matching events are collected only so they can be printed if the
// final assertion fails.
events := []string{}
421+
// Drain events that are already queued. The len() guard means the receive
// below is only attempted when the channel has a pending event.
for len(recorder.Events) > 0 {
422+
event := <-recorder.Events
423+
if strings.Contains(event, "Failed to create new Kubernetes Job") {
424+
foundFailureEvent = true
425+
// Stop at the first match; any remaining events are left unread.
break
426+
}
427+
events = append(events, event)
428+
}
429+
430+
assert.Truef(t, foundFailureEvent, "Expected event to be generated for job creation failure, got events: %s", strings.Join(events, "\n"))
431+
}

0 commit comments

Comments
 (0)