Skip to content

Commit 5a5534f

Browse files
authored
[RayService] Create k8s events after creating/updating k8s resources (#2873)
Signed-off-by: Rueian <[email protected]>
1 parent b2a701d commit 5a5534f

File tree

3 files changed

+35
-10
lines changed

3 files changed

+35
-10
lines changed

ray-operator/controllers/ray/rayservice_controller.go

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,10 @@ func (r *RayServiceReconciler) Reconcile(ctx context.Context, request ctrl.Reque
135135
// Check both active and pending Ray clusters to see if the head Pod is ready to serve requests.
136136
// This is important to ensure the reliability of the serve service because the head Pod cannot
137137
// rely on readiness probes to determine serve readiness.
138-
if err := r.updateHeadPodServeLabel(ctx, activeRayClusterInstance, rayServiceInstance.Spec.ExcludeHeadPodFromServeSvc); err != nil {
138+
if err := r.updateHeadPodServeLabel(ctx, rayServiceInstance, activeRayClusterInstance, rayServiceInstance.Spec.ExcludeHeadPodFromServeSvc); err != nil {
139139
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
140140
}
141-
if err := r.updateHeadPodServeLabel(ctx, pendingRayClusterInstance, rayServiceInstance.Spec.ExcludeHeadPodFromServeSvc); err != nil {
141+
if err := r.updateHeadPodServeLabel(ctx, rayServiceInstance, pendingRayClusterInstance, rayServiceInstance.Spec.ExcludeHeadPodFromServeSvc); err != nil {
142142
return ctrl.Result{RequeueAfter: ServiceDefaultRequeueDuration}, err
143143
}
144144

@@ -517,7 +517,7 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
517517
if activeRayCluster, err = constructRayClusterForRayService(rayServiceInstance, activeRayCluster.Name, r.Scheme); err != nil {
518518
return nil, nil, err
519519
}
520-
err = r.updateRayClusterInstance(ctx, activeRayCluster)
520+
err = r.updateRayClusterInstance(ctx, rayServiceInstance, activeRayCluster)
521521
return activeRayCluster, pendingRayCluster, err
522522
}
523523

@@ -527,7 +527,7 @@ func (r *RayServiceReconciler) reconcileRayCluster(ctx context.Context, rayServi
527527
if pendingRayCluster, err = constructRayClusterForRayService(rayServiceInstance, pendingRayCluster.Name, r.Scheme); err != nil {
528528
return nil, nil, err
529529
}
530-
err = r.updateRayClusterInstance(ctx, pendingRayCluster)
530+
err = r.updateRayClusterInstance(ctx, rayServiceInstance, pendingRayCluster)
531531
return activeRayCluster, pendingRayCluster, err
532532
}
533533

@@ -567,8 +567,10 @@ func (r *RayServiceReconciler) cleanUpRayClusterInstance(ctx context.Context, ra
567567
if reasonForDeletion != "" {
568568
logger.Info("reconcileRayCluster", "delete Ray cluster", rayClusterInstance.Name, "reason", reasonForDeletion)
569569
if err := r.Delete(ctx, &rayClusterInstance, client.PropagationPolicy(metav1.DeletePropagationBackground)); err != nil {
570+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToDeleteRayCluster), "Failed to delete the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
570571
return err
571572
}
573+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.DeletedRayCluster), "Deleted the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
572574
}
573575
}
574576
}
@@ -694,7 +696,7 @@ func shouldPrepareNewCluster(ctx context.Context, rayServiceInstance *rayv1.RayS
694696
}
695697

696698
// updateRayClusterInstance updates the RayCluster instance.
697-
func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, rayClusterInstance *rayv1.RayCluster) error {
699+
func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster) error {
698700
logger := ctrl.LoggerFrom(ctx)
699701
logger.Info("updateRayClusterInstance", "Name", rayClusterInstance.Name, "Namespace", rayClusterInstance.Namespace)
700702

@@ -721,7 +723,13 @@ func (r *RayServiceReconciler) updateRayClusterInstance(ctx context.Context, ray
721723
currentRayCluster.Annotations = rayClusterInstance.Annotations
722724

723725
// Update the RayCluster
724-
return r.Update(ctx, currentRayCluster)
726+
err = r.Update(ctx, currentRayCluster)
727+
if err != nil {
728+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateRayCluster), "Failed to update the RayCluster %s/%s: %v", currentRayCluster.Namespace, currentRayCluster.Name, err)
729+
} else {
730+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Updated the RayCluster %s/%s", currentRayCluster.Namespace, currentRayCluster.Name)
731+
}
732+
return err
725733
}
726734

727735
// createRayClusterInstance deletes the old RayCluster instance if exists. Only when no existing RayCluster, create a new RayCluster instance.
@@ -743,9 +751,11 @@ func (r *RayServiceReconciler) createRayClusterInstance(ctx context.Context, ray
743751
logger.Info("Ray cluster already exists, config changes. Need to recreate. Delete the pending one now.", "key", rayClusterKey.String(), "rayClusterInstance.Spec", rayClusterInstance.Spec, "rayServiceInstance.Spec.RayClusterSpec", rayServiceInstance.Spec.RayClusterSpec)
744752
delErr := r.Delete(ctx, rayClusterInstance, client.PropagationPolicy(metav1.DeletePropagationBackground))
745753
if delErr == nil {
754+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.DeletedRayCluster), "Deleted the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
746755
// Go to next loop and check if the ray cluster is deleted.
747756
return nil, nil
748757
} else if !errors.IsNotFound(delErr) {
758+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToDeleteRayCluster), "Failed to delete the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, delErr)
749759
return nil, delErr
750760
}
751761
// if error is `not found`, then continue.
@@ -760,10 +770,11 @@ func (r *RayServiceReconciler) createRayClusterInstance(ctx context.Context, ray
760770
return nil, err
761771
}
762772
if err = r.Create(ctx, rayClusterInstance); err != nil {
773+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateRayCluster), "Failed to create the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
763774
return nil, err
764775
}
765776
logger.Info("created rayCluster for rayService", "rayCluster", rayClusterInstance)
766-
777+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedRayCluster), "Created the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
767778
return rayClusterInstance, nil
768779
}
769780

@@ -980,8 +991,10 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService
980991
oldSvc.Spec = *newSvc.Spec.DeepCopy()
981992
logger.Info("Update Kubernetes Service", "serviceType", serviceType)
982993
if updateErr := r.Update(ctx, oldSvc); updateErr != nil {
994+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateService), "Failed to update the service %s/%s, %v", oldSvc.Namespace, oldSvc.Name, updateErr)
983995
return nil, updateErr
984996
}
997+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedService), "Updated the service %s/%s", oldSvc.Namespace, oldSvc.Name)
985998
// Return the updated service.
986999
return oldSvc, nil
9871000
} else if errors.IsNotFound(err) {
@@ -990,8 +1003,10 @@ func (r *RayServiceReconciler) reconcileServices(ctx context.Context, rayService
9901003
return nil, err
9911004
}
9921005
if err := r.Create(ctx, newSvc); err != nil {
1006+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToCreateService), "Failed to create the service %s/%s, %v", newSvc.Namespace, newSvc.Name, err)
9931007
return nil, err
9941008
}
1009+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.CreatedService), "Created the service %s/%s", newSvc.Namespace, newSvc.Name)
9951010
return newSvc, nil
9961011
}
9971012
return nil, err
@@ -1045,13 +1060,15 @@ func (r *RayServiceReconciler) reconcileServe(ctx context.Context, rayServiceIns
10451060

10461061
if shouldUpdate {
10471062
if err = r.updateServeDeployment(ctx, rayServiceInstance, rayDashboardClient, rayClusterInstance.Name); err != nil {
1063+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateServeApplications), "Failed to update serve applications to the RayCluster %s/%s: %v", rayClusterInstance.Namespace, rayClusterInstance.Name, err)
10481064
return false, serveApplications, err
10491065
}
1066+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedServeApplications), "Updated serve applications to the RayCluster %s/%s", rayClusterInstance.Namespace, rayClusterInstance.Name)
10501067
}
10511068
return isReady, serveApplications, nil
10521069
}
10531070

1054-
func (r *RayServiceReconciler) updateHeadPodServeLabel(ctx context.Context, rayClusterInstance *rayv1.RayCluster, excludeHeadPodFromServeSvc bool) error {
1071+
func (r *RayServiceReconciler) updateHeadPodServeLabel(ctx context.Context, rayServiceInstance *rayv1.RayService, rayClusterInstance *rayv1.RayCluster, excludeHeadPodFromServeSvc bool) error {
10551072
// `updateHeadPodServeLabel` updates the head Pod's serve label based on the health status of the proxy actor.
10561073
// If `excludeHeadPodFromServeSvc` is true, the head Pod will not be used to serve requests, regardless of proxy actor health.
10571074
// If `excludeHeadPodFromServeSvc` is false, the head Pod's serve label will be set based on the health check result.
@@ -1092,8 +1109,10 @@ func (r *RayServiceReconciler) updateHeadPodServeLabel(ctx context.Context, rayC
10921109
if oldLabel != newLabel {
10931110
headPod.Labels[utils.RayClusterServingServiceLabelKey] = newLabel
10941111
if updateErr := r.Update(ctx, headPod); updateErr != nil {
1112+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateHeadPodServeLabel), "Failed to update the serve label to %q for the Head Pod %s/%s: %v", newLabel, headPod.Namespace, headPod.Name, updateErr)
10951113
return updateErr
10961114
}
1115+
r.Recorder.Eventf(rayServiceInstance, corev1.EventTypeNormal, string(utils.UpdatedHeadPodServeLabel), "Updated the serve label to %q for the Head Pod %s/%s", newLabel, headPod.Namespace, headPod.Name)
10971116
}
10981117

10991118
return nil

ray-operator/controllers/ray/rayservice_controller_unit_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -720,7 +720,7 @@ func TestLabelHeadPodForServeStatus(t *testing.T) {
720720
},
721721
}
722722

723-
err := r.updateHeadPodServeLabel(ctx, &cluster, tc.excludeHeadPodFromServeSvc)
723+
err := r.updateHeadPodServeLabel(ctx, &rayv1.RayService{}, &cluster, tc.excludeHeadPodFromServeSvc)
724724
assert.NoError(t, err)
725725
// Get latest headPod status
726726
headPod, err = common.GetRayClusterHeadPod(ctx, r, &cluster)

ray-operator/controllers/ray/utils/constant.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,11 @@ const (
271271
FailedToUpdateRayCluster K8sEventType = "FailedToUpdateRayCluster"
272272

273273
// RayService event list
274-
InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec"
274+
InvalidRayServiceSpec K8sEventType = "InvalidRayServiceSpec"
275+
UpdatedHeadPodServeLabel K8sEventType = "UpdatedHeadPodServeLabel"
276+
UpdatedServeApplications K8sEventType = "UpdatedServeApplications"
277+
FailedToUpdateHeadPodServeLabel K8sEventType = "FailedToUpdateHeadPodServeLabel"
278+
FailedToUpdateServeApplications K8sEventType = "FailedToUpdateServeApplications"
275279

276280
// Generic Pod event list
277281
DeletedPod K8sEventType = "DeletedPod"
@@ -288,7 +292,9 @@ const (
288292

289293
// Service event list
290294
CreatedService K8sEventType = "CreatedService"
295+
UpdatedService K8sEventType = "UpdatedService"
291296
FailedToCreateService K8sEventType = "FailedToCreateService"
297+
FailedToUpdateService K8sEventType = "FailedToUpdateService"
292298

293299
// ServiceAccount event list
294300
CreatedServiceAccount K8sEventType = "CreatedServiceAccount"

0 commit comments

Comments
 (0)