@@ -129,6 +129,11 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
129
129
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
130
130
}
131
131
132
+ if err := validateRayJobStatus (rayJobInstance ); err != nil {
133
+ logger .Error (err , "The RayJob status is invalid" )
134
+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
135
+ }
136
+
132
137
// Please do NOT modify `originalRayJobInstance` in the following code.
133
138
originalRayJobInstance := rayJobInstance .DeepCopy ()
134
139
@@ -177,15 +182,30 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request)
177
182
rayJobInstance .Status .DashboardURL = clientURL
178
183
}
179
184
185
+ if rayJobInstance .Spec .SubmissionMode == rayv1 .UserMode {
186
+ logger .Info ("SubmissionMode is UserMode and the RayCluster is created. Transition the status from `Initializing` to `Waiting`." )
187
+ rayJobInstance .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusWaiting
188
+ break
189
+ }
190
+
180
191
if rayJobInstance .Spec .SubmissionMode == rayv1 .K8sJobMode {
181
192
if err := r .createK8sJobIfNeed (ctx , rayJobInstance , rayClusterInstance ); err != nil {
182
193
return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, err
183
194
}
184
195
}
185
196
186
- logger .Info ("Both RayCluster and the submitter K8s Job are created. Transition the status from `Initializing` to `Running`." ,
197
+ logger .Info ("Both RayCluster and the submitter K8s Job are created. Transition the status from `Initializing` to `Running`." , "SubmissionMode" , rayJobInstance . Spec . SubmissionMode ,
187
198
"RayJob" , rayJobInstance .Name , "RayCluster" , rayJobInstance .Status .RayClusterName )
188
199
rayJobInstance .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusRunning
200
+ case rayv1 .JobDeploymentStatusWaiting :
201
+ // Try to get the Ray job id from the Ray job annotations.
202
+ rayJobId , found := rayJobInstance .ObjectMeta .Annotations [utils .RayJobSubmissionIdLabelKey ]
203
+ logger .Info ("Get Ray job id from the Ray job annotations" , "RayJobId" , rayJobId , "Found" , found )
204
+ if ! found {
205
+ return ctrl.Result {RequeueAfter : RayJobDefaultRequeueDuration }, nil
206
+ }
207
+ rayJobInstance .Status .JobId = rayJobId
208
+ rayJobInstance .Status .JobDeploymentStatus = rayv1 .JobDeploymentStatusRunning
189
209
case rayv1 .JobDeploymentStatusRunning :
190
210
if shouldUpdate := r .updateStatusToSuspendingIfNeeded (ctx , rayJobInstance ); shouldUpdate {
191
211
break
@@ -606,6 +626,7 @@ func (r *RayJobReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurren
606
626
// This function is the sole place where `JobDeploymentStatusInitializing` is defined. It initializes `Status.JobId` and `Status.RayClusterName`
607
627
// prior to job submissions and RayCluster creations. This is used to avoid duplicate job submissions and cluster creations. In addition, this
608
628
// function also sets `Status.StartTime` to support `ActiveDeadlineSeconds`.
629
+ // When SubmissionMode is not UserMode, this function also populates `Status.JobId`: it copies `Spec.JobId` if set and generates an ID otherwise.
609
630
func (r * RayJobReconciler ) initRayJobStatusIfNeed (ctx context.Context , rayJob * rayv1.RayJob ) error {
610
631
logger := ctrl .LoggerFrom (ctx )
611
632
shouldUpdateStatus := rayJob .Status .JobId == "" || rayJob .Status .RayClusterName == "" || rayJob .Status .JobStatus == ""
@@ -615,7 +636,7 @@ func (r *RayJobReconciler) initRayJobStatusIfNeed(ctx context.Context, rayJob *r
615
636
return nil
616
637
}
617
638
618
- if rayJob .Status .JobId == "" {
639
+ if rayJob .Spec . SubmissionMode != rayv1 . UserMode && rayJob . Status .JobId == "" {
619
640
if rayJob .Spec .JobId != "" {
620
641
rayJob .Status .JobId = rayJob .Spec .JobId
621
642
} else {
@@ -811,3 +832,11 @@ func validateRayJobSpec(rayJob *rayv1.RayJob) error {
811
832
}
812
833
return nil
813
834
}
835
+
836
+ func validateRayJobStatus (rayJob * rayv1.RayJob ) error {
837
+ if rayJob .Status .JobDeploymentStatus == rayv1 .JobDeploymentStatusWaiting && rayJob .Spec .SubmissionMode != rayv1 .UserMode {
838
+ return fmt .Errorf ("invalid RayJob State: JobDeploymentStatus cannot be `Waiting` when SubmissionMode is not UserMode" )
839
+ }
840
+
841
+ return nil
842
+ }
0 commit comments