@@ -90,6 +90,10 @@ def run_rayjob_against_existing_cluster_oauth(self):
9090 ), f"Job submission failed, expected { job_name } , got { submission_result } "
9191 print (f"✅ Successfully submitted RayJob '{ job_name } '" )
9292
93+ # Wait a moment for the RayJob resource to be created in Kubernetes
94+ print ("⏳ Waiting for RayJob resource to be processed by KubeRay operator..." )
95+ sleep (5 )
96+
9397 # Monitor the job status until completion
9498 self .monitor_rayjob_completion (rayjob )
9599
@@ -103,16 +107,21 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
103107
104108 Args:
105109 rayjob: The RayJob instance to monitor
106- timeout: Maximum time to wait in seconds (default: 15 minutes)
110+ timeout: Maximum time to wait in seconds (default: 5 minutes)
107111 """
108112 print (f"⏳ Monitoring RayJob '{ rayjob .name } ' status..." )
109113
110114 elapsed_time = 0
111115 check_interval = 10 # Check every 10 seconds
116+ job_found = False # Track if we've seen the job at least once
112117
113118 while elapsed_time < timeout :
114119 status , ready = rayjob .status (print_to_console = True )
115120
121+ # Track if we've found the job (not UNKNOWN status)
122+ if status != CodeflareRayJobStatus .UNKNOWN :
123+ job_found = True
124+
116125 # Check if job has completed (either successfully or failed)
117126 if status == CodeflareRayJobStatus .COMPLETE :
118127 print (f"✅ RayJob '{ rayjob .name } ' completed successfully!" )
@@ -122,15 +131,30 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 300):
122131 elif status == CodeflareRayJobStatus .RUNNING :
123132 print (f"🏃 RayJob '{ rayjob .name } ' is still running..." )
124133 elif status == CodeflareRayJobStatus .UNKNOWN :
125- print (f"❓ RayJob '{ rayjob .name } ' status is unknown" )
134+ if job_found :
135+ # If we've seen the job before but now it's unknown, that's concerning
136+ print (
137+ f"⚠️ RayJob '{ rayjob .name } ' status became unknown after being found"
138+ )
139+ else :
140+ # Job hasn't appeared yet, this is normal initially
141+ print (
142+ f"⏳ Waiting for RayJob '{ rayjob .name } ' to appear in Kubernetes..."
143+ )
126144
127145 # Wait before next check
128146 sleep (check_interval )
129147 elapsed_time += check_interval
130148
131149 # If we reach here, the job has timed out
132150 final_status , _ = rayjob .status (print_to_console = True )
133- raise TimeoutError (
134- f"⏰ RayJob '{ rayjob .name } ' did not complete within { timeout } seconds. "
135- f"Final status: { final_status } "
136- )
151+ if not job_found :
152+ raise TimeoutError (
153+ f"⏰ RayJob '{ rayjob .name } ' was never found in Kubernetes within { timeout } seconds. "
154+ f"Check if the RayJob resource was created successfully."
155+ )
156+ else :
157+ raise TimeoutError (
158+ f"⏰ RayJob '{ rayjob .name } ' did not complete within { timeout } seconds. "
159+ f"Final status: { final_status } "
160+ )
0 commit comments