@@ -107,11 +107,13 @@ def assert_rayjob_submit_against_existing_cluster(
107107 print (f"✅ Successfully submitted RayJob '{ job_name } ' against existing cluster" )
108108
109109 # Monitor the job status until completion
110- self .monitor_rayjob_completion (rayjob , timeout = 900 )
110+ self .monitor_rayjob_completion (
111+ rayjob , timeout = 360
112+ ) # 6 minutes for faster debugging
111113
112114 print (f"✅ RayJob '{ job_name } ' completed successfully against existing cluster!" )
113115
114- def monitor_rayjob_completion (self , rayjob : RayJob , timeout : int = 900 ):
116+ def monitor_rayjob_completion (self , rayjob : RayJob , timeout : int = 360 ):
115117 """
116118 Monitor a RayJob until it completes or fails.
117119
@@ -224,7 +226,48 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
224226 elif status == CodeflareRayJobStatus .RUNNING :
225227 print (f"🏃 RayJob '{ rayjob .name } ' is still running..." )
226228 elif status == CodeflareRayJobStatus .UNKNOWN :
227- print (f"❓ RayJob '{ rayjob .name } ' status is unknown" )
229+ print (f"❓ RayJob '{ rayjob .name } ' status is unknown - investigating..." )
230+
231+ # If we've been in Unknown status for too long, get debug info
232+ if elapsed_time > 120 : # After 2 minutes of Unknown status
233+ print (
234+ f"⚠️ Job has been in Unknown status for { elapsed_time } s - getting debug info..."
235+ )
236+
237+ # Get detailed YAML to understand why status is Unknown
238+ import subprocess
239+
240+ try :
241+ result = subprocess .run (
242+ [
243+ "kubectl" ,
244+ "get" ,
245+ "rayjobs" ,
246+ "-n" ,
247+ self .namespace ,
248+ rayjob .name ,
249+ "-o" ,
250+ "yaml" ,
251+ ],
252+ capture_output = True ,
253+ text = True ,
254+ timeout = 10 ,
255+ )
256+ if result .returncode == 0 :
257+ print (
258+ f"📋 RayJob YAML (Unknown status debug):\n { result .stdout } "
259+ )
260+ except Exception as e :
261+ print (f"❌ Error getting debug info: { e } " )
262+
263+ # Break out of Unknown status loop after 4 minutes
264+ if elapsed_time > 240 :
265+ print (
266+ f"⏰ Breaking out of Unknown status loop after { elapsed_time } s"
267+ )
268+ raise AssertionError (
269+ f"❌ RayJob '{ rayjob .name } ' stuck in Unknown status for too long"
270+ )
228271
229272 # Wait before next check
230273 sleep (check_interval )
0 commit comments