@@ -46,11 +46,12 @@ def run_rayjob_against_existing_cluster_kind(
                 num_workers=1,
                 head_cpu_requests="500m",
                 head_cpu_limits="500m",
-                worker_cpu_requests=2,
-                worker_cpu_limits=4,
-                worker_memory_requests=4,
-                worker_memory_limits=8,
+                worker_cpu_requests="500m",
+                worker_cpu_limits=1,
+                worker_memory_requests=1,
+                worker_memory_limits=4,
                 worker_extended_resource_requests={gpu_resource_name: number_of_gpus},
+                image="rayproject/ray:2.47.1",
                 write_to_file=True,
                 verify_tls=False,
             )
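This hunk trims the worker CPU/memory requests and limits (presumably so the workers schedule comfortably on a Kind node) and pins the Ray image. For readability, a consolidated sketch of the resulting configuration is shown below; the surrounding `Cluster(ClusterConfiguration(...))` call is the usual codeflare_sdk pattern but is not part of this hunk, and the name, namespace, and GPU values are placeholders.

```python
# Sketch of the cluster configuration after this change. Only the keyword
# arguments visible in the hunk are taken from the diff; name/namespace/GPU
# values are placeholders, and the Cluster(ClusterConfiguration(...)) wrapper
# is assumed context.
from codeflare_sdk import Cluster, ClusterConfiguration

cluster = Cluster(
    ClusterConfiguration(
        name="existing-cluster",        # placeholder
        namespace="default",            # placeholder
        num_workers=1,
        head_cpu_requests="500m",
        head_cpu_limits="500m",
        worker_cpu_requests="500m",     # was 2
        worker_cpu_limits=1,            # was 4
        worker_memory_requests=1,       # was 4
        worker_memory_limits=4,         # was 8
        worker_extended_resource_requests={"nvidia.com/gpu": 0},  # placeholder for {gpu_resource_name: number_of_gpus}
        image="rayproject/ray:2.47.1",  # Ray image pinned by this hunk
        write_to_file=True,
        verify_tls=False,
    )
)
```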
@@ -95,8 +96,7 @@ def assert_rayjob_submit_against_existing_cluster(
                 "pip": "./tests/e2e/mnist_pip_requirements.txt",
                 "env_vars": get_setup_env_variables(ACCELERATOR=accelerator),
             },
-            shutdown_after_job_finishes=False,
-            # entrypoint_num_gpus=number_of_gpus if number_of_gpus > 0 else None,  # Temporarily disabled to test basic functionality
+            shutdown_after_job_finishes=False,  # Don't shutdown the existing cluster
         )

         # Submit the job
@@ -107,16 +107,13 @@ def assert_rayjob_submit_against_existing_cluster(
         print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")

         # Monitor the job status until completion
-        self.monitor_rayjob_completion(
-            rayjob, timeout=360
-        )  # 6 minutes for faster debugging
+        self.monitor_rayjob_completion(rayjob, timeout=900)

         print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")

-    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
+    def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
         """
         Monitor a RayJob until it completes or fails.
-
         Args:
             rayjob: The RayJob instance to monitor
             timeout: Maximum time to wait in seconds (default: 15 minutes)
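Only fragments of the monitoring loop appear in the hunks. For orientation, a minimal sketch of the shape `monitor_rayjob_completion` appears to have is given below; the import paths, the `rayjob.status()` signature, and the `check_interval` value are assumptions, not taken from this diff.

```python
# Minimal sketch of the polling loop (assumed shape; only its branch bodies are
# visible in the diff). CodeflareRayJobStatus is imported elsewhere in the test
# module; the status() call is an assumed SDK API.
from time import sleep

def monitor_rayjob_completion(self, rayjob, timeout: int = 900):
    """Poll the RayJob until it completes, fails, or the timeout expires."""
    check_interval = 10  # seconds between status checks (assumed value)
    elapsed_time = 0
    while elapsed_time < timeout:
        status, _ = rayjob.status(print_to_console=False)  # assumed SDK call
        if status == CodeflareRayJobStatus.COMPLETE:
            print(f"✅ RayJob '{rayjob.name}' completed successfully!")
            return
        elif status == CodeflareRayJobStatus.FAILED:
            raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
        # RUNNING / UNKNOWN: keep polling until the timeout is reached
        sleep(check_interval)
        elapsed_time += check_interval
    raise TimeoutError(f"RayJob '{rayjob.name}' did not finish within {timeout}s")
```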
@@ -134,209 +131,11 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
                 print(f"✅ RayJob '{rayjob.name}' completed successfully!")
                 return
             elif status == CodeflareRayJobStatus.FAILED:
-                # Get more details about the failure
-                print(f"❌ RayJob '{rayjob.name}' failed! Investigating...")
-
-                # Try to get failure details using kubectl
-                import subprocess
-
-                try:
-                    result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "rayjobs",
-                            "-n",
-                            self.namespace,
-                            rayjob.name,
-                            "-o",
-                            "yaml",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if result.returncode == 0:
-                        print(f"📋 RayJob YAML details:\n{result.stdout}")
-
-                    # Try to get job submitter pod logs (these pods may be cleaned up quickly)
-                    pod_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "pods",
-                            "-n",
-                            self.namespace,
-                            "-l",
-                            f"ray.io/rayjob={rayjob.name}",
-                            "-o",
-                            "name",
-                            "--sort-by=.metadata.creationTimestamp",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if pod_result.returncode == 0 and pod_result.stdout.strip():
-                        pod_name = pod_result.stdout.strip().split("/")[-1]
-                        log_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "logs",
-                                "-n",
-                                self.namespace,
-                                pod_name,
-                                "--tail=50",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if log_result.returncode == 0:
-                            print(f"📝 Pod logs for {pod_name}:\n{log_result.stdout}")
-                        else:
-                            print(f"❌ Could not get pod logs: {log_result.stderr}")
-                    else:
-                        print(f"❌ Could not find pods for RayJob: {pod_result.stderr}")
-
-                    # Also try to get events related to the RayJob
-                    events_result = subprocess.run(
-                        [
-                            "kubectl",
-                            "get",
-                            "events",
-                            "-n",
-                            self.namespace,
-                            "--field-selector",
-                            f"involvedObject.name={rayjob.name}",
-                            "-o",
-                            "wide",
-                        ],
-                        capture_output=True,
-                        text=True,
-                        timeout=10,
-                    )
-                    if events_result.returncode == 0 and events_result.stdout.strip():
-                        print(f"📅 Events for RayJob:\n{events_result.stdout}")
-
-                except Exception as e:
-                    print(f"❌ Error getting failure details: {e}")
-
                 raise AssertionError(f"❌ RayJob '{rayjob.name}' failed!")
             elif status == CodeflareRayJobStatus.RUNNING:
                 print(f"🏃 RayJob '{rayjob.name}' is still running...")
             elif status == CodeflareRayJobStatus.UNKNOWN:
-                print(f"❓ RayJob '{rayjob.name}' status is unknown - investigating...")
-
-                # If we've been in Unknown status for too long, get debug info
-                if elapsed_time > 120:  # After 2 minutes of Unknown status
-                    print(
-                        f"⚠️ Job has been in Unknown status for {elapsed_time}s - getting debug info..."
-                    )
-
-                    # Get detailed YAML to understand why status is Unknown
-                    import subprocess
-
-                    try:
-                        result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "rayjobs",
-                                "-n",
-                                self.namespace,
-                                rayjob.name,
-                                "-o",
-                                "yaml",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if result.returncode == 0:
-                            print(
-                                f"📋 RayJob YAML (Unknown status debug):\n{result.stdout}"
-                            )
-
-                        # Also check for job pods that might be stuck
-                        job_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "-l",
-                                f"ray.io/group=rayjob",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if job_pods_result.returncode == 0:
-                            print(f"🔍 RayJob-related pods:\n{job_pods_result.stdout}")
-
-                        # Check for any pending pods in the namespace
-                        pending_pods_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "pods",
-                                "-n",
-                                self.namespace,
-                                "--field-selector=status.phase=Pending",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            pending_pods_result.returncode == 0
-                            and pending_pods_result.stdout.strip()
-                        ):
-                            print(
-                                f"⏸️ Pending pods in namespace:\n{pending_pods_result.stdout}"
-                            )
-
-                        # Get events for the entire namespace to see scheduling issues
-                        namespace_events_result = subprocess.run(
-                            [
-                                "kubectl",
-                                "get",
-                                "events",
-                                "-n",
-                                self.namespace,
-                                "--sort-by=.metadata.creationTimestamp",
-                                "-o",
-                                "wide",
-                            ],
-                            capture_output=True,
-                            text=True,
-                            timeout=10,
-                        )
-                        if (
-                            namespace_events_result.returncode == 0
-                            and namespace_events_result.stdout.strip()
-                        ):
-                            print(
-                                f"📅 Recent namespace events:\n{namespace_events_result.stdout}"
-                            )
-
-                    except Exception as e:
-                        print(f"❌ Error getting debug info: {e}")
-
-                # Break out of Unknown status loop after 4 minutes
-                if elapsed_time > 240:
-                    print(
-                        f"⏰ Breaking out of Unknown status loop after {elapsed_time}s"
-                    )
-                    raise AssertionError(
-                        f"❌ RayJob '{rayjob.name}' stuck in Unknown status for too long"
-                    )
+                print(f"❓ RayJob '{rayjob.name}' status is unknown")

             # Wait before next check
             sleep(check_interval)
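The removed branches above had inlined a large amount of kubectl-based debugging into the monitoring loop. If that diagnostic output is still wanted on failure, it could live in a small standalone helper instead. A rough sketch follows, reusing the same kubectl invocations that were deleted; the helper name and its call site are hypothetical.

```python
# Hypothetical helper that mirrors the kubectl debugging removed above:
# dump the RayJob YAML, the submitter pod logs, and related events.
import subprocess

def dump_rayjob_diagnostics(namespace: str, rayjob_name: str) -> None:
    """Print RayJob YAML, submitter pod logs, and events for a failed RayJob."""

    def _run(args):
        return subprocess.run(args, capture_output=True, text=True, timeout=10)

    yaml_out = _run(["kubectl", "get", "rayjobs", "-n", namespace, rayjob_name, "-o", "yaml"])
    if yaml_out.returncode == 0:
        print(f"RayJob YAML:\n{yaml_out.stdout}")

    # Submitter pods may be cleaned up quickly, so this is best-effort.
    pods = _run([
        "kubectl", "get", "pods", "-n", namespace,
        "-l", f"ray.io/rayjob={rayjob_name}",
        "-o", "name", "--sort-by=.metadata.creationTimestamp",
    ])
    if pods.returncode == 0 and pods.stdout.strip():
        pod_name = pods.stdout.strip().splitlines()[-1].split("/")[-1]
        logs = _run(["kubectl", "logs", "-n", namespace, pod_name, "--tail=50"])
        if logs.returncode == 0:
            print(f"Pod logs for {pod_name}:\n{logs.stdout}")

    events = _run([
        "kubectl", "get", "events", "-n", namespace,
        "--field-selector", f"involvedObject.name={rayjob_name}", "-o", "wide",
    ])
    if events.returncode == 0 and events.stdout.strip():
        print(f"Events for RayJob:\n{events.stdout}")
```

Calling such a helper from the `FAILED` branch just before the `AssertionError` would restore the old diagnostics without re-inlining the subprocess plumbing into the loop.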