Skip to content

Commit d4c928a

Browse files
committed
fix: test
1 parent 5ea3e31 commit d4c928a

File tree

1 file changed

+46
-3
lines changed

1 file changed

+46
-3
lines changed

tests/e2e/rayjob_existing_cluster_kind_test.py

Lines changed: 46 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -107,11 +107,13 @@ def assert_rayjob_submit_against_existing_cluster(
107107
print(f"✅ Successfully submitted RayJob '{job_name}' against existing cluster")
108108

109109
# Monitor the job status until completion
110-
self.monitor_rayjob_completion(rayjob, timeout=900)
110+
self.monitor_rayjob_completion(
111+
rayjob, timeout=360
112+
) # 6 minutes for faster debugging
111113

112114
print(f"✅ RayJob '{job_name}' completed successfully against existing cluster!")
113115

114-
def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
116+
def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 360):
115117
"""
116118
Monitor a RayJob until it completes or fails.
117119
@@ -224,7 +226,48 @@ def monitor_rayjob_completion(self, rayjob: RayJob, timeout: int = 900):
224226
elif status == CodeflareRayJobStatus.RUNNING:
225227
print(f"🏃 RayJob '{rayjob.name}' is still running...")
226228
elif status == CodeflareRayJobStatus.UNKNOWN:
227-
print(f"❓ RayJob '{rayjob.name}' status is unknown")
229+
print(f"❓ RayJob '{rayjob.name}' status is unknown - investigating...")
230+
231+
# If we've been in Unknown status for too long, get debug info
232+
if elapsed_time > 120: # After 2 minutes of Unknown status
233+
print(
234+
f"⚠️ Job has been in Unknown status for {elapsed_time}s - getting debug info..."
235+
)
236+
237+
# Get detailed YAML to understand why status is Unknown
238+
import subprocess
239+
240+
try:
241+
result = subprocess.run(
242+
[
243+
"kubectl",
244+
"get",
245+
"rayjobs",
246+
"-n",
247+
self.namespace,
248+
rayjob.name,
249+
"-o",
250+
"yaml",
251+
],
252+
capture_output=True,
253+
text=True,
254+
timeout=10,
255+
)
256+
if result.returncode == 0:
257+
print(
258+
f"📋 RayJob YAML (Unknown status debug):\n{result.stdout}"
259+
)
260+
except Exception as e:
261+
print(f"❌ Error getting debug info: {e}")
262+
263+
# Break out of Unknown status loop after 4 minutes
264+
if elapsed_time > 240:
265+
print(
266+
f"⏰ Breaking out of Unknown status loop after {elapsed_time}s"
267+
)
268+
raise AssertionError(
269+
f"❌ RayJob '{rayjob.name}' stuck in Unknown status for too long"
270+
)
228271

229272
# Wait before next check
230273
sleep(check_interval)

0 commit comments

Comments
 (0)