@@ -105,9 +105,38 @@ def assert_rayjob_submit_against_existing_cluster(
105105 ), f"Job submission failed, expected { job_name } , got { submission_result } "
106106 print (f"✅ Successfully submitted RayJob '{ job_name } ' against existing cluster" )
107107
108- # Wait a moment for the RayJob resource to be created in Kubernetes
109- print ("⏳ Waiting for RayJob resource to be processed by KubeRay operator..." )
110- sleep (5 )
108+ # Debug: Check if RayJob resource was actually created
109+ import subprocess
110+ import time
111+
112+ print ("🔍 Checking if RayJob resource exists in Kubernetes..." )
113+ for attempt in range (6 ): # Check for 30 seconds
114+ try :
115+ # Check if RayJob resource exists
116+ result = subprocess .run (
117+ ["kubectl" , "get" , "rayjobs" , "-n" , self .namespace , job_name ],
118+ capture_output = True , text = True , timeout = 10
119+ )
120+ if result .returncode == 0 :
121+ print (f"✅ RayJob resource '{ job_name } ' found in Kubernetes!" )
122+ print (f"RayJob details:\n { result .stdout } " )
123+ break
124+ else :
125+ print (f"❌ Attempt { attempt + 1 } : RayJob resource '{ job_name } ' not found" )
126+ if attempt < 5 :
127+ time .sleep (5 )
128+ except Exception as e :
129+ print (f"❌ Error checking RayJob: { e } " )
130+
131+ # Also check what RayJob resources exist in the namespace
132+ try :
133+ result = subprocess .run (
134+ ["kubectl" , "get" , "rayjobs" , "-n" , self .namespace ],
135+ capture_output = True , text = True , timeout = 10
136+ )
137+ print (f"📋 All RayJobs in namespace '{ self .namespace } ':\n { result .stdout } " )
138+ except Exception as e :
139+ print (f"❌ Error listing RayJobs: { e } " )
111140
112141 # Monitor the job status until completion
113142 self .monitor_rayjob_completion (rayjob , timeout = 900 )
0 commit comments