@@ -62,39 +62,133 @@ def test_create_job(self, test_job_name, image_uri):
6262 ])
6363 assert result .returncode == 0
6464 logger .info (f"Created job: { test_job_name } " )
65-
65+
6666 # Wait a moment for the job to be created
6767 time .sleep (5 )
6868
def test_list_jobs(self, test_job_name):
    """Test listing jobs and verifying the created job is present with a valid status.

    Runs ``hyp list hyp-pytorch-job``, asserts the command succeeds and that
    the job name appears in its output, then reads the status column of the
    job's row and asserts that it is not ``Unknown``.

    Args:
        test_job_name: Name of the job expected to appear in the listing.
    """
    list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
    assert list_result.returncode == 0

    # Check if the job name is in the output
    assert test_job_name in list_result.stdout

    # Locate the row for this job. Rows are assumed to be whitespace-separated
    # columns in the order NAME NAMESPACE STATUS AGE. The NAME column is
    # compared exactly so that a job whose name merely contains
    # test_job_name (e.g. "job-1" vs "job-10") cannot supply the status.
    job_status = None
    for row in list_result.stdout.strip().splitlines():
        columns = row.split()
        if len(columns) >= 3 and columns[0] == test_job_name:
            job_status = columns[2]
            break

    # Verify job status is not Unknown
    assert job_status is not None, f"Could not find status for job {test_job_name}"
    assert job_status != "Unknown", f"Job {test_job_name} has Unknown status, which indicates a potential issue"

    logger.info(f"Successfully listed jobs. Job {test_job_name} has status: {job_status}")
93+
def test_wait_for_job_running(self, test_job_name):
    """Test that the job transitions to Running state before proceeding with pod tests.

    Polls ``hyp list hyp-pytorch-job`` until the job's status column reads
    ``Running`` or ``Completed``. Fails immediately if the status is
    ``Unknown`` and fails once all attempts are used up.

    Args:
        test_job_name: Name of the job whose status is polled.
    """
    # Up to 12 status checks spaced 30 seconds apart, i.e. about 5.5 minutes
    # between the first and the last check.
    max_attempts = 12
    poll_interval_seconds = 30

    for attempt in range(1, max_attempts + 1):
        logger.info(f"Checking job status (attempt {attempt}/{max_attempts})...")

        # Get the job status
        list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
        assert list_result.returncode == 0

        # Rows are assumed to be whitespace-separated columns in the order
        # NAME NAMESPACE STATUS AGE. The NAME column is compared exactly so a
        # job whose name merely contains test_job_name is never picked up.
        job_status = None
        for row in list_result.stdout.strip().splitlines():
            columns = row.split()
            if len(columns) >= 3 and columns[0] == test_job_name:
                job_status = columns[2]
                break

        logger.info(f"Current job status: {job_status}")

        # If job status is Unknown, fail immediately
        if job_status == "Unknown":
            pytest.fail(f"Job {test_job_name} has Unknown status, which indicates a potential issue. Test failed.")

        # If job is Running or Completed, we can proceed
        if job_status in ("Running", "Completed"):
            logger.info(f"Job {test_job_name} is now in {job_status} state")
            return

        # Any other state (including the job not being listed yet, which
        # leaves job_status as None): wait and retry. Sleeping after the
        # final attempt would only delay the failure, so skip it.
        if attempt < max_attempts:
            logger.info(f"Job {test_job_name} is in {job_status} state, waiting...")
            time.sleep(poll_interval_seconds)

    # If we've exhausted all attempts, fail the test
    pytest.fail(f"Job {test_job_name} did not reach Running state within the timeout period")
132+
def test_wait_for_job_completion(self, test_job_name):
    """Test that the job reaches Completed status within 10 minutes, with early failure if not Running.

    Polls ``hyp list hyp-pytorch-job`` every 30 seconds. Returns as soon as
    the job's status column reads ``Completed``; fails immediately on any
    status other than ``Running`` (including the job not being found in the
    listing) and fails once all attempts are used up.

    Args:
        test_job_name: Name of the job whose status is polled.
    """
    # Up to 20 status checks spaced 30 seconds apart.
    max_attempts = 20
    poll_interval_seconds = 30

    for attempt in range(1, max_attempts + 1):
        logger.info(f"Checking job completion status (attempt {attempt}/{max_attempts})...")

        # Get the job status
        list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
        assert list_result.returncode == 0

        # Rows are assumed to be whitespace-separated columns in the order
        # NAME NAMESPACE STATUS AGE. The NAME column is compared exactly so a
        # job whose name merely contains test_job_name is never picked up.
        job_status = None
        for row in list_result.stdout.strip().splitlines():
            columns = row.split()
            if len(columns) >= 3 and columns[0] == test_job_name:
                job_status = columns[2]
                break

        logger.info(f"Current job status: {job_status}")

        # If job is Completed, test passes
        if job_status == "Completed":
            logger.info(f"Job {test_job_name} has successfully completed")
            return

        # Completed has already returned above, so anything other than
        # Running at this point is a failure.
        if job_status != "Running":
            pytest.fail(f"Job {test_job_name} is in {job_status} state, which is not Running or Completed. Test failed.")

        # Still Running: wait and retry. Sleeping after the final attempt
        # would only delay the failure, so skip it.
        if attempt < max_attempts:
            logger.info(f"Job {test_job_name} is still running, waiting...")
            time.sleep(poll_interval_seconds)

    # If we've exhausted all attempts, fail the test
    pytest.fail(f"Job {test_job_name} did not reach Completed state within the 10-minute timeout period")
77171
def test_list_pods(self, test_job_name):
    """List the pods belonging to a job and sanity-check the CLI output.

    Args:
        test_job_name: Name of the job whose pods are listed.
    """
    # Give the pods a moment to be created before querying
    time.sleep(10)

    command = ["hyp", "list-pods", "hyp-pytorch-job", "--job-name", test_job_name]
    pods_listing = execute_command(command)
    assert pods_listing.returncode == 0

    listing_text = pods_listing.stdout.strip()

    # Fragments that must all appear: the per-job banner, the two table
    # headers, and at least one pod whose name carries the job name.
    expected_fragments = (
        f"Pods for job: {test_job_name}",
        "POD NAME",
        "NAMESPACE",
        f"{test_job_name}-pod-",
    )
    for fragment in expected_fragments:
        assert fragment in listing_text

    logger.info(f"Successfully listed pods for job: {test_job_name}")
99193
100194 # @pytest.mark.skip(reason="Skipping since there is ")
@@ -137,7 +231,7 @@ def test_describe_job(self, test_job_name):
137231 """Test describing a specific job and verifying the output."""
138232 describe_result = execute_command (["hyp" , "describe" , "hyp-pytorch-job" , "--job-name" , test_job_name ])
139233 assert describe_result .returncode == 0
140-
234+
141235 # Check that the job name is in the describe output
142236 assert test_job_name in describe_result .stdout
143237 logger .info (f"Successfully described job: { test_job_name } " )
@@ -148,15 +242,13 @@ def test_delete_job(self, test_job_name):
148242 delete_result = execute_command (["hyp" , "delete" , "hyp-pytorch-job" , "--job-name" , test_job_name ])
149243 assert delete_result .returncode == 0
150244 logger .info (f"Successfully deleted job: { test_job_name } " )
151-
245+
152246 # Wait a moment for the job to be deleted
153247 time .sleep (5 )
154-
248+
155249 # Verify the job is no longer listed
156250 list_result = execute_command (["hyp" , "list" , "hyp-pytorch-job" ])
157251 assert list_result .returncode == 0
158-
159- # The job name should no longer be in the output
160- assert test_job_name not in list_result .stdout
161-
162252
253+ # The job name should no longer be in the output
254+ assert test_job_name not in list_result .stdout
0 commit comments