Skip to content

Commit 6525233

Browse files
committed
Update CLI tests
1 parent 27cc885 commit 6525233

File tree

1 file changed

+108
-16
lines changed

1 file changed

+108
-16
lines changed

test/integration_tests/training/cli/test_cli_training.py

Lines changed: 108 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,39 +62,133 @@ def test_create_job(self, test_job_name, image_uri):
6262
])
6363
assert result.returncode == 0
6464
logger.info(f"Created job: {test_job_name}")
65-
65+
6666
# Wait a moment for the job to be created
6767
time.sleep(5)
6868

6969
def test_list_jobs(self, test_job_name):
70-
"""Test listing jobs and verifying the created job is present."""
70+
"""Test listing jobs and verifying the created job is present with a valid status."""
7171
list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
7272
assert list_result.returncode == 0
73-
74-
# Check if either the job name is in the output or at least the header is present
73+
74+
# Check if the job name is in the output
7575
assert test_job_name in list_result.stdout
76-
logger.info("Successfully listed jobs")
76+
77+
# Check that the job status is not Unknown
78+
output_lines = list_result.stdout.strip().split('\n')
79+
job_status = None
80+
for line in output_lines:
81+
if test_job_name in line:
82+
# Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
83+
parts = line.split()
84+
if len(parts) >= 3:
85+
job_status = parts[2].strip()
86+
break
87+
88+
# Verify job status is not Unknown
89+
assert job_status is not None, f"Could not find status for job {test_job_name}"
90+
assert job_status != "Unknown", f"Job {test_job_name} has Unknown status, which indicates a potential issue"
91+
92+
logger.info(f"Successfully listed jobs. Job {test_job_name} has status: {job_status}")
93+
94+
def test_wait_for_job_running(self, test_job_name):
95+
"""Test that the job reaches Running (or already Completed) state before proceeding with pod tests."""
96+
max_attempts = 12 # Maximum number of attempts (6 minutes total with 30-second intervals)
97+
for attempt in range(1, max_attempts + 1):
98+
logger.info(f"Checking job status (attempt {attempt}/{max_attempts})...")
99+
100+
# Get the job status
101+
list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
102+
assert list_result.returncode == 0
103+
104+
# Check if the job is in Running or Completed state
105+
output_lines = list_result.stdout.strip().split('\n')
106+
job_status = None
107+
for line in output_lines:
108+
if test_job_name in line:
109+
# Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
110+
parts = line.split()
111+
if len(parts) >= 3:
112+
job_status = parts[2].strip()
113+
break
114+
115+
logger.info(f"Current job status: {job_status}")
116+
117+
# If job status is Unknown, fail immediately
118+
if job_status == "Unknown":
119+
pytest.fail(f"Job {test_job_name} has Unknown status, which indicates a potential issue. Test failed.")
120+
121+
# If job is Running or Completed, we can proceed
122+
if job_status in ["Running", "Completed"]:
123+
logger.info(f"Job {test_job_name} is now in {job_status} state")
124+
return
125+
126+
# If job is still in Created or another state, wait and try again
127+
logger.info(f"Job {test_job_name} is in {job_status} state, waiting...")
128+
time.sleep(30) # Wait 30 seconds before checking again
129+
130+
# If we've exhausted all attempts, fail the test
131+
pytest.fail(f"Job {test_job_name} did not reach Running state within the timeout period")
132+
133+
def test_wait_for_job_completion(self, test_job_name):
134+
"""Test that the job reaches Completed status within 10 minutes, with early failure if not Running."""
135+
max_attempts = 20 # Maximum number of attempts (10 minutes total with 30-second intervals)
136+
for attempt in range(1, max_attempts + 1):
137+
logger.info(f"Checking job completion status (attempt {attempt}/{max_attempts})...")
138+
139+
# Get the job status
140+
list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
141+
assert list_result.returncode == 0
142+
143+
# Check the job status
144+
output_lines = list_result.stdout.strip().split('\n')
145+
job_status = None
146+
for line in output_lines:
147+
if test_job_name in line:
148+
# Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
149+
parts = line.split()
150+
if len(parts) >= 3:
151+
job_status = parts[2].strip()
152+
break
153+
154+
logger.info(f"Current job status: {job_status}")
155+
156+
# If job is Completed, test passes
157+
if job_status == "Completed":
158+
logger.info(f"Job {test_job_name} has successfully completed")
159+
return
160+
161+
# If job is not Running or Completed, fail the test
162+
if job_status not in ["Running", "Completed"]:
163+
pytest.fail(f"Job {test_job_name} is in {job_status} state, which is not Running or Completed. Test failed.")
164+
165+
# If job is still Running, wait and try again
166+
logger.info(f"Job {test_job_name} is still running, waiting...")
167+
time.sleep(30) # Wait 30 seconds before checking again
168+
169+
# If we've exhausted all attempts, fail the test
170+
pytest.fail(f"Job {test_job_name} did not reach Completed state within the 10-minute timeout period")
77171

78172
def test_list_pods(self, test_job_name):
79173
"""Test listing pods for a specific job."""
80174
# Wait a moment to ensure pods are created
81175
time.sleep(10)
82-
176+
83177
list_pods_result = execute_command([
84178
"hyp", "list-pods", "hyp-pytorch-job",
85179
"--job-name", test_job_name
86180
])
87181
assert list_pods_result.returncode == 0
88-
182+
89183
# Verify the output contains expected headers and job name
90184
output = list_pods_result.stdout.strip()
91185
assert f"Pods for job: {test_job_name}" in output
92186
assert "POD NAME" in output
93187
assert "NAMESPACE" in output
94-
188+
95189
# Verify at least one pod is listed (should contain the job name in the pod name)
96190
assert f"{test_job_name}-pod-" in output
97-
191+
98192
logger.info(f"Successfully listed pods for job: {test_job_name}")
99193

100194
# @pytest.mark.skip(reason="Skipping since there is ")
@@ -137,7 +231,7 @@ def test_describe_job(self, test_job_name):
137231
"""Test describing a specific job and verifying the output."""
138232
describe_result = execute_command(["hyp", "describe", "hyp-pytorch-job", "--job-name", test_job_name])
139233
assert describe_result.returncode == 0
140-
234+
141235
# Check that the job name is in the output
142236
assert test_job_name in describe_result.stdout
143237
logger.info(f"Successfully described job: {test_job_name}")
@@ -148,15 +242,13 @@ def test_delete_job(self, test_job_name):
148242
delete_result = execute_command(["hyp", "delete", "hyp-pytorch-job", "--job-name", test_job_name])
149243
assert delete_result.returncode == 0
150244
logger.info(f"Successfully deleted job: {test_job_name}")
151-
245+
152246
# Wait a moment for the job to be deleted
153247
time.sleep(5)
154-
248+
155249
# Verify the job is no longer listed
156250
list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
157251
assert list_result.returncode == 0
158-
159-
# The job name should no longer be in the output
160-
assert test_job_name not in list_result.stdout
161-
162252

253+
# The job name should no longer be in the output
254+
assert test_job_name not in list_result.stdout

0 commit comments

Comments
 (0)