Update CLI tests

adishaa · adishaa · commit 652523353562 · 2025-07-10T16:32:09.000-07:00
diff --git a/test/integration_tests/training/cli/test_cli_training.py b/test/integration_tests/training/cli/test_cli_training.py
@@ -62,39 +62,133 @@ def test_create_job(self, test_job_name, image_uri):
         ])
         assert result.returncode == 0
         logger.info(f"Created job: {test_job_name}")
-        
+
         # Wait a moment for the job to be created
         time.sleep(5)
 
     def test_list_jobs(self, test_job_name):
-        """Test listing jobs and verifying the created job is present."""
+        """Test listing jobs and verifying the created job is present with a valid status."""
         list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
         assert list_result.returncode == 0
-        
-        # Check if either the job name is in the output or at least the header is present
+
+        # Check if the job name is in the output
         assert test_job_name in list_result.stdout
-        logger.info("Successfully listed jobs")
+
+        # Check that the job status is not Unknown
+        output_lines = list_result.stdout.strip().split('\n')
+        job_status = None
+        for line in output_lines:
+            if test_job_name in line:
+                # Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
+                parts = line.split()
+                if len(parts) >= 3:
+                    job_status = parts[2].strip()
+                break
+
+        # Verify job status is not Unknown
+        assert job_status is not None, f"Could not find status for job {test_job_name}"
+        assert job_status != "Unknown", f"Job {test_job_name} has Unknown status, which indicates a potential issue"
+
+        logger.info(f"Successfully listed jobs. Job {test_job_name} has status: {job_status}")
+
+    def test_wait_for_job_running(self, test_job_name):
+        """Test that the job reaches Running (or Completed) state before proceeding with pod tests; fails immediately on Unknown status."""
+        max_attempts = 12  # Maximum number of attempts (about 6 minutes total with 30-second intervals)
+        for attempt in range(1, max_attempts + 1):
+            logger.info(f"Checking job status (attempt {attempt}/{max_attempts})...")
+
+            # Get the job status
+            list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
+            assert list_result.returncode == 0
+
+            # Check if the job is in Running or Completed state
+            output_lines = list_result.stdout.strip().split('\n')
+            job_status = None
+            for line in output_lines:
+                if test_job_name in line:
+                    # Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
+                    parts = line.split()
+                    if len(parts) >= 3:
+                        job_status = parts[2].strip()
+                    break
+
+            logger.info(f"Current job status: {job_status}")
+
+            # If job status is Unknown, fail immediately
+            if job_status == "Unknown":
+                pytest.fail(f"Job {test_job_name} has Unknown status, which indicates a potential issue. Test failed.")
+
+            # If job is Running or Completed, we can proceed
+            if job_status in ["Running", "Completed"]:
+                logger.info(f"Job {test_job_name} is now in {job_status} state")
+                return
+
+            # If job is still in Created or another state, wait and try again
+            logger.info(f"Job {test_job_name} is in {job_status} state, waiting...")
+            time.sleep(30)  # Wait 30 seconds before checking again
+
+        # If we've exhausted all attempts, fail the test
+        pytest.fail(f"Job {test_job_name} did not reach Running state within the timeout period")
+
+    def test_wait_for_job_completion(self, test_job_name):
+        """Test that the job reaches Completed status within 10 minutes, with early failure if not Running."""
+        max_attempts = 20  # Maximum number of attempts (10 minutes total with 30-second intervals)
+        for attempt in range(1, max_attempts + 1):
+            logger.info(f"Checking job completion status (attempt {attempt}/{max_attempts})...")
+
+            # Get the job status
+            list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
+            assert list_result.returncode == 0
+
+            # Check the job status
+            output_lines = list_result.stdout.strip().split('\n')
+            job_status = None
+            for line in output_lines:
+                if test_job_name in line:
+                    # Extract the status from the line (assuming format: NAME NAMESPACE STATUS AGE)
+                    parts = line.split()
+                    if len(parts) >= 3:
+                        job_status = parts[2].strip()
+                    break
+
+            logger.info(f"Current job status: {job_status}")
+
+            # If job is Completed, test passes
+            if job_status == "Completed":
+                logger.info(f"Job {test_job_name} has successfully completed")
+                return
+
+            # If job is not Running or Completed, fail the test
+            if job_status not in ["Running", "Completed"]:
+                pytest.fail(f"Job {test_job_name} is in {job_status} state, which is not Running or Completed. Test failed.")
+
+            # If job is still Running, wait and try again
+            logger.info(f"Job {test_job_name} is still running, waiting...")
+            time.sleep(30)  # Wait 30 seconds before checking again
+
+        # If we've exhausted all attempts, fail the test
+        pytest.fail(f"Job {test_job_name} did not reach Completed state within the 10-minute timeout period")
 
     def test_list_pods(self, test_job_name):
         """Test listing pods for a specific job."""
         # Wait a moment to ensure pods are created
         time.sleep(10)
-        
+
         list_pods_result = execute_command([
             "hyp", "list-pods", "hyp-pytorch-job",
             "--job-name", test_job_name
         ])
         assert list_pods_result.returncode == 0
-        
+
         # Verify the output contains expected headers and job name
         output = list_pods_result.stdout.strip()
         assert f"Pods for job: {test_job_name}" in output
         assert "POD NAME" in output
         assert "NAMESPACE" in output
-        
+
         # Verify at least one pod is listed (should contain the job name in the pod name)
         assert f"{test_job_name}-pod-" in output
-        
+
         logger.info(f"Successfully listed pods for job: {test_job_name}")
 
     # @pytest.mark.skip(reason="Skipping since there is ")
@@ -137,7 +231,7 @@ def test_describe_job(self, test_job_name):
         """Test describing a specific job and verifying the output."""
         describe_result = execute_command(["hyp", "describe", "hyp-pytorch-job", "--job-name", test_job_name])
         assert describe_result.returncode == 0
-        
+
         # Check if either the job name is in the output or metadata is present
         assert test_job_name in describe_result.stdout
         logger.info(f"Successfully described job: {test_job_name}")
@@ -148,15 +242,13 @@ def test_delete_job(self, test_job_name):
         delete_result = execute_command(["hyp", "delete", "hyp-pytorch-job", "--job-name", test_job_name])
         assert delete_result.returncode == 0
         logger.info(f"Successfully deleted job: {test_job_name}")
-        
+
         # Wait a moment for the job to be deleted
         time.sleep(5)
-        
+
         # Verify the job is no longer listed
         list_result = execute_command(["hyp", "list", "hyp-pytorch-job"])
         assert list_result.returncode == 0
-        
-        # The job name should no longer be in the output
-        assert test_job_name not in list_result.stdout
-
 
+        # The job name should no longer be in the output
+        assert test_job_name not in list_result.stdout