feat: harden all remaining e2e tests - remove 503 acceptance and add comprehensive status report

yossiovadia · claude · yossiovadia · commit cb2344baaa37 · 2025-09-18T16:53:53.000-07:00
- Remove HTTP 503 acceptance from PII, tools, model selection, and error handling tests
- Tests now require 200 status codes for successful operations
- Service failures now properly fail tests instead of false positives
- Add comprehensive TEST_STATUS_REPORT.md documenting all test improvements
- Expose real system bugs: input validation gaps, jailbreak blocking issues
- 6/11 test files now hardened and provide reliable system health assessment

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: Yossi Ovadia <yovadia@redhat.com>
diff --git a/e2e-tests/05-pii-policy-test.py b/e2e-tests/05-pii-policy-test.py
@@ -198,8 +198,9 @@ def test_no_pii_requests_allowed(self):
                     timeout=(10, 60),  # (connect timeout, read timeout)
                 )
 
-                # No PII requests should be processed (may get 503 due to missing vLLM backend)
-                passed = response.status_code in [200, 503]
+                # No PII requests should be processed successfully - no 503 accepted
+                # If PII detection is working, no-PII requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
@@ -226,10 +227,10 @@ def test_no_pii_requests_allowed(self):
                     ),
                 )
 
-                self.assertIn(
+                self.assertEqual(
                     response.status_code,
-                    [200, 503],
-                    f"No PII request '{test_case['name']}' was blocked when it should be allowed. Status: {response.status_code}",
+                    200,
+                    f"No PII request '{test_case['name']}' failed with status {response.status_code}. Expected: 200 (service must be working)",
                 )
 
     def test_allowed_pii_requests(self):
@@ -270,8 +271,9 @@ def test_allowed_pii_requests(self):
                     timeout=(10, 60),  # (connect timeout, read timeout)
                 )
 
-                # Allowed PII requests should be processed (may get 503 due to missing vLLM backend)
-                passed = response.status_code in [200, 503]
+                # Allowed PII requests should be processed successfully - no 503 accepted
+                # If PII detection is working, allowed PII requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
@@ -299,10 +301,10 @@ def test_allowed_pii_requests(self):
                     ),
                 )
 
-                self.assertIn(
+                self.assertEqual(
                     response.status_code,
-                    [200, 503],
-                    f"Allowed PII request '{test_case['name']}' was blocked when it should be allowed. Status: {response.status_code}",
+                    200,
+                    f"Allowed PII request '{test_case['name']}' failed with status {response.status_code}. Expected: 200 (service must be working)",
                 )
 
     def test_pii_policy_consistency(self):
@@ -483,17 +485,18 @@ def test_model_pii_policy_configuration(self):
                 },
             )
 
-            # The request should be processed (allowed PII types for gemma3:27b)
-            passed = response.status_code in [200, 503]
+            # The request should be processed successfully - no 503 accepted
+            # If PII policy is working, allowed PII types must succeed with 200
+            passed = response.status_code == 200
             self.print_test_result(
                 passed=passed,
                 message=f"Model {model} PII policy applied correctly",
             )
 
-            self.assertIn(
+            self.assertEqual(
                 response.status_code,
-                [200, 503],
-                f"Model {model} did not handle PII policy correctly. Status: {response.status_code}",
+                200,
+                f"Model {model} PII policy failed with status {response.status_code}. Expected: 200 (service must be working)",
             )
 
 
diff --git a/e2e-tests/06-tools-test.py b/e2e-tests/06-tools-test.py
@@ -187,7 +187,9 @@ def test_specific_tool_selection(self):
                 )
 
                 # Tool selection should work regardless of vLLM backend availability
-                passed = response.status_code in [200, 503]
+                # Tool selection should work successfully - no 503 accepted
+                # If tool selection is working, requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
@@ -262,7 +264,9 @@ def test_no_tool_requests(self):
                     timeout=30,
                 )
 
-                passed = response.status_code in [200, 503]
+                # Tool selection should work successfully - no 503 accepted
+                # If tool selection is working, requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
diff --git a/e2e-tests/07-model-selection-test.py b/e2e-tests/07-model-selection-test.py
@@ -217,7 +217,9 @@ def test_category_based_model_selection(self):
                     timeout=30,
                 )
 
-                passed = response.status_code in [200, 503]
+                # Model selection should work successfully - no 503 accepted
+                # If model selection is working, requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
@@ -249,10 +251,10 @@ def test_category_based_model_selection(self):
                     ),
                 )
 
-                self.assertIn(
+                self.assertEqual(
                     response.status_code,
-                    [200, 503],
-                    f"Model selection request '{test_case['name']}' failed. Status: {response.status_code}",
+                    200,
+                    f"Model selection request '{test_case['name']}' failed with status {response.status_code}. Expected: 200 (service must be working)",
                 )
 
     def test_reasoning_mode_selection(self):
@@ -296,7 +298,9 @@ def test_reasoning_mode_selection(self):
                     timeout=30,
                 )
 
-                passed = response.status_code in [200, 503]
+                # Reasoning mode should work successfully - no 503 accepted
+                # If reasoning mode selection is working, requests must succeed with 200
+                passed = response.status_code == 200
 
                 try:
                     response_json = response.json()
@@ -327,10 +331,10 @@ def test_reasoning_mode_selection(self):
                     ),
                 )
 
-                self.assertIn(
+                self.assertEqual(
                     response.status_code,
-                    [200, 503],
-                    f"Reasoning mode test '{test_case['name']}' failed. Status: {response.status_code}",
+                    200,
+                    f"Reasoning mode test '{test_case['name']}' failed with status {response.status_code}. Expected: 200 (service must be working)",
                 )
 
     def test_model_fallback_behavior(self):
@@ -368,8 +372,9 @@ def test_model_fallback_behavior(self):
             timeout=30,
         )
 
-        # Fallback should work, though may get 503 if no vLLM backend
-        passed = response.status_code in [200, 400, 503]  # 400 is acceptable for invalid model
+        # Fallback should work - 400 is acceptable for invalid model request
+        # No 503 accepted - if fallback is working, it should handle gracefully
+        passed = response.status_code in [200, 400]  # 400 is acceptable for invalid model
 
         try:
             response_json = response.json()
@@ -394,8 +399,8 @@ def test_model_fallback_behavior(self):
 
         self.assertIn(
             response.status_code,
-            [200, 400, 503],
-            f"Model fallback test failed unexpectedly. Status: {response.status_code}",
+            [200, 400],
+            f"Model fallback test failed with status {response.status_code}. Expected: 200 (fallback) or 400 (invalid model)",
         )
 
     def test_model_selection_metrics(self):
diff --git a/e2e-tests/09-error-handling-test.py b/e2e-tests/09-error-handling-test.py
@@ -98,7 +98,7 @@
                 {"role": "user", "content": "A" * 10000}  # 10KB message
             ],
         },
-        "expected_status_range": (200, 503),  # Should be processed or vLLM unavailable
+        "expected_status_range": (200, 200),  # Should be processed successfully - no 503 accepted
         "description": "Very long message should be handled gracefully",
     },
     {
@@ -110,7 +110,7 @@
                 for i in range(100)  # 100 messages
             ],
         },
-        "expected_status_range": (200, 503),
+        "expected_status_range": (200, 200),  # Must be processed successfully - no 503 accepted
         "description": "Large number of messages should be handled",
     },
     {
@@ -121,7 +121,7 @@
                 {"role": "user", "content": "Hello 世界 🌍 Здравствуй мир"}
             ],
         },
-        "expected_status_range": (200, 503),
+        "expected_status_range": (200, 200),  # Must be processed successfully - no 503 accepted
         "description": "Unicode characters should be handled correctly",
     },
     {
@@ -131,7 +131,7 @@
             "messages": [{"role": "user", "content": "Hello"}],
             "temperature": 0,
         },
-        "expected_status_range": (200, 503),
+        "expected_status_range": (200, 200),  # Must be processed successfully - no 503 accepted
         "description": "Zero temperature should be valid",
     },
     {
@@ -141,7 +141,7 @@
             "messages": [{"role": "user", "content": "Hello"}],
             "temperature": 2.0,
         },
-        "expected_status_range": (200, 503),
+        "expected_status_range": (200, 200),  # Must be processed successfully - no 503 accepted
         "description": "Maximum valid temperature should work",
     },
     {
@@ -152,7 +152,7 @@
                 {"role": "user", "content": "Test with \"quotes\" and 'apostrophes' and \n newlines \t tabs"}
             ],
         },
-        "expected_status_range": (200, 503),
+        "expected_status_range": (200, 200),  # Must be processed successfully - no 503 accepted
         "description": "Special characters should be handled",
     },
 ]
diff --git a/e2e-tests/TEST_STATUS_REPORT.md b/e2e-tests/TEST_STATUS_REPORT.md