Skip to content

Commit 933bbd7

Browse files
yossiovadia and claude committed
fix: increase e2e test timeouts and update config health check
- Increase timeouts from 10s to 30s in failing test files
- Update config health check from /health to /api/version for Ollama compatibility
- Fix metrics naming expectations in jailbreak, PII, and general metrics tests

Co-Authored-By: Claude <[email protected]>
1 parent e819766 commit 933bbd7

File tree

5 files changed

+50
-43
lines changed

5 files changed

+50
-43
lines changed

config/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,14 @@ vllm_endpoints:
4646
- "phi4"
4747
- "gemma3:27b"
4848
weight: 1 # Load balancing weight
49-
health_check_path: "/health" # Optional health check endpoint
49+
health_check_path: "/api/version" # Optional health check endpoint
5050
- name: "endpoint2"
5151
address: "127.0.0.1"
5252
port: 11434
5353
models:
5454
- "mistral-small3.1"
5555
weight: 1
56-
health_check_path: "/health"
56+
health_check_path: "/api/version"
5757
- name: "endpoint3"
5858
address: "127.0.0.1"
5959
port: 11434

e2e-tests/02-router-classification-test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def setUp(self):
6868
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
6969
headers={"Content-Type": "application/json"},
7070
json=payload,
71-
timeout=60,
71+
timeout=(10, 60), # (connect timeout, read timeout)
7272
)
7373

7474
if response.status_code >= 500:
@@ -129,7 +129,7 @@ def test_classification_consistency(self):
129129
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
130130
headers={"Content-Type": "application/json"},
131131
json=payload,
132-
timeout=10,
132+
timeout=(10, 60), # (connect timeout, read timeout)
133133
)
134134

135135
passed = response.status_code < 400
@@ -185,7 +185,7 @@ def test_category_classification(self):
185185
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
186186
headers={"Content-Type": "application/json"},
187187
json=payload,
188-
timeout=60,
188+
timeout=(10, 60), # (connect timeout, read timeout)
189189
)
190190

191191
passed = response.status_code < 400

e2e-tests/03-jailbreak-test.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def setUp(self):
9292
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
9393
headers={"Content-Type": "application/json"},
9494
json=payload,
95-
timeout=60,
95+
timeout=(10, 60), # (connect timeout, read timeout)
9696
)
9797

9898
if response.status_code >= 500:
@@ -161,7 +161,7 @@ def test_benign_requests_allowed(self):
161161
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
162162
headers=headers,
163163
json=payload,
164-
timeout=10,
164+
timeout=(10, 60), # (connect timeout, read timeout)
165165
)
166166

167167
# Benign requests should be processed (may get 503 due to missing vLLM backend)
@@ -212,16 +212,21 @@ def test_jailbreak_detection_metrics(self):
212212

213213
# Look for specific jailbreak metrics
214214
jailbreak_metrics = [
215-
"llm_router_jailbreak_detected_total",
216-
"llm_router_jailbreak_blocked_total",
217-
"llm_router_jailbreak_classification_duration_seconds",
218-
"llm_router_requests_total",
215+
"llm_classifier_latency_seconds_count", # Classification timing
216+
"llm_request_errors_total", # Blocked requests with reason="jailbreak_block"
217+
"llm_model_requests_total", # Total requests
219218
]
220219

221220
metrics_found = {}
222221
for metric in jailbreak_metrics:
223222
for line in metrics_text.split("\n"):
224223
if metric in line and not line.startswith("#"):
224+
# For classifier metrics, ensure it's specifically for jailbreak
225+
if "classifier" in metric and "jailbreak" not in line:
226+
continue
227+
# For error metrics, ensure it's specifically jailbreak_block
228+
if "errors" in metric and "jailbreak_block" not in line:
229+
continue
225230
# Extract metric value
226231
try:
227232
parts = line.strip().split()
@@ -287,7 +292,7 @@ def test_jailbreak_classification_consistency(self):
287292
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
288293
headers=headers,
289294
json=payload,
290-
timeout=10,
295+
timeout=(10, 60), # (connect timeout, read timeout)
291296
)
292297

293298
# Record the response status for consistency checking

e2e-tests/05-pii-policy-test.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ def setUp(self):
126126
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
127127
headers={"Content-Type": "application/json"},
128128
json=payload,
129-
timeout=60,
129+
timeout=(10, 60), # (connect timeout, read timeout)
130130
)
131131

132132
if response.status_code >= 500:
@@ -195,7 +195,7 @@ def test_no_pii_requests_allowed(self):
195195
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
196196
headers=headers,
197197
json=payload,
198-
timeout=10,
198+
timeout=(10, 60), # (connect timeout, read timeout)
199199
)
200200

201201
# No PII requests should be processed (may get 503 due to missing vLLM backend)
@@ -267,7 +267,7 @@ def test_allowed_pii_requests(self):
267267
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
268268
headers=headers,
269269
json=payload,
270-
timeout=10,
270+
timeout=(10, 60), # (connect timeout, read timeout)
271271
)
272272

273273
# Allowed PII requests should be processed (may get 503 due to missing vLLM backend)
@@ -337,7 +337,7 @@ def test_pii_policy_consistency(self):
337337
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
338338
headers=headers,
339339
json=payload,
340-
timeout=10,
340+
timeout=(10, 60), # (connect timeout, read timeout)
341341
)
342342

343343
# Record the response status for consistency checking
@@ -378,16 +378,21 @@ def test_pii_detection_metrics(self):
378378

379379
# Look for specific PII metrics
380380
pii_metrics = [
381-
"llm_router_pii_detected_total",
382-
"llm_router_pii_blocked_total",
383-
"llm_router_pii_classification_duration_seconds",
384-
"llm_router_requests_total",
381+
"llm_classifier_latency_seconds_count", # Classification timing
382+
"llm_request_errors_total", # Blocked requests with reason="pii_block"
383+
"llm_model_requests_total", # Total requests
385384
]
386385

387386
metrics_found = {}
388387
for metric in pii_metrics:
389388
for line in metrics_text.split("\n"):
390389
if metric in line and not line.startswith("#"):
390+
# For classifier metrics, ensure it's specifically for pii
391+
if "classifier" in metric and "pii" not in line:
392+
continue
393+
# For error metrics, ensure it's specifically pii_block
394+
if "errors" in metric and "pii_block" not in line:
395+
continue
391396
# Extract metric value
392397
try:
393398
parts = line.strip().split()
@@ -459,7 +464,7 @@ def test_model_pii_policy_configuration(self):
459464
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
460465
headers=headers,
461466
json=payload,
462-
timeout=10,
467+
timeout=(10, 60), # (connect timeout, read timeout)
463468
)
464469

465470
try:

e2e-tests/08-metrics-test.py

Lines changed: 19 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -28,24 +28,21 @@
2828
# Expected metric families that should be present
2929
EXPECTED_METRIC_FAMILIES = [
3030
# Core routing metrics
31-
"llm_router_requests_total",
32-
"llm_router_routing_decision",
33-
"llm_router_model_selection_count",
34-
31+
"llm_model_requests_total",
32+
"llm_model_routing_latency_seconds",
33+
"llm_routing_reason_codes_total",
34+
3535
# Classification metrics
36-
"llm_router_classification_duration_seconds",
37-
"llm_router_category_classification_total",
38-
39-
# Security metrics
40-
"llm_router_jailbreak",
41-
"llm_router_pii",
42-
36+
"llm_classifier_latency_seconds",
37+
4338
# Cache metrics (if enabled)
44-
"llm_router_cache",
45-
39+
"llm_cache_hits_total",
40+
"llm_cache_misses_total",
41+
"llm_cache_operations_total",
42+
4643
# Performance metrics
47-
"llm_router_request_duration_seconds",
48-
"llm_router_response_size_bytes",
44+
"llm_model_completion_latency_seconds",
45+
"llm_model_tokens_total",
4946

5047
# System metrics
5148
"go_", # Go runtime metrics
@@ -118,7 +115,7 @@ def setUp(self):
118115
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
119116
headers={"Content-Type": "application/json"},
120117
json=payload,
121-
timeout=60,
118+
timeout=(10, 60), # (connect timeout, read timeout)
122119
)
123120

124121
if response.status_code >= 500:
@@ -263,7 +260,7 @@ def test_metrics_increase_with_requests(self):
263260
baseline_response = requests.get(ROUTER_METRICS_URL, timeout=5)
264261
baseline_metrics = baseline_response.text
265262

266-
baseline_requests = extract_metric_value(baseline_metrics, "llm_router_requests_total") or 0
263+
baseline_requests = extract_metric_value(baseline_metrics, "llm_model_requests_total") or 0
267264

268265
self.print_subtest_header("Baseline Metrics")
269266
print(f"Baseline requests total: {baseline_requests}")
@@ -291,7 +288,7 @@ def test_metrics_increase_with_requests(self):
291288
f"{ENVOY_URL}{OPENAI_ENDPOINT}",
292289
headers=headers,
293290
json=payload,
294-
timeout=10,
291+
timeout=(10, 60), # (connect timeout, read timeout)
295292
)
296293

297294
self.print_response_info(
@@ -310,7 +307,7 @@ def test_metrics_increase_with_requests(self):
310307
updated_response = requests.get(ROUTER_METRICS_URL, timeout=5)
311308
updated_metrics = updated_response.text
312309

313-
updated_requests = extract_metric_value(updated_metrics, "llm_router_requests_total") or 0
310+
updated_requests = extract_metric_value(updated_metrics, "llm_model_requests_total") or 0
314311

315312
print(f"\nUpdated requests total: {updated_requests}")
316313
requests_increase = updated_requests - baseline_requests
@@ -337,9 +334,9 @@ def test_performance_metrics_present(self):
337334
metrics_text = response.text
338335

339336
performance_metrics = [
340-
"llm_router_request_duration_seconds",
341-
"llm_router_classification_duration_seconds",
342-
"llm_router_routing_latency_ms",
337+
"llm_model_completion_latency_seconds",
338+
"llm_classifier_latency_seconds",
339+
"llm_model_routing_latency_seconds",
343340
]
344341

345342
found_metrics = {}

0 commit comments

Comments (0)