e2e test: 02-router-classification: verify router classification (#302)

yossiovadia · web-flow · commit 5141a00feff3 · 2025-10-01T16:05:11.000-04:00
* fix: enable and verify router classification testing in 02-router-classification-test.py

- Fix import path: change from 'tests.test_base' to 'test_base'
- Add missing 'import unittest' statement
- Update DEFAULT_MODEL from 'qwen2.5:32b' to 'Model-A' to match e2e config
- Increase timeout from 10s to 60s to accommodate LLM Katan response times
- Use 'model: auto' to trigger category-based classification routing
- Add 4 comprehensive test cases: math, computer science, business, history
- Add expected_model field to verify correct routing
- Add assertions to verify actual model matches expected model
- Enhance test output to show expected vs actual routing
- Fix metrics test to check for actual exposed metrics (entropy classification, cache)
- Update README to mark 01 and 02 tests as completed with descriptions

All 3 tests now pass successfully with verified classification routing:
- Category Classification: Math→Model-B, CS→Model-B, Business→Model-A, History→Model-A ✅
- Classification Consistency: Same query routes to same model ✅
- Router Metrics: Entropy classification, cache hits/misses tracked ✅

Signed-off-by: Yossi Ovadia &lt;yovadia@redhat.com&gt;

* fix: remove trailing whitespace in 02-router-classification-test.py

Remove trailing whitespace from lines 239, 297, and 298 to pass
pre-commit checks.

Signed-off-by: Yossi Ovadia &lt;yovadia@redhat.com&gt;

* style: apply black formatter to 02-router-classification-test.py

Add trailing comma after last argument in assertGreater call to comply
with black formatting standards.

Signed-off-by: Yossi Ovadia &lt;yovadia@redhat.com&gt;

---------

Signed-off-by: Yossi Ovadia &lt;yovadia@redhat.com&gt;
diff --git a/e2e-tests/02-router-classification-test.py b/e2e-tests/02-router-classification-test.py
@@ -10,33 +10,49 @@
 import os
 import sys
 import time
+import unittest
 from collections import defaultdict
 
 import requests
 
 # Add parent directory to path to allow importing common test utilities
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from tests.test_base import SemanticRouterTestBase
+from test_base import SemanticRouterTestBase
 
 # Constants
 ENVOY_URL = "http://localhost:8801"
 OPENAI_ENDPOINT = "/v1/chat/completions"
 ROUTER_METRICS_URL = "http://localhost:9190/metrics"
-DEFAULT_MODEL = "qwen2.5:32b"  # Changed from gemma3:27b to match make test-prompt
+DEFAULT_MODEL = "Model-A"  # Use configured model that matches router config
 
 # Category test cases - each designed to trigger a specific classifier category
+# Based on config.e2e.yaml: math→Model-B, computer science→Model-B, business→Model-A, history→Model-A
 CATEGORY_TEST_CASES = [
     {
         "name": "Math Query",
         "expected_category": "math",
-        "content": "Solve the differential equation dy/dx + 2y = x^2 with the initial condition y(0) = 1.",
+        "expected_model": "Model-B",  # math has Model-B with score 1.0
+        "content": "Solve the quadratic equation x^2 + 5x + 6 = 0 and explain the steps.",
     },
     {
-        "name": "Creative Writing Query",
-        "expected_category": "creative",
-        "content": "Write a short story about a space cat.",
+        "name": "Computer Science/Coding Query",
+        "expected_category": "computer science",
+        "expected_model": "Model-B",  # computer science has Model-B with score 0.6
+        "content": "Write a Python function to implement a linked list with insert and delete operations.",
     },
-]  # Reduced to just 2 test cases to avoid timeouts
+    {
+        "name": "Business Query",
+        "expected_category": "business",
+        "expected_model": "Model-A",  # business has Model-A with score 0.8
+        "content": "What are the key principles of supply chain management in modern business?",
+    },
+    {
+        "name": "History Query",
+        "expected_category": "history",
+        "expected_model": "Model-A",  # history has Model-A with score 0.8
+        "content": "Describe the main causes and key events of World War I.",
+    },
+]
 
 
 class RouterClassificationTest(SemanticRouterTestBase):
@@ -129,7 +145,7 @@ def test_classification_consistency(self):
                 f"{ENVOY_URL}{OPENAI_ENDPOINT}",
                 headers={"Content-Type": "application/json"},
                 json=payload,
-                timeout=10,
+                timeout=60,
             )
 
             passed = response.status_code < 400
@@ -165,7 +181,7 @@ def test_category_classification(self):
             self.print_subtest_header(test_case["name"])
 
             payload = {
-                "model": DEFAULT_MODEL,
+                "model": "auto",  # Use "auto" to trigger category-based classification routing
                 "messages": [
                     {
                         "role": "assistant",
@@ -178,7 +194,7 @@ def test_category_classification(self):
 
             self.print_request_info(
                 payload=payload,
-                expectations=f"Expect: Query to be classified as {test_case['expected_category']} and routed accordingly",
+                expectations=f"Expect: Query classified as '{test_case['expected_category']}' → routed to {test_case.get('expected_model', 'appropriate model')}",
             )
 
             response = requests.post(
@@ -188,25 +204,30 @@ def test_category_classification(self):
                 timeout=60,
             )
 
-            passed = response.status_code < 400
             response_json = response.json()
-            model = response_json.get("model", "unknown")
-            results[test_case["name"]] = model
+            actual_model = response_json.get("model", "unknown")
+            expected_model = test_case.get("expected_model", "unknown")
+            results[test_case["name"]] = actual_model
+
+            model_match = actual_model == expected_model
+            passed = response.status_code < 400 and model_match
 
             self.print_response_info(
                 response,
                 {
                     "Expected Category": test_case["expected_category"],
-                    "Selected Model": model,
+                    "Expected Model": expected_model,
+                    "Actual Model": actual_model,
+                    "Routing Correct": "✅" if model_match else "❌",
                 },
             )
 
             self.print_test_result(
                 passed=passed,
                 message=(
-                    f"Query successfully routed to model: {model}"
-                    if passed
-                    else f"Request failed with status {response.status_code}"
+                    f"Query correctly routed to {actual_model}"
+                    if model_match
+                    else f"Routing failed: expected {expected_model}, got {actual_model}"
                 ),
             )
 
@@ -216,22 +237,29 @@ def test_category_classification(self):
                 f"{test_case['name']} request failed with status {response.status_code}",
             )
 
+            self.assertEqual(
+                actual_model,
+                expected_model,
+                f"{test_case['name']}: Expected routing to {expected_model}, but got {actual_model}",
+            )
+
     def test_classifier_metrics(self):
-        """Test that classification metrics are being recorded."""
+        """Test that router metrics are being recorded and exposed."""
         self.print_test_header(
-            "Classifier Metrics Test",
-            "Verifies that classification metrics are being properly recorded and exposed",
+            "Router Metrics Test",
+            "Verifies that router metrics (classification, cache operations) are being properly recorded and exposed",
         )
 
         # First, let's get the current metrics as a baseline
         response = requests.get(ROUTER_METRICS_URL)
         baseline_metrics = response.text
 
-        # Check if classification metrics exist without making additional requests
+        # Check if classification and routing metrics exist
+        # These are the actual metrics exposed by the router
         classification_metrics = [
-            "llm_router_classification_duration_seconds",
-            "llm_router_requests_total",
-            "llm_router_model_selection_count",
+            "llm_entropy_classification_latency_seconds",  # Entropy-based classification timing
+            "llm_cache_hits_total",  # Cache operations (related to classification)
+            "llm_cache_misses_total",  # Cache misses
         ]
 
         metrics_found = 0
@@ -259,13 +287,17 @@ def test_classifier_metrics(self):
         self.print_test_result(
             passed=passed,
             message=(
-                f"Found {metrics_found} classification metrics"
+                f"Found {metrics_found}/{len(classification_metrics)} router metrics"
                 if passed
-                else "No classification metrics found"
+                else "No router metrics found"
             ),
         )
 
-        self.assertGreaterEqual(metrics_found, 0, "No classification metrics found")
+        self.assertGreater(
+            metrics_found,
+            0,
+            f"No router metrics found. Expected at least one of: {', '.join(classification_metrics)}",
+        )
 
 
 if __name__ == "__main__":
diff --git a/e2e-tests/README.md b/e2e-tests/README.md
@@ -10,14 +10,16 @@ This test suite provides a progressive approach to testing the Semantic Router,
    - Tests malformed request validation
    - Tests content-based smart routing (math → Model-B, creative → Model-A)
 
-2. **01-envoy-extproc-test.py** - TBD (To Be Developed)
+2. **01-envoy-extproc-test.py** - Envoy ExtProc interaction tests ✅
    - Tests that Envoy correctly forwards requests to the ExtProc
-   - Checks header propagation
+   - Checks header propagation and body modification
+   - Tests ExtProc error handling and performance impact
 
-3. **02-router-classification-test.py** - TBD (To Be Developed)
-   - Tests BERT embeddings
-   - Tests category classification
-   - Verifies model selection based on content
+3. **02-router-classification-test.py** - Router classification tests ✅
+   - Tests category-based classification with auto model selection
+   - Verifies queries route to appropriate specialized models
+   - Tests classification consistency across identical requests
+   - Validates metrics collection for classification operations
 
 4. **03-model-routing-test.py** - TBD (To Be Developed)
    - Tests that requests are routed to the correct backend model
@@ -73,11 +75,15 @@ Will be added in future PRs for testing with actual model inference.
 Currently implemented:
 
 - **00-client-request-test.py** ✅ - Complete client request validation and smart routing
+- **01-envoy-extproc-test.py** ✅ - Envoy ExtProc interaction and processing tests
+- **02-router-classification-test.py** ✅ - Router classification and model selection tests
 
 Individual tests can be run with:
 
 ```bash
 python e2e-tests/00-client-request-test.py
+python e2e-tests/01-envoy-extproc-test.py
+python e2e-tests/02-router-classification-test.py
 ```
 
 Or run all available tests with: