From 308ba5fd8b8e31e26673fe9e47dd6fbd337a0749 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Wed, 1 Oct 2025 11:20:18 -0700 Subject: [PATCH 1/3] fix: enable and verify router classification testing in 02-router-classification-test.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix import path: change from 'tests.test_base' to 'test_base' - Add missing 'import unittest' statement - Update DEFAULT_MODEL from 'qwen2.5:32b' to 'Model-A' to match e2e config - Increase timeout from 10s to 60s to accommodate LLM Katan response times - Use 'model: auto' to trigger category-based classification routing - Add 4 comprehensive test cases: math, computer science, business, history - Add expected_model field to verify correct routing - Add assertions to verify actual model matches expected model - Enhance test output to show expected vs actual routing - Fix metrics test to check for actual exposed metrics (entropy classification, cache) - Update README to mark 01 and 02 tests as completed with descriptions All 3 tests now pass successfully with verified classification routing: - Category Classification: Math→Model-B, CS→Model-B, Business→Model-A, History→Model-A ✅ - Classification Consistency: Same query routes to same model ✅ - Router Metrics: Entropy classification, cache hits/misses tracked ✅ Signed-off-by: Yossi Ovadia --- e2e-tests/02-router-classification-test.py | 86 +++++++++++++++------- e2e-tests/README.md | 18 +++-- 2 files changed, 71 insertions(+), 33 deletions(-) diff --git a/e2e-tests/02-router-classification-test.py b/e2e-tests/02-router-classification-test.py index 040a522c..d907ce54 100644 --- a/e2e-tests/02-router-classification-test.py +++ b/e2e-tests/02-router-classification-test.py @@ -10,33 +10,49 @@ import os import sys import time +import unittest from collections import defaultdict import requests # Add parent directory to path to allow importing common test utilities sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from tests.test_base import SemanticRouterTestBase +from test_base import SemanticRouterTestBase # Constants ENVOY_URL = "http://localhost:8801" OPENAI_ENDPOINT = "/v1/chat/completions" ROUTER_METRICS_URL = "http://localhost:9190/metrics" -DEFAULT_MODEL = "qwen2.5:32b" # Changed from gemma3:27b to match make test-prompt +DEFAULT_MODEL = "Model-A" # Use configured model that matches router config # Category test cases - each designed to trigger a specific classifier category +# Based on config.e2e.yaml: math→Model-B, computer science→Model-B, business→Model-A, history→Model-A CATEGORY_TEST_CASES = [ { "name": "Math Query", "expected_category": "math", - "content": "Solve the differential equation dy/dx + 2y = x^2 with the initial condition y(0) = 1.", + "expected_model": "Model-B", # math has Model-B with score 1.0 + "content": "Solve the quadratic equation x^2 + 5x + 6 = 0 and explain the steps.", }, { - "name": "Creative Writing Query", - "expected_category": "creative", - "content": "Write a short story about a space cat.", + "name": "Computer Science/Coding Query", + "expected_category": "computer science", + "expected_model": "Model-B", # computer science has Model-B with score 0.6 + "content": "Write a Python function to implement a linked list with insert and delete operations.", }, -] # Reduced to just 2 test cases to avoid timeouts + { + "name": "Business Query", + "expected_category": "business", + "expected_model": "Model-A", # business has Model-A with score 0.8 + "content": "What are the key principles of supply chain management in modern business?", + }, + { + "name": "History Query", + "expected_category": "history", + "expected_model": "Model-A", # history has Model-A with score 0.8 + "content": "Describe the main causes and key events of World War I.", + }, +] class RouterClassificationTest(SemanticRouterTestBase): @@ -129,7 +145,7 @@ def test_classification_consistency(self): f"{ENVOY_URL}{OPENAI_ENDPOINT}", headers={"Content-Type": "application/json"}, json=payload, - timeout=10, + timeout=60, ) passed = response.status_code < 400 @@ -165,7 +181,7 @@ def test_category_classification(self): self.print_subtest_header(test_case["name"]) payload = { - "model": DEFAULT_MODEL, + "model": "auto", # Use "auto" to trigger category-based classification routing "messages": [ { "role": "assistant", @@ -178,7 +194,7 @@ def test_category_classification(self): self.print_request_info( payload=payload, - expectations=f"Expect: Query to be classified as {test_case['expected_category']} and routed accordingly", + expectations=f"Expect: Query classified as '{test_case['expected_category']}' → routed to {test_case.get('expected_model', 'appropriate model')}", ) response = requests.post( @@ -188,25 +204,30 @@ def test_category_classification(self): timeout=60, ) - passed = response.status_code < 400 response_json = response.json() - model = response_json.get("model", "unknown") - results[test_case["name"]] = model + actual_model = response_json.get("model", "unknown") + expected_model = test_case.get("expected_model", "unknown") + results[test_case["name"]] = actual_model + + model_match = actual_model == expected_model + passed = response.status_code < 400 and model_match self.print_response_info( response, { "Expected Category": test_case["expected_category"], - "Selected Model": model, + "Expected Model": expected_model, + "Actual Model": actual_model, + "Routing Correct": "✅" if model_match else "❌", }, ) self.print_test_result( passed=passed, message=( - f"Query successfully routed to model: {model}" - if passed - else f"Request failed with status {response.status_code}" + f"Query correctly routed to {actual_model}" + if model_match + else f"Routing failed: expected {expected_model}, got {actual_model}" ), ) @@ -215,23 +236,30 @@ def test_category_classification(self): 400, f"{test_case['name']} request failed with status {response.status_code}", ) + + self.assertEqual( + actual_model, + expected_model, + f"{test_case['name']}: Expected routing to {expected_model}, but got {actual_model}", + ) def test_classifier_metrics(self): - """Test that classification metrics are being recorded.""" + """Test that router metrics are being recorded and exposed.""" self.print_test_header( - "Classifier Metrics Test", - "Verifies that classification metrics are being properly recorded and exposed", + "Router Metrics Test", + "Verifies that router metrics (classification, cache operations) are being properly recorded and exposed", ) # First, let's get the current metrics as a baseline response = requests.get(ROUTER_METRICS_URL) baseline_metrics = response.text - # Check if classification metrics exist without making additional requests + # Check if classification and routing metrics exist + # These are the actual metrics exposed by the router classification_metrics = [ - "llm_router_classification_duration_seconds", - "llm_router_requests_total", - "llm_router_model_selection_count", + "llm_entropy_classification_latency_seconds", # Entropy-based classification timing + "llm_cache_hits_total", # Cache operations (related to classification) + "llm_cache_misses_total", # Cache misses ] metrics_found = 0 @@ -259,13 +287,17 @@ def test_classifier_metrics(self): self.print_test_result( passed=passed, message=( - f"Found {metrics_found} classification metrics" + f"Found {metrics_found}/{len(classification_metrics)} router metrics" if passed - else "No classification metrics found" + else "No router metrics found" ), ) - self.assertGreaterEqual(metrics_found, 0, "No classification metrics found") + self.assertGreater( + metrics_found, + 0, + f"No router metrics found. Expected at least one of: {', '.join(classification_metrics)}" + ) if __name__ == "__main__": diff --git a/e2e-tests/README.md b/e2e-tests/README.md index a86a8c8d..2392ea12 100644 --- a/e2e-tests/README.md +++ b/e2e-tests/README.md @@ -10,14 +10,16 @@ This test suite provides a progressive approach to testing the Semantic Router, - Tests malformed request validation - Tests content-based smart routing (math → Model-B, creative → Model-A) -2. **01-envoy-extproc-test.py** - TBD (To Be Developed) +2. **01-envoy-extproc-test.py** - Envoy ExtProc interaction tests ✅ - Tests that Envoy correctly forwards requests to the ExtProc - - Checks header propagation + - Checks header propagation and body modification + - Tests ExtProc error handling and performance impact -3. **02-router-classification-test.py** - TBD (To Be Developed) - - Tests BERT embeddings - - Tests category classification - - Verifies model selection based on content +3. **02-router-classification-test.py** - Router classification tests ✅ + - Tests category-based classification with auto model selection + - Verifies queries route to appropriate specialized models + - Tests classification consistency across identical requests + - Validates metrics collection for classification operations 4. **03-model-routing-test.py** - TBD (To Be Developed) - Tests that requests are routed to the correct backend model @@ -73,11 +75,15 @@ Will be added in future PRs for testing with actual model inference. Currently implemented: - **00-client-request-test.py** ✅ - Complete client request validation and smart routing +- **01-envoy-extproc-test.py** ✅ - Envoy ExtProc interaction and processing tests +- **02-router-classification-test.py** ✅ - Router classification and model selection tests Individual tests can be run with: ```bash python e2e-tests/00-client-request-test.py +python e2e-tests/01-envoy-extproc-test.py +python e2e-tests/02-router-classification-test.py ``` Or run all available tests with: From 6890d214b29b0ac1dc666467bdebcb7ad1f16b76 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Wed, 1 Oct 2025 12:53:47 -0700 Subject: [PATCH 2/3] fix: remove trailing whitespace in 02-router-classification-test.py Remove trailing whitespace from lines 239, 297, and 298 to pass pre-commit checks. Signed-off-by: Yossi Ovadia --- e2e-tests/02-router-classification-test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e-tests/02-router-classification-test.py b/e2e-tests/02-router-classification-test.py index d907ce54..d61872c3 100644 --- a/e2e-tests/02-router-classification-test.py +++ b/e2e-tests/02-router-classification-test.py @@ -236,7 +236,7 @@ def test_category_classification(self): 400, f"{test_case['name']} request failed with status {response.status_code}", ) - + self.assertEqual( actual_model, expected_model, @@ -294,8 +294,8 @@ def test_classifier_metrics(self): ) self.assertGreater( - metrics_found, - 0, + metrics_found, + 0, f"No router metrics found. Expected at least one of: {', '.join(classification_metrics)}" ) From a316521fefbb456c0392559b3e7bc045f20dcfd5 Mon Sep 17 00:00:00 2001 From: Yossi Ovadia Date: Wed, 1 Oct 2025 12:57:40 -0700 Subject: [PATCH 3/3] style: apply black formatter to 02-router-classification-test.py Add trailing comma after last argument in assertGreater call to comply with black formatting standards. Signed-off-by: Yossi Ovadia --- e2e-tests/02-router-classification-test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e-tests/02-router-classification-test.py b/e2e-tests/02-router-classification-test.py index d61872c3..461df730 100644 --- a/e2e-tests/02-router-classification-test.py +++ b/e2e-tests/02-router-classification-test.py @@ -296,7 +296,7 @@ def test_classifier_metrics(self): self.assertGreater( metrics_found, 0, - f"No router metrics found. Expected at least one of: {', '.join(classification_metrics)}" + f"No router metrics found. Expected at least one of: {', '.join(classification_metrics)}", )