
Commit 985eaac

Merge branch 'main' into docs/improve-contributing
2 parents f9ea384 + 420f7bc commit 985eaac

File tree: 10 files changed (+1012, -70 lines)


Makefile

Lines changed: 20 additions & 5 deletions

@@ -11,20 +11,35 @@ build: rust build-router
 
 # Build the Rust library
 rust:
-    @echo "Building Rust library..."
-    cd candle-binding && cargo build --release
+    @echo "Ensuring rust is installed..."
+    @bash -c 'if ! command -v rustc >/dev/null 2>&1; then \
+        echo "rustc not found, installing..."; \
+        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
+    fi && \
+    if [ -f "$$HOME/.cargo/env" ]; then \
+        echo "Loading Rust environment from $$HOME/.cargo/env..." && \
+        . $$HOME/.cargo/env; \
+    fi && \
+    if ! command -v cargo >/dev/null 2>&1; then \
+        echo "Error: cargo not found in PATH" && exit 1; \
+    fi && \
+    echo "Building Rust library..." && \
+    cd candle-binding && cargo build --release'
 
 # Build router
 build-router: rust
     @echo "Building router..."
     @mkdir -p bin
     @cd src/semantic-router && go build -o ../../bin/router cmd/main.go
 
+# Config file path with default
+CONFIG_FILE ?= config/config.yaml
+
 # Run the router
-run-router: build-router
-    @echo "Running router..."
+run-router: build-router download-models
+    @echo "Running router with config: ${CONFIG_FILE}"
     @export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
-    ./bin/router -config=config/config.yaml
+    ./bin/router -config=${CONFIG_FILE}
 
 # Prepare Envoy
 prepare-envoy:
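Note: because CONFIG_FILE is assigned with ?=, it can be overridden per invocation without editing the Makefile. A minimal usage sketch (the alternate config path below is illustrative, not a file in this commit):

# Run with the default config (config/config.yaml)
make run-router

# Point the router at another config file (hypothetical path)
make run-router CONFIG_FILE=config/local.yaml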

bench/run_bench.sh

Mode changed: 100644 → 100755
Lines changed: 83 additions & 29 deletions

@@ -1,34 +1,88 @@
 #!/bin/bash
 
-set -x
+# Example usage:
+# Quick run:
+# SAMPLES_PER_CATEGORY=5 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh
+# Long run:
+# SAMPLES_PER_CATEGORY=100 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh
+# To test only router:
+# BENCHMARK_ROUTER_ONLY=true ./run_bench.sh
 
-export ROUTER_API_KEY="1234567890"
-export VLLM_API_KEY="1234567890"
-export ROUTER_ENDPOINT="http://localhost:8801/v1"
-export VLLM_ENDPOINT="http://localhost:8000/v1"
-export ROUTER_MODELS="auto"
-export VLLM_MODELS="openai/gpt-oss-20b"
+set -x -e
+
+export ROUTER_API_KEY="${ROUTER_API_KEY:-1234567890}"
+export VLLM_API_KEY="${VLLM_API_KEY:-1234567890}"
+export ROUTER_ENDPOINT="${ROUTER_ENDPOINT:-http://localhost:8801/v1}"
+export VLLM_ENDPOINT="${VLLM_ENDPOINT:-http://localhost:8000/v1}"
+export ROUTER_MODELS="${ROUTER_MODELS:-auto}"
+export VLLM_MODELS="${VLLM_MODELS:-openai/gpt-oss-20b}"
+export SAMPLES_PER_CATEGORY="${SAMPLES_PER_CATEGORY:-5}"
+export CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-4}"
+export BENCHMARK_ROUTER_ONLY="${BENCHMARK_ROUTER_ONLY:-false}"
 
 # Run the benchmark
-python router_reason_bench.py \
-  --run-router \
-  --router-endpoint "$ROUTER_ENDPOINT" \
-  --router-api-key "$ROUTER_API_KEY" \
-  --router-models "$ROUTER_MODELS" \
-  --run-vllm \
-  --vllm-endpoint "$VLLM_ENDPOINT" \
-  --vllm-api-key "$VLLM_API_KEY" \
-  --vllm-models "$VLLM_MODELS" \
-  --samples-per-category 5 \
-  --vllm-exec-modes NR XC \
-  --concurrent-requests 4 \
-  --output-dir results/reasonbench
-
-# Generate plots
-VLLM_MODEL_FIRST="${VLLM_MODELS%% *}"
-ROUTER_MODEL_FIRST="${ROUTER_MODELS%% *}"
-VLLM_MODELS_SAFE="${VLLM_MODEL_FIRST//\//_}"
-ROUTER_MODELS_SAFE="${ROUTER_MODEL_FIRST//\//_}"
-python bench_plot.py \
-  --summary "results/reasonbench/vllm::${VLLM_MODELS_SAFE}/summary.json" \
-  --router-summary "results/reasonbench/router::${ROUTER_MODELS_SAFE}/summary.json"
+if [ "${BENCHMARK_ROUTER_ONLY}" = "true" ]; then
+  echo "Running router-only benchmark"
+  python bench/router_reason_bench.py \
+    --run-router \
+    --router-endpoint "$ROUTER_ENDPOINT" \
+    --router-api-key "$ROUTER_API_KEY" \
+    --router-models "$ROUTER_MODELS" \
+    --samples-per-category "$SAMPLES_PER_CATEGORY" \
+    --concurrent-requests "$CONCURRENT_REQUESTS" \
+    --output-dir results/reasonbench
+else
+  echo "Running full benchmark (router + vLLM)..."
+  python bench/router_reason_bench.py \
+    --run-router \
+    --router-endpoint "$ROUTER_ENDPOINT" \
+    --router-api-key "$ROUTER_API_KEY" \
+    --router-models "$ROUTER_MODELS" \
+    --run-vllm \
+    --vllm-endpoint "$VLLM_ENDPOINT" \
+    --vllm-api-key "$VLLM_API_KEY" \
+    --vllm-models "$VLLM_MODELS" \
+    --samples-per-category "$SAMPLES_PER_CATEGORY" \
+    --vllm-exec-modes NR XC \
+    --concurrent-requests "$CONCURRENT_REQUESTS" \
+    --output-dir results/reasonbench
+fi
+
+# Generate plots if summary files exist
+echo "Checking for plot generation..."
+echo "VLLM_MODELS: $VLLM_MODELS"
+echo "ROUTER_MODELS: $ROUTER_MODELS"
+
+# Get first model name and make it path-safe
+VLLM_MODEL_FIRST=$(echo "$VLLM_MODELS" | cut -d' ' -f1)
+ROUTER_MODEL_FIRST=$(echo "$ROUTER_MODELS" | cut -d' ' -f1)
+echo "First models: VLLM=$VLLM_MODEL_FIRST, Router=$ROUTER_MODEL_FIRST"
+
+# Replace / with _ for path safety
+VLLM_MODELS_SAFE=$(echo "$VLLM_MODEL_FIRST" | tr '/' '_')
+ROUTER_MODELS_SAFE=$(echo "$ROUTER_MODEL_FIRST" | tr '/' '_')
+echo "Safe paths: VLLM=$VLLM_MODELS_SAFE, Router=$ROUTER_MODELS_SAFE"
+
+# Construct the full paths
+VLLM_SUMMARY="results/reasonbench/vllm::${VLLM_MODELS_SAFE}/summary.json"
+ROUTER_SUMMARY="results/reasonbench/router::${ROUTER_MODELS_SAFE}/summary.json"
+echo "Looking for summaries at:"
+echo "VLLM: $VLLM_SUMMARY"
+echo "Router: $ROUTER_SUMMARY"
+
+# Check if at least one summary file exists and generate plots
+if [ -f "$ROUTER_SUMMARY" ]; then
+  echo "Found router summary, generating plots..."
+  if [ -f "$VLLM_SUMMARY" ]; then
+    echo "Found both summaries, generating comparison plots..."
+    python bench/bench_plot.py \
+      --summary "$VLLM_SUMMARY" \
+      --router-summary "$ROUTER_SUMMARY"
+  else
+    echo "vLLM summary not found, generating router-only plots..."
+    python bench/bench_plot.py \
+      --router-summary "$ROUTER_SUMMARY"
+  fi
+else
+  echo "No router summary found, skipping plot generation"
+fi

config/config.yaml

Lines changed: 12 additions & 1 deletion

@@ -244,4 +244,15 @@ api:
   batch_classification:
     max_batch_size: 100  # Maximum number of texts in a single batch
     concurrency_threshold: 5  # Switch to concurrent processing when batch size > this value
-    max_concurrency: 8  # Maximum number of concurrent goroutines
+    max_concurrency: 8  # Maximum number of concurrent goroutines
+
+    # Metrics configuration for monitoring batch classification performance
+    metrics:
+      enabled: true  # Enable comprehensive metrics collection
+      detailed_goroutine_tracking: true  # Track individual goroutine lifecycle
+      high_resolution_timing: false  # Use nanosecond precision timing
+      sample_rate: 1.0  # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%)
+
+      # Histogram buckets for metrics (directly configure what you need)
+      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

src/semantic-router/pkg/api/server.go

Lines changed: 64 additions & 0 deletions

@@ -11,6 +11,7 @@ import (
     "time"
 
     "github.com/vllm-project/semantic-router/semantic-router/pkg/config"
+    "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
     "github.com/vllm-project/semantic-router/semantic-router/pkg/services"
 )
 
@@ -91,6 +92,20 @@ func StartClassificationAPI(configPath string, port int) error {
         classificationSvc = services.NewPlaceholderClassificationService()
     }
 
+    // Initialize batch metrics configuration
+    if cfg != nil && cfg.API.BatchClassification.Metrics.Enabled {
+        metricsConfig := metrics.BatchMetricsConfig{
+            Enabled:                   cfg.API.BatchClassification.Metrics.Enabled,
+            DetailedGoroutineTracking: cfg.API.BatchClassification.Metrics.DetailedGoroutineTracking,
+            DurationBuckets:           cfg.API.BatchClassification.Metrics.DurationBuckets,
+            SizeBuckets:               cfg.API.BatchClassification.Metrics.SizeBuckets,
+            BatchSizeRanges:           cfg.API.BatchClassification.Metrics.BatchSizeRanges,
+            HighResolutionTiming:      cfg.API.BatchClassification.Metrics.HighResolutionTiming,
+            SampleRate:                cfg.API.BatchClassification.Metrics.SampleRate,
+        }
+        metrics.SetBatchMetricsConfig(metricsConfig)
+    }
+
     // Create server instance
     apiServer := &ClassificationAPIServer{
         classificationSvc: classificationSvc,
@@ -231,6 +246,8 @@ func (s *ClassificationAPIServer) handleBatchClassification(w http.ResponseWrite
 
     // Input validation
     if len(req.Texts) == 0 {
+        // Record validation error in metrics
+        metrics.RecordBatchClassificationError("validation", "empty_texts")
        s.writeErrorResponse(w, http.StatusBadRequest, "INVALID_INPUT", "texts array cannot be empty")
        return
     }
@@ -242,6 +259,8 @@ func (s *ClassificationAPIServer) handleBatchClassification(w http.ResponseWrite
     }
 
     if len(req.Texts) > maxBatchSize {
+        // Record validation error in metrics
+        metrics.RecordBatchClassificationError("validation", "batch_too_large")
        s.writeErrorResponse(w, http.StatusBadRequest, "BATCH_TOO_LARGE",
            fmt.Sprintf("batch size cannot exceed %d texts", maxBatchSize))
        return
@@ -494,10 +513,26 @@ func (s *ClassificationAPIServer) getSystemInfo() SystemInfo {
 
 // processSequentially handles small batches with sequential processing
 func (s *ClassificationAPIServer) processSequentially(texts []string, options *ClassificationOptions) ([]services.Classification, error) {
+    start := time.Now()
+    processingType := "sequential"
+    batchSize := len(texts)
+
+    // Record request and batch size metrics
+    metrics.RecordBatchClassificationRequest(processingType)
+    metrics.RecordBatchSizeDistribution(processingType, batchSize)
+
+    // Defer recording processing time and text count
+    defer func() {
+        duration := time.Since(start).Seconds()
+        metrics.RecordBatchClassificationDuration(processingType, batchSize, duration)
+        metrics.RecordBatchClassificationTexts(processingType, batchSize)
+    }()
+
     results := make([]services.Classification, len(texts))
     for i, text := range texts {
         result, err := s.classifySingleText(text, options)
         if err != nil {
+            metrics.RecordBatchClassificationError(processingType, "classification_failed")
            return nil, fmt.Errorf("failed to classify text at index %d: %w", i, err)
         }
         results[i] = result
@@ -507,6 +542,22 @@ func (s *ClassificationAPIServer) processSequentially(texts []string, options *C
 
 // processConcurrently handles large batches with concurrent processing
 func (s *ClassificationAPIServer) processConcurrently(texts []string, options *ClassificationOptions) ([]services.Classification, error) {
+    start := time.Now()
+    processingType := "concurrent"
+    batchSize := len(texts)
+    batchID := fmt.Sprintf("batch_%d", time.Now().UnixNano())
+
+    // Record request and batch size metrics
+    metrics.RecordBatchClassificationRequest(processingType)
+    metrics.RecordBatchSizeDistribution(processingType, batchSize)
+
+    // Defer recording processing time and text count
+    defer func() {
+        duration := time.Since(start).Seconds()
+        metrics.RecordBatchClassificationDuration(processingType, batchSize, duration)
+        metrics.RecordBatchClassificationTexts(processingType, batchSize)
+    }()
+
     // Get max concurrency from config, default to 8
     maxConcurrency := 8
     if s.config != nil && s.config.API.BatchClassification.MaxConcurrency > 0 {
@@ -523,6 +574,18 @@ func (s *ClassificationAPIServer) processConcurrently(texts []string, options *C
         wg.Add(1)
         go func(index int, txt string) {
             defer wg.Done()
+
+            // Record goroutine start (if detailed tracking is enabled)
+            metricsConfig := metrics.GetBatchMetricsConfig()
+            if metricsConfig.DetailedGoroutineTracking {
+                metrics.ConcurrentGoroutines.WithLabelValues(batchID).Inc()
+
+                defer func() {
+                    // Record goroutine end
+                    metrics.ConcurrentGoroutines.WithLabelValues(batchID).Dec()
+                }()
+            }
+
             semaphore <- struct{}{}
             defer func() { <-semaphore }()
 
@@ -532,6 +595,7 @@ func (s *ClassificationAPIServer) processConcurrently(texts []string, options *C
             result, err := s.classifySingleText(txt, options)
             if err != nil {
                 errors[index] = err
+                metrics.RecordBatchClassificationError(processingType, "classification_failed")
                return
             }
             results[index] = result
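The pkg/metrics helpers called above (RecordBatchClassificationRequest, RecordBatchClassificationError, RecordBatchClassificationDuration, the ConcurrentGoroutines gauge, etc.) are not part of this diff. As a rough, non-authoritative sketch of how such helpers could be backed by prometheus/client_golang: the metric names, help strings, and label sets below are assumptions, not confirmed by this commit.

// Sketch only: not the repository's pkg/metrics implementation.
// Metric names, help strings, and label sets are assumed for illustration.
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    batchRequests = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "batch_classification_requests_total", // assumed name
            Help: "Batch classification requests by processing type.",
        },
        []string{"processing_type"},
    )

    batchErrors = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "batch_classification_errors_total", // assumed name
            Help: "Batch classification errors by stage/processing type and reason.",
        },
        []string{"stage", "reason"},
    )

    batchDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "batch_classification_duration_seconds", // assumed name
            Help: "Batch processing latency in seconds.",
            // In the real package these would presumably come from
            // BatchMetricsConfig.DurationBuckets rather than being hardcoded.
            Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
        },
        []string{"processing_type"},
    )

    // ConcurrentGoroutines is referenced directly in processConcurrently above.
    ConcurrentGoroutines = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "batch_classification_concurrent_goroutines", // assumed name
            Help: "Goroutines currently active per in-flight batch.",
        },
        []string{"batch_id"},
    )
)

// RecordBatchClassificationRequest counts one incoming batch request.
func RecordBatchClassificationRequest(processingType string) {
    batchRequests.WithLabelValues(processingType).Inc()
}

// RecordBatchClassificationError counts a failure for a given stage and reason.
func RecordBatchClassificationError(stage, reason string) {
    batchErrors.WithLabelValues(stage, reason).Inc()
}

// RecordBatchClassificationDuration observes how long a batch took to process.
// batchSize could feed a size-range label; it is ignored in this sketch.
func RecordBatchClassificationDuration(processingType string, batchSize int, seconds float64) {
    batchDuration.WithLabelValues(processingType).Observe(seconds)
}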

src/semantic-router/pkg/api/server_test.go

Lines changed: 14 additions & 6 deletions

@@ -217,13 +217,17 @@ func TestBatchClassificationConfiguration(t *testing.T) {
     config: &config.RouterConfig{
         API: config.APIConfig{
             BatchClassification: struct {
-                MaxBatchSize         int `yaml:"max_batch_size,omitempty"`
-                ConcurrencyThreshold int `yaml:"concurrency_threshold,omitempty"`
-                MaxConcurrency       int `yaml:"max_concurrency,omitempty"`
+                MaxBatchSize         int                                     `yaml:"max_batch_size,omitempty"`
+                ConcurrencyThreshold int                                     `yaml:"concurrency_threshold,omitempty"`
+                MaxConcurrency       int                                     `yaml:"max_concurrency,omitempty"`
+                Metrics              config.BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
             }{
                 MaxBatchSize:         3, // Custom small limit
                 ConcurrencyThreshold: 2,
                 MaxConcurrency:       4,
+                Metrics: config.BatchClassificationMetricsConfig{
+                    Enabled: true,
+                },
             },
         },
     },
@@ -253,13 +257,17 @@ func TestBatchClassificationConfiguration(t *testing.T) {
     config: &config.RouterConfig{
         API: config.APIConfig{
             BatchClassification: struct {
-                MaxBatchSize         int `yaml:"max_batch_size,omitempty"`
-                ConcurrencyThreshold int `yaml:"concurrency_threshold,omitempty"`
-                MaxConcurrency       int `yaml:"max_concurrency,omitempty"`
+                MaxBatchSize         int                                     `yaml:"max_batch_size,omitempty"`
+                ConcurrencyThreshold int                                     `yaml:"concurrency_threshold,omitempty"`
+                MaxConcurrency       int                                     `yaml:"max_concurrency,omitempty"`
+                Metrics              config.BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
            }{
                 MaxBatchSize:         10,
                 ConcurrencyThreshold: 3,
                 MaxConcurrency:       2,
+                Metrics: config.BatchClassificationMetricsConfig{
+                    Enabled: true,
+                },
             },
         },
     },

src/semantic-router/pkg/config/config.go

Lines changed: 33 additions & 0 deletions

@@ -94,9 +94,42 @@ type APIConfig struct {
 
         // Maximum number of concurrent goroutines for batch processing
         MaxConcurrency int `yaml:"max_concurrency,omitempty"`
+
+        // Metrics configuration for batch classification monitoring
+        Metrics BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
     } `yaml:"batch_classification"`
 }
 
+// BatchClassificationMetricsConfig represents configuration for batch classification metrics
+type BatchClassificationMetricsConfig struct {
+    // Sample rate for metrics collection (0.0-1.0, 1.0 means collect all metrics)
+    SampleRate float64 `yaml:"sample_rate,omitempty"`
+
+    // Batch size range labels for metrics (optional - uses sensible defaults if not specified)
+    // Default ranges: "1", "2-5", "6-10", "11-20", "21-50", "50+"
+    BatchSizeRanges []BatchSizeRangeConfig `yaml:"batch_size_ranges,omitempty"`
+
+    // Histogram buckets for metrics (directly configured)
+    DurationBuckets []float64 `yaml:"duration_buckets,omitempty"`
+    SizeBuckets     []float64 `yaml:"size_buckets,omitempty"`
+
+    // Enable detailed metrics collection
+    Enabled bool `yaml:"enabled,omitempty"`
+
+    // Enable detailed goroutine tracking (may impact performance)
+    DetailedGoroutineTracking bool `yaml:"detailed_goroutine_tracking,omitempty"`
+
+    // Enable high-resolution timing (nanosecond precision)
+    HighResolutionTiming bool `yaml:"high_resolution_timing,omitempty"`
+}
+
+// BatchSizeRangeConfig defines a batch size range with its boundaries and label
+type BatchSizeRangeConfig struct {
+    Min   int    `yaml:"min"`
+    Max   int    `yaml:"max"` // -1 means no upper limit
+    Label string `yaml:"label"`
+}
+
 // PromptGuardConfig represents configuration for the prompt guard jailbreak detection
 type PromptGuardConfig struct {
     // Enable prompt guard jailbreak detection
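The config/config.yaml change above does not set batch_size_ranges, so the documented defaults apply. As an illustration only, an explicit block matching BatchSizeRangeConfig could look like the following; the boundaries are hypothetical, chosen to mirror the default labels:

# Hypothetical batch_size_ranges override mirroring the documented defaults
# ("1", "2-5", "6-10", "11-20", "21-50", "50+"); max: -1 means no upper limit.
metrics:
  enabled: true
  batch_size_ranges:
    - { min: 1,  max: 1,  label: "1" }
    - { min: 2,  max: 5,  label: "2-5" }
    - { min: 6,  max: 10, label: "6-10" }
    - { min: 11, max: 20, label: "11-20" }
    - { min: 21, max: 50, label: "21-50" }
    - { min: 51, max: -1, label: "50+" }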
