
Commit 985eaac

Merge branch 'main' into docs/improve-contributing
2 parents f9ea384 + 420f7bc commit 985eaac

File tree: 10 files changed (+1012, -70 lines)


Makefile

Lines changed: 20 additions & 5 deletions

@@ -11,20 +11,35 @@ build: rust build-router
 
 # Build the Rust library
 rust:
-    @echo "Building Rust library..."
-    cd candle-binding && cargo build --release
+    @echo "Ensuring rust is installed..."
+    @bash -c 'if ! command -v rustc >/dev/null 2>&1; then \
+        echo "rustc not found, installing..."; \
+        curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
+    fi && \
+    if [ -f "$$HOME/.cargo/env" ]; then \
+        echo "Loading Rust environment from $$HOME/.cargo/env..." && \
+        . $$HOME/.cargo/env; \
+    fi && \
+    if ! command -v cargo >/dev/null 2>&1; then \
+        echo "Error: cargo not found in PATH" && exit 1; \
+    fi && \
+    echo "Building Rust library..." && \
+    cd candle-binding && cargo build --release'
 
 # Build router
 build-router: rust
     @echo "Building router..."
     @mkdir -p bin
     @cd src/semantic-router && go build -o ../../bin/router cmd/main.go
 
+# Config file path with default
+CONFIG_FILE ?= config/config.yaml
+
 # Run the router
-run-router: build-router
-    @echo "Running router..."
+run-router: build-router download-models
+    @echo "Running router with config: ${CONFIG_FILE}"
     @export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
-    ./bin/router -config=config/config.yaml
+    ./bin/router -config=${CONFIG_FILE}
 
 # Prepare Envoy
 prepare-envoy:
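Note: because CONFIG_FILE is assigned with ?=, it can be overridden per invocation without editing the Makefile. A minimal usage sketch (the alternate config path below is illustrative, not a file in this commit):

# Run with the default config (config/config.yaml)
make run-router

# Point the router at another config file (hypothetical path)
make run-router CONFIG_FILE=config/local.yaml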

bench/run_bench.sh

Mode changed: 100644 → 100755
Lines changed: 83 additions & 29 deletions

@@ -1,34 +1,88 @@
 #!/bin/bash
 
-set -x
+# Example usage:
+# Quick run:
+# SAMPLES_PER_CATEGORY=5 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh
+# Long run:
+# SAMPLES_PER_CATEGORY=100 CONCURRENT_REQUESTS=4 VLLM_MODELS="openai/gpt-oss-20b" ROUTER_MODELS="auto" ./run_bench.sh
+# To test only router:
+# BENCHMARK_ROUTER_ONLY=true ./run_bench.sh
 
-export ROUTER_API_KEY="1234567890"
-export VLLM_API_KEY="1234567890"
-export ROUTER_ENDPOINT="http://localhost:8801/v1"
-export VLLM_ENDPOINT="http://localhost:8000/v1"
-export ROUTER_MODELS="auto"
-export VLLM_MODELS="openai/gpt-oss-20b"
+set -x -e
+
+export ROUTER_API_KEY="${ROUTER_API_KEY:-1234567890}"
+export VLLM_API_KEY="${VLLM_API_KEY:-1234567890}"
+export ROUTER_ENDPOINT="${ROUTER_ENDPOINT:-http://localhost:8801/v1}"
+export VLLM_ENDPOINT="${VLLM_ENDPOINT:-http://localhost:8000/v1}"
+export ROUTER_MODELS="${ROUTER_MODELS:-auto}"
+export VLLM_MODELS="${VLLM_MODELS:-openai/gpt-oss-20b}"
+export SAMPLES_PER_CATEGORY="${SAMPLES_PER_CATEGORY:-5}"
+export CONCURRENT_REQUESTS="${CONCURRENT_REQUESTS:-4}"
+export BENCHMARK_ROUTER_ONLY="${BENCHMARK_ROUTER_ONLY:-false}"
 
 # Run the benchmark
-python router_reason_bench.py \
-  --run-router \
-  --router-endpoint "$ROUTER_ENDPOINT" \
-  --router-api-key "$ROUTER_API_KEY" \
-  --router-models "$ROUTER_MODELS" \
-  --run-vllm \
-  --vllm-endpoint "$VLLM_ENDPOINT" \
-  --vllm-api-key "$VLLM_API_KEY" \
-  --vllm-models "$VLLM_MODELS" \
-  --samples-per-category 5 \
-  --vllm-exec-modes NR XC \
-  --concurrent-requests 4 \
-  --output-dir results/reasonbench
-
-# Generate plots
-VLLM_MODEL_FIRST="${VLLM_MODELS%% *}"
-ROUTER_MODEL_FIRST="${ROUTER_MODELS%% *}"
-VLLM_MODELS_SAFE="${VLLM_MODEL_FIRST//\//_}"
-ROUTER_MODELS_SAFE="${ROUTER_MODEL_FIRST//\//_}"
-python bench_plot.py \
-  --summary "results/reasonbench/vllm::${VLLM_MODELS_SAFE}/summary.json" \
-  --router-summary "results/reasonbench/router::${ROUTER_MODELS_SAFE}/summary.json"
+if [ "${BENCHMARK_ROUTER_ONLY}" = "true" ]; then
+  echo "Running router-only benchmark"
+  python bench/router_reason_bench.py \
+    --run-router \
+    --router-endpoint "$ROUTER_ENDPOINT" \
+    --router-api-key "$ROUTER_API_KEY" \
+    --router-models "$ROUTER_MODELS" \
+    --samples-per-category "$SAMPLES_PER_CATEGORY" \
+    --concurrent-requests "$CONCURRENT_REQUESTS" \
+    --output-dir results/reasonbench
+else
+  echo "Running full benchmark (router + vLLM)..."
+  python bench/router_reason_bench.py \
+    --run-router \
+    --router-endpoint "$ROUTER_ENDPOINT" \
+    --router-api-key "$ROUTER_API_KEY" \
+    --router-models "$ROUTER_MODELS" \
+    --run-vllm \
+    --vllm-endpoint "$VLLM_ENDPOINT" \
+    --vllm-api-key "$VLLM_API_KEY" \
+    --vllm-models "$VLLM_MODELS" \
+    --samples-per-category "$SAMPLES_PER_CATEGORY" \
+    --vllm-exec-modes NR XC \
+    --concurrent-requests "$CONCURRENT_REQUESTS" \
+    --output-dir results/reasonbench
+fi
+
+# Generate plots if summary files exist
+echo "Checking for plot generation..."
+echo "VLLM_MODELS: $VLLM_MODELS"
+echo "ROUTER_MODELS: $ROUTER_MODELS"
+
+# Get first model name and make it path-safe
+VLLM_MODEL_FIRST=$(echo "$VLLM_MODELS" | cut -d' ' -f1)
+ROUTER_MODEL_FIRST=$(echo "$ROUTER_MODELS" | cut -d' ' -f1)
+echo "First models: VLLM=$VLLM_MODEL_FIRST, Router=$ROUTER_MODEL_FIRST"
+
+# Replace / with _ for path safety
+VLLM_MODELS_SAFE=$(echo "$VLLM_MODEL_FIRST" | tr '/' '_')
+ROUTER_MODELS_SAFE=$(echo "$ROUTER_MODEL_FIRST" | tr '/' '_')
+echo "Safe paths: VLLM=$VLLM_MODELS_SAFE, Router=$ROUTER_MODELS_SAFE"
+
+# Construct the full paths
+VLLM_SUMMARY="results/reasonbench/vllm::${VLLM_MODELS_SAFE}/summary.json"
+ROUTER_SUMMARY="results/reasonbench/router::${ROUTER_MODELS_SAFE}/summary.json"
+echo "Looking for summaries at:"
+echo "VLLM: $VLLM_SUMMARY"
+echo "Router: $ROUTER_SUMMARY"
+
+# Check if at least one summary file exists and generate plots
+if [ -f "$ROUTER_SUMMARY" ]; then
+  echo "Found router summary, generating plots..."
+  if [ -f "$VLLM_SUMMARY" ]; then
+    echo "Found both summaries, generating comparison plots..."
+    python bench/bench_plot.py \
+      --summary "$VLLM_SUMMARY" \
+      --router-summary "$ROUTER_SUMMARY"
+  else
+    echo "vLLM summary not found, generating router-only plots..."
+    python bench/bench_plot.py \
+      --router-summary "$ROUTER_SUMMARY"
+  fi
+else
+  echo "No router summary found, skipping plot generation"
+fi

config/config.yaml

Lines changed: 12 additions & 1 deletion

@@ -244,4 +244,15 @@ api:
   batch_classification:
     max_batch_size: 100  # Maximum number of texts in a single batch
     concurrency_threshold: 5  # Switch to concurrent processing when batch size > this value
-    max_concurrency: 8  # Maximum number of concurrent goroutines
+    max_concurrency: 8  # Maximum number of concurrent goroutines
+
+    # Metrics configuration for monitoring batch classification performance
+    metrics:
+      enabled: true  # Enable comprehensive metrics collection
+      detailed_goroutine_tracking: true  # Track individual goroutine lifecycle
+      high_resolution_timing: false  # Use nanosecond precision timing
+      sample_rate: 1.0  # Collect metrics for all requests (1.0 = 100%, 0.5 = 50%)
+
+      # Histogram buckets for metrics (directly configure what you need)
+      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+      size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

src/semantic-router/pkg/api/server.go

Lines changed: 64 additions & 0 deletions

@@ -11,6 +11,7 @@ import (
     "time"
 
     "github.com/vllm-project/semantic-router/semantic-router/pkg/config"
+    "github.com/vllm-project/semantic-router/semantic-router/pkg/metrics"
     "github.com/vllm-project/semantic-router/semantic-router/pkg/services"
 )
 
@@ -91,6 +92,20 @@ func StartClassificationAPI(configPath string, port int) error {
         classificationSvc = services.NewPlaceholderClassificationService()
     }
 
+    // Initialize batch metrics configuration
+    if cfg != nil && cfg.API.BatchClassification.Metrics.Enabled {
+        metricsConfig := metrics.BatchMetricsConfig{
+            Enabled:                   cfg.API.BatchClassification.Metrics.Enabled,
+            DetailedGoroutineTracking: cfg.API.BatchClassification.Metrics.DetailedGoroutineTracking,
+            DurationBuckets:           cfg.API.BatchClassification.Metrics.DurationBuckets,
+            SizeBuckets:               cfg.API.BatchClassification.Metrics.SizeBuckets,
+            BatchSizeRanges:           cfg.API.BatchClassification.Metrics.BatchSizeRanges,
+            HighResolutionTiming:      cfg.API.BatchClassification.Metrics.HighResolutionTiming,
+            SampleRate:                cfg.API.BatchClassification.Metrics.SampleRate,
+        }
+        metrics.SetBatchMetricsConfig(metricsConfig)
+    }
+
     // Create server instance
     apiServer := &ClassificationAPIServer{
         classificationSvc: classificationSvc,
@@ -231,6 +246,8 @@ func (s *ClassificationAPIServer) handleBatchClassification(w http.ResponseWrite
 
     // Input validation
     if len(req.Texts) == 0 {
+        // Record validation error in metrics
+        metrics.RecordBatchClassificationError("validation", "empty_texts")
        s.writeErrorResponse(w, http.StatusBadRequest, "INVALID_INPUT", "texts array cannot be empty")
        return
     }
@@ -242,6 +259,8 @@ func (s *ClassificationAPIServer) handleBatchClassification(w http.ResponseWrite
     }
 
     if len(req.Texts) > maxBatchSize {
+        // Record validation error in metrics
+        metrics.RecordBatchClassificationError("validation", "batch_too_large")
        s.writeErrorResponse(w, http.StatusBadRequest, "BATCH_TOO_LARGE",
            fmt.Sprintf("batch size cannot exceed %d texts", maxBatchSize))
        return
@@ -494,10 +513,26 @@ func (s *ClassificationAPIServer) getSystemInfo() SystemInfo {
 
 // processSequentially handles small batches with sequential processing
 func (s *ClassificationAPIServer) processSequentially(texts []string, options *ClassificationOptions) ([]services.Classification, error) {
+    start := time.Now()
+    processingType := "sequential"
+    batchSize := len(texts)
+
+    // Record request and batch size metrics
+    metrics.RecordBatchClassificationRequest(processingType)
+    metrics.RecordBatchSizeDistribution(processingType, batchSize)
+
+    // Defer recording processing time and text count
+    defer func() {
+        duration := time.Since(start).Seconds()
+        metrics.RecordBatchClassificationDuration(processingType, batchSize, duration)
+        metrics.RecordBatchClassificationTexts(processingType, batchSize)
+    }()
+
     results := make([]services.Classification, len(texts))
     for i, text := range texts {
         result, err := s.classifySingleText(text, options)
         if err != nil {
+            metrics.RecordBatchClassificationError(processingType, "classification_failed")
            return nil, fmt.Errorf("failed to classify text at index %d: %w", i, err)
         }
         results[i] = result
@@ -507,6 +542,22 @@ func (s *ClassificationAPIServer) processSequentially(texts []string, options *C
 
 // processConcurrently handles large batches with concurrent processing
 func (s *ClassificationAPIServer) processConcurrently(texts []string, options *ClassificationOptions) ([]services.Classification, error) {
+    start := time.Now()
+    processingType := "concurrent"
+    batchSize := len(texts)
+    batchID := fmt.Sprintf("batch_%d", time.Now().UnixNano())
+
+    // Record request and batch size metrics
+    metrics.RecordBatchClassificationRequest(processingType)
+    metrics.RecordBatchSizeDistribution(processingType, batchSize)
+
+    // Defer recording processing time and text count
+    defer func() {
+        duration := time.Since(start).Seconds()
+        metrics.RecordBatchClassificationDuration(processingType, batchSize, duration)
+        metrics.RecordBatchClassificationTexts(processingType, batchSize)
+    }()
+
     // Get max concurrency from config, default to 8
     maxConcurrency := 8
     if s.config != nil && s.config.API.BatchClassification.MaxConcurrency > 0 {
@@ -523,6 +574,18 @@ func (s *ClassificationAPIServer) processConcurrently(texts []string, options *C
         wg.Add(1)
         go func(index int, txt string) {
             defer wg.Done()
+
+            // Record goroutine start (if detailed tracking is enabled)
+            metricsConfig := metrics.GetBatchMetricsConfig()
+            if metricsConfig.DetailedGoroutineTracking {
+                metrics.ConcurrentGoroutines.WithLabelValues(batchID).Inc()
+
+                defer func() {
+                    // Record goroutine end
+                    metrics.ConcurrentGoroutines.WithLabelValues(batchID).Dec()
+                }()
+            }
+
             semaphore <- struct{}{}
             defer func() { <-semaphore }()
 
@@ -532,6 +595,7 @@ func (s *ClassificationAPIServer) processConcurrently(texts []string, options *C
             result, err := s.classifySingleText(txt, options)
             if err != nil {
                 errors[index] = err
+                metrics.RecordBatchClassificationError(processingType, "classification_failed")
                return
             }
             results[index] = result
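The pkg/metrics helpers called above (RecordBatchClassificationRequest, RecordBatchClassificationError, RecordBatchClassificationDuration, the ConcurrentGoroutines gauge, etc.) are not part of this diff. As a rough, non-authoritative sketch of how such helpers could be backed by prometheus/client_golang: the metric names, help strings, and label sets below are assumptions, not confirmed by this commit.

// Sketch only: not the repository's pkg/metrics implementation.
// Metric names, help strings, and label sets are assumed for illustration.
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

var (
    batchRequests = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "batch_classification_requests_total", // assumed name
            Help: "Batch classification requests by processing type.",
        },
        []string{"processing_type"},
    )

    batchErrors = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "batch_classification_errors_total", // assumed name
            Help: "Batch classification errors by stage/processing type and reason.",
        },
        []string{"stage", "reason"},
    )

    batchDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name: "batch_classification_duration_seconds", // assumed name
            Help: "Batch processing latency in seconds.",
            // In the real package these would presumably come from
            // BatchMetricsConfig.DurationBuckets rather than being hardcoded.
            Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
        },
        []string{"processing_type"},
    )

    // ConcurrentGoroutines is referenced directly in processConcurrently above.
    ConcurrentGoroutines = promauto.NewGaugeVec(
        prometheus.GaugeOpts{
            Name: "batch_classification_concurrent_goroutines", // assumed name
            Help: "Goroutines currently active per in-flight batch.",
        },
        []string{"batch_id"},
    )
)

// RecordBatchClassificationRequest counts one incoming batch request.
func RecordBatchClassificationRequest(processingType string) {
    batchRequests.WithLabelValues(processingType).Inc()
}

// RecordBatchClassificationError counts a failure for a given stage and reason.
func RecordBatchClassificationError(stage, reason string) {
    batchErrors.WithLabelValues(stage, reason).Inc()
}

// RecordBatchClassificationDuration observes how long a batch took to process.
// batchSize could feed a size-range label; it is ignored in this sketch.
func RecordBatchClassificationDuration(processingType string, batchSize int, seconds float64) {
    batchDuration.WithLabelValues(processingType).Observe(seconds)
}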

src/semantic-router/pkg/api/server_test.go

Lines changed: 14 additions & 6 deletions

@@ -217,13 +217,17 @@ func TestBatchClassificationConfiguration(t *testing.T) {
     config: &config.RouterConfig{
         API: config.APIConfig{
             BatchClassification: struct {
-                MaxBatchSize         int `yaml:"max_batch_size,omitempty"`
-                ConcurrencyThreshold int `yaml:"concurrency_threshold,omitempty"`
-                MaxConcurrency       int `yaml:"max_concurrency,omitempty"`
+                MaxBatchSize         int                                     `yaml:"max_batch_size,omitempty"`
+                ConcurrencyThreshold int                                     `yaml:"concurrency_threshold,omitempty"`
+                MaxConcurrency       int                                     `yaml:"max_concurrency,omitempty"`
+                Metrics              config.BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
             }{
                 MaxBatchSize:         3, // Custom small limit
                 ConcurrencyThreshold: 2,
                 MaxConcurrency:       4,
+                Metrics: config.BatchClassificationMetricsConfig{
+                    Enabled: true,
+                },
             },
         },
     },
@@ -253,13 +257,17 @@ func TestBatchClassificationConfiguration(t *testing.T) {
     config: &config.RouterConfig{
         API: config.APIConfig{
             BatchClassification: struct {
-                MaxBatchSize         int `yaml:"max_batch_size,omitempty"`
-                ConcurrencyThreshold int `yaml:"concurrency_threshold,omitempty"`
-                MaxConcurrency       int `yaml:"max_concurrency,omitempty"`
+                MaxBatchSize         int                                     `yaml:"max_batch_size,omitempty"`
+                ConcurrencyThreshold int                                     `yaml:"concurrency_threshold,omitempty"`
+                MaxConcurrency       int                                     `yaml:"max_concurrency,omitempty"`
+                Metrics              config.BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
            }{
                 MaxBatchSize:         10,
                 ConcurrencyThreshold: 3,
                 MaxConcurrency:       2,
+                Metrics: config.BatchClassificationMetricsConfig{
+                    Enabled: true,
+                },
             },
         },
     },

src/semantic-router/pkg/config/config.go

Lines changed: 33 additions & 0 deletions

@@ -94,9 +94,42 @@ type APIConfig struct {
 
         // Maximum number of concurrent goroutines for batch processing
         MaxConcurrency int `yaml:"max_concurrency,omitempty"`
+
+        // Metrics configuration for batch classification monitoring
+        Metrics BatchClassificationMetricsConfig `yaml:"metrics,omitempty"`
     } `yaml:"batch_classification"`
 }
 
+// BatchClassificationMetricsConfig represents configuration for batch classification metrics
+type BatchClassificationMetricsConfig struct {
+    // Sample rate for metrics collection (0.0-1.0, 1.0 means collect all metrics)
+    SampleRate float64 `yaml:"sample_rate,omitempty"`
+
+    // Batch size range labels for metrics (optional - uses sensible defaults if not specified)
+    // Default ranges: "1", "2-5", "6-10", "11-20", "21-50", "50+"
+    BatchSizeRanges []BatchSizeRangeConfig `yaml:"batch_size_ranges,omitempty"`
+
+    // Histogram buckets for metrics (directly configured)
+    DurationBuckets []float64 `yaml:"duration_buckets,omitempty"`
+    SizeBuckets     []float64 `yaml:"size_buckets,omitempty"`
+
+    // Enable detailed metrics collection
+    Enabled bool `yaml:"enabled,omitempty"`
+
+    // Enable detailed goroutine tracking (may impact performance)
+    DetailedGoroutineTracking bool `yaml:"detailed_goroutine_tracking,omitempty"`
+
+    // Enable high-resolution timing (nanosecond precision)
+    HighResolutionTiming bool `yaml:"high_resolution_timing,omitempty"`
+}
+
+// BatchSizeRangeConfig defines a batch size range with its boundaries and label
+type BatchSizeRangeConfig struct {
+    Min   int    `yaml:"min"`
+    Max   int    `yaml:"max"` // -1 means no upper limit
+    Label string `yaml:"label"`
+}
+
 // PromptGuardConfig represents configuration for the prompt guard jailbreak detection
 type PromptGuardConfig struct {
     // Enable prompt guard jailbreak detection
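The config/config.yaml change above does not set batch_size_ranges, so the documented defaults apply. As an illustration only, an explicit block matching BatchSizeRangeConfig could look like the following; the boundaries are hypothetical, chosen to mirror the default labels:

# Hypothetical batch_size_ranges override mirroring the documented defaults
# ("1", "2-5", "6-10", "11-20", "21-50", "50+"); max: -1 means no upper limit.
metrics:
  enabled: true
  batch_size_ranges:
    - { min: 1,  max: 1,  label: "1" }
    - { min: 2,  max: 5,  label: "2-5" }
    - { min: 6,  max: 10, label: "6-10" }
    - { min: 11, max: 20, label: "11-20" }
    - { min: 21, max: 50, label: "21-50" }
    - { min: 51, max: -1, label: "50+" }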
