Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/config.development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ semantic_cache:
max_entries: 100
ttl_seconds: 600
eviction_policy: "fifo"
use_hnsw: true # Enable HNSW for faster search
hnsw_m: 16
hnsw_ef_construction: 200

tools:
enabled: false
Expand Down
4 changes: 4 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ semantic_cache:
max_entries: 1000 # Only applies to memory backend
ttl_seconds: 3600
eviction_policy: "fifo"
# HNSW index configuration (for memory backend only)
use_hnsw: true # Enable HNSW index for faster similarity search
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)

tools:
enabled: true
Expand Down
7 changes: 5 additions & 2 deletions src/semantic-router/pkg/cache/cache_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
switch config.BackendType {
case InMemoryCacheType, "":
// Use in-memory cache as the default backend
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f",
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold)
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t",
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW)
options := InMemoryCacheOptions{
Enabled: config.Enabled,
SimilarityThreshold: config.SimilarityThreshold,
MaxEntries: config.MaxEntries,
TTLSeconds: config.TTLSeconds,
EvictionPolicy: config.EvictionPolicy,
UseHNSW: config.UseHNSW,
HNSWM: config.HNSWM,
HNSWEfConstruction: config.HNSWEfConstruction,
}
return NewInMemoryCache(options), nil

Expand Down
9 changes: 9 additions & 0 deletions src/semantic-router/pkg/cache/cache_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,13 @@ type CacheConfig struct {

// BackendConfigPath points to backend-specific configuration files
BackendConfigPath string `yaml:"backend_config_path,omitempty"`

// UseHNSW enables HNSW index for faster search in memory backend
UseHNSW bool `yaml:"use_hnsw,omitempty"`

// HNSWM is the number of bi-directional links per node (default: 16)
HNSWM int `yaml:"hnsw_m,omitempty"`

// HNSWEfConstruction is the size of dynamic candidate list during construction (default: 200)
HNSWEfConstruction int `yaml:"hnsw_ef_construction,omitempty"`
}
216 changes: 216 additions & 0 deletions src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
package cache

import (
	"fmt"
	"os"
	"strings"
	"testing"

	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
)

// ContentLength defines different query content sizes, expressed as an
// approximate word count per generated query.
type ContentLength int

const (
	ShortContent  ContentLength = 20  // ~20 words
	MediumContent ContentLength = 50  // ~50 words
	LongContent   ContentLength = 100 // ~100 words
)

// String returns the human-readable label used in benchmark names and CSV
// output; any value outside the defined constants maps to "unknown".
func (c ContentLength) String() string {
	labels := map[ContentLength]string{
		ShortContent:  "short",
		MediumContent: "medium",
		LongContent:   "long",
	}
	if label, ok := labels[c]; ok {
		return label
	}
	return "unknown"
}

// GenerateQuery generates a query of specified length
func generateQuery(length ContentLength, index int) string {
words := []string{
"machine", "learning", "artificial", "intelligence", "neural", "network",
"deep", "training", "model", "algorithm", "data", "science", "prediction",
"classification", "regression", "supervised", "unsupervised", "reinforcement",
"optimization", "gradient", "descent", "backpropagation", "activation",
"function", "layer", "convolutional", "recurrent", "transformer", "attention",
"embedding", "vector", "semantic", "similarity", "clustering", "feature",
}

query := fmt.Sprintf("Query %d: ", index)
for i := 0; i < int(length); i++ {
query += words[i%len(words)] + " "
}
return query
}

// BenchmarkComprehensive runs comprehensive benchmarks across multiple dimensions
func BenchmarkComprehensive(b *testing.B) {
// Initialize BERT model
useCPU := os.Getenv("USE_CPU") != "false" // Default to CPU
modelName := "sentence-transformers/all-MiniLM-L6-v2"
if err := candle_binding.InitModel(modelName, useCPU); err != nil {
b.Skipf("Failed to initialize BERT model: %v", err)
}

// Determine hardware type
hardware := "cpu"
if !useCPU {
hardware = "gpu"
}

// Test configurations
cacheSizes := []int{100, 500, 1000, 5000}
contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}
hnswConfigs := []struct {
name string
m int
ef int
}{
{"default", 16, 200},
{"fast", 8, 100},
{"accurate", 32, 400},
}

// Open CSV file for results
csvFile, err := os.OpenFile("../../benchmark_results/benchmark_data.csv",
os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
b.Logf("Warning: Could not open CSV file: %v", err)
} else {
defer csvFile.Close()
}

// Run benchmarks
for _, cacheSize := range cacheSizes {
for _, contentLen := range contentLengths {
// Generate test data
testQueries := make([]string, cacheSize)
for i := 0; i < cacheSize; i++ {
testQueries[i] = generateQuery(contentLen, i)
}

// Benchmark Linear Search
b.Run(fmt.Sprintf("%s/Linear/%s/%dEntries", hardware, contentLen.String(), cacheSize), func(b *testing.B) {
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: false,
})

// Populate cache
for i, query := range testQueries {
reqID := fmt.Sprintf("req%d", i)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}

searchQuery := generateQuery(contentLen, cacheSize/2)
b.ResetTimer()

for i := 0; i < b.N; i++ {
_, _, _ = cache.FindSimilar("test-model", searchQuery)
}

b.StopTimer()

// Write to CSV
if csvFile != nil {
nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)

line := fmt.Sprintf("%s,%s,%d,linear,0,0,%.0f,0,0,%d,1.0\n",
hardware, contentLen.String(), cacheSize, nsPerOp, b.N)
csvFile.WriteString(line)
}
})

// Benchmark HNSW with different configurations
for _, hnswCfg := range hnswConfigs {
b.Run(fmt.Sprintf("%s/HNSW_%s/%s/%dEntries", hardware, hnswCfg.name, contentLen.String(), cacheSize), func(b *testing.B) {
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: true,
HNSWM: hnswCfg.m,
HNSWEfConstruction: hnswCfg.ef,
})

// Populate cache
for i, query := range testQueries {
reqID := fmt.Sprintf("req%d", i)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}

searchQuery := generateQuery(contentLen, cacheSize/2)
b.ResetTimer()

for i := 0; i < b.N; i++ {
_, _, _ = cache.FindSimilar("test-model", searchQuery)
}

b.StopTimer()

// Write to CSV
if csvFile != nil {
nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)

line := fmt.Sprintf("%s,%s,%d,hnsw_%s,%d,%d,%.0f,0,0,%d,0.0\n",
hardware, contentLen.String(), cacheSize, hnswCfg.name,
hnswCfg.m, hnswCfg.ef, nsPerOp, b.N)
csvFile.WriteString(line)
}
})
}
}
}
}

// BenchmarkIndexConstruction benchmarks HNSW index build time
func BenchmarkIndexConstruction(b *testing.B) {
if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
b.Skipf("Failed to initialize BERT model: %v", err)
}

cacheSizes := []int{100, 500, 1000, 5000}
contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}

for _, cacheSize := range cacheSizes {
for _, contentLen := range contentLengths {
testQueries := make([]string, cacheSize)
for i := 0; i < cacheSize; i++ {
testQueries[i] = generateQuery(contentLen, i)
}

b.Run(fmt.Sprintf("BuildIndex/%s/%dEntries", contentLen.String(), cacheSize), func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: true,
HNSWM: 16,
HNSWEfConstruction: 200,
})
b.StartTimer()

// Build index by adding entries
for j, query := range testQueries {
reqID := fmt.Sprintf("req%d", j)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}
}
})
}
}
}

Loading
Loading