Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions config/config.development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ semantic_cache:
max_entries: 100
ttl_seconds: 600
eviction_policy: "fifo"
use_hnsw: true # Enable HNSW for faster search
hnsw_m: 16
hnsw_ef_construction: 200

tools:
enabled: false
Expand Down
4 changes: 4 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ semantic_cache:
max_entries: 1000 # Only applies to memory backend
ttl_seconds: 3600
eviction_policy: "fifo"
# HNSW index configuration (for memory backend only)
use_hnsw: true # Enable HNSW index for faster similarity search
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)

tools:
enabled: true
Expand Down
7 changes: 5 additions & 2 deletions src/semantic-router/pkg/cache/cache_factory.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,17 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
switch config.BackendType {
case InMemoryCacheType, "":
// Use in-memory cache as the default backend
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f",
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold)
observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t",
config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW)
options := InMemoryCacheOptions{
Enabled: config.Enabled,
SimilarityThreshold: config.SimilarityThreshold,
MaxEntries: config.MaxEntries,
TTLSeconds: config.TTLSeconds,
EvictionPolicy: config.EvictionPolicy,
UseHNSW: config.UseHNSW,
HNSWM: config.HNSWM,
HNSWEfConstruction: config.HNSWEfConstruction,
}
return NewInMemoryCache(options), nil

Expand Down
9 changes: 9 additions & 0 deletions src/semantic-router/pkg/cache/cache_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,13 @@ type CacheConfig struct {

// BackendConfigPath points to backend-specific configuration files
BackendConfigPath string `yaml:"backend_config_path,omitempty"`

// UseHNSW enables HNSW index for faster search in memory backend
UseHNSW bool `yaml:"use_hnsw,omitempty"`

// HNSWM is the number of bi-directional links per node (default: 16)
HNSWM int `yaml:"hnsw_m,omitempty"`

// HNSWEfConstruction is the size of dynamic candidate list during construction (default: 200)
HNSWEfConstruction int `yaml:"hnsw_ef_construction,omitempty"`
}
216 changes: 216 additions & 0 deletions src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
package cache

import (
	"fmt"
	"os"
	"strings"
	"testing"

	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
)

// ContentLength defines different query content sizes, expressed as an
// approximate word count per generated query.
type ContentLength int

const (
	ShortContent  ContentLength = 20  // ~20 words
	MediumContent ContentLength = 50  // ~50 words
	LongContent   ContentLength = 100 // ~100 words
)

// String returns the human-readable label used in benchmark names and CSV
// output; any value outside the defined constants maps to "unknown".
func (c ContentLength) String() string {
	labels := map[ContentLength]string{
		ShortContent:  "short",
		MediumContent: "medium",
		LongContent:   "long",
	}
	if label, ok := labels[c]; ok {
		return label
	}
	return "unknown"
}

// GenerateQuery generates a query of specified length
func generateQuery(length ContentLength, index int) string {
words := []string{
"machine", "learning", "artificial", "intelligence", "neural", "network",
"deep", "training", "model", "algorithm", "data", "science", "prediction",
"classification", "regression", "supervised", "unsupervised", "reinforcement",
"optimization", "gradient", "descent", "backpropagation", "activation",
"function", "layer", "convolutional", "recurrent", "transformer", "attention",
"embedding", "vector", "semantic", "similarity", "clustering", "feature",
}

query := fmt.Sprintf("Query %d: ", index)
for i := 0; i < int(length); i++ {
query += words[i%len(words)] + " "
}
return query
}

// BenchmarkComprehensive runs comprehensive benchmarks across multiple dimensions
func BenchmarkComprehensive(b *testing.B) {
// Initialize BERT model
useCPU := os.Getenv("USE_CPU") != "false" // Default to CPU
modelName := "sentence-transformers/all-MiniLM-L6-v2"
if err := candle_binding.InitModel(modelName, useCPU); err != nil {
b.Skipf("Failed to initialize BERT model: %v", err)
}

// Determine hardware type
hardware := "cpu"
if !useCPU {
hardware = "gpu"
}

// Test configurations
cacheSizes := []int{100, 500, 1000, 5000}
contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}
hnswConfigs := []struct {
name string
m int
ef int
}{
{"default", 16, 200},
{"fast", 8, 100},
{"accurate", 32, 400},
}

// Open CSV file for results
csvFile, err := os.OpenFile("../../benchmark_results/benchmark_data.csv",
os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
if err != nil {
b.Logf("Warning: Could not open CSV file: %v", err)
} else {
defer csvFile.Close()
}

// Run benchmarks
for _, cacheSize := range cacheSizes {
for _, contentLen := range contentLengths {
// Generate test data
testQueries := make([]string, cacheSize)
for i := 0; i < cacheSize; i++ {
testQueries[i] = generateQuery(contentLen, i)
}

// Benchmark Linear Search
b.Run(fmt.Sprintf("%s/Linear/%s/%dEntries", hardware, contentLen.String(), cacheSize), func(b *testing.B) {
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: false,
})

// Populate cache
for i, query := range testQueries {
reqID := fmt.Sprintf("req%d", i)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}

searchQuery := generateQuery(contentLen, cacheSize/2)
b.ResetTimer()

for i := 0; i < b.N; i++ {
_, _, _ = cache.FindSimilar("test-model", searchQuery)
}

b.StopTimer()

// Write to CSV
if csvFile != nil {
nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)

line := fmt.Sprintf("%s,%s,%d,linear,0,0,%.0f,0,0,%d,1.0\n",
hardware, contentLen.String(), cacheSize, nsPerOp, b.N)
csvFile.WriteString(line)
}
})

// Benchmark HNSW with different configurations
for _, hnswCfg := range hnswConfigs {
b.Run(fmt.Sprintf("%s/HNSW_%s/%s/%dEntries", hardware, hnswCfg.name, contentLen.String(), cacheSize), func(b *testing.B) {
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: true,
HNSWM: hnswCfg.m,
HNSWEfConstruction: hnswCfg.ef,
})

// Populate cache
for i, query := range testQueries {
reqID := fmt.Sprintf("req%d", i)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}

searchQuery := generateQuery(contentLen, cacheSize/2)
b.ResetTimer()

for i := 0; i < b.N; i++ {
_, _, _ = cache.FindSimilar("test-model", searchQuery)
}

b.StopTimer()

// Write to CSV
if csvFile != nil {
nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)

line := fmt.Sprintf("%s,%s,%d,hnsw_%s,%d,%d,%.0f,0,0,%d,0.0\n",
hardware, contentLen.String(), cacheSize, hnswCfg.name,
hnswCfg.m, hnswCfg.ef, nsPerOp, b.N)
csvFile.WriteString(line)
}
})
}
}
}
}

// BenchmarkIndexConstruction benchmarks HNSW index build time
func BenchmarkIndexConstruction(b *testing.B) {
if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
b.Skipf("Failed to initialize BERT model: %v", err)
}

cacheSizes := []int{100, 500, 1000, 5000}
contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}

for _, cacheSize := range cacheSizes {
for _, contentLen := range contentLengths {
testQueries := make([]string, cacheSize)
for i := 0; i < cacheSize; i++ {
testQueries[i] = generateQuery(contentLen, i)
}

b.Run(fmt.Sprintf("BuildIndex/%s/%dEntries", contentLen.String(), cacheSize), func(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
b.StopTimer()
cache := NewInMemoryCache(InMemoryCacheOptions{
Enabled: true,
MaxEntries: cacheSize * 2,
SimilarityThreshold: 0.85,
TTLSeconds: 0,
UseHNSW: true,
HNSWM: 16,
HNSWEfConstruction: 200,
})
b.StartTimer()

// Build index by adding entries
for j, query := range testQueries {
reqID := fmt.Sprintf("req%d", j)
_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
}
}
})
}
}
}

Loading
Loading