From cdbeda0a4ae179800df04694954d491a2661024e Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Mon, 20 Oct 2025 16:52:47 +0000
Subject: [PATCH 01/13] feat: add HNSW index to inmemory semantic cache and
 implement hybrid cache that use in-memory index and milvus based doc store

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 candle-binding/Cargo.lock                     |  52 +
 candle-binding/Cargo.toml                     |   6 +-
 config/config.development.yaml                |   3 +
 config/config.hybrid.yaml                     |  58 ++
 config/config.yaml                            |  11 +-
 src/semantic-router/go.mod                    |   2 +-
 src/semantic-router/go.sum                    |   2 +
 .../pkg/cache/cache_factory.go                |  21 +-
 .../pkg/cache/cache_interface.go              |  15 +
 .../pkg/cache/comprehensive_benchmark_test.go | 324 +++++++
 src/semantic-router/pkg/cache/hybrid_cache.go | 898 ++++++++++++++++++
 .../pkg/cache/hybrid_cache_test.go            | 447 +++++++++
 .../cache/hybrid_vs_milvus_benchmark_test.go  | 869 +++++++++++++++++
 .../pkg/cache/inmemory_cache.go               | 558 ++++++++++-
 .../cache/inmemory_cache_integration_test.go  | 387 ++++++++
 .../pkg/cache/large_scale_benchmark_test.go   | 511 ++++++++++
 src/semantic-router/pkg/cache/milvus_cache.go | 218 +++++
 .../pkg/cache/simd_benchmark_test.go          | 141 +++
 .../pkg/cache/simd_distance_amd64.go          |  60 ++
 .../pkg/cache/simd_distance_amd64.s           | 114 +++
 .../pkg/cache/simd_distance_generic.go        |  22 +
 tools/make/milvus.mk                          | 109 +++
 .../tutorials/semantic-cache/hybrid-cache.md  | 416 ++++++++
 .../semantic-cache/in-memory-cache.md         |  73 ++
 24 files changed, 5279 insertions(+), 38 deletions(-)
 create mode 100644 config/config.hybrid.yaml
 create mode 100644 src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
 create mode 100644 src/semantic-router/pkg/cache/hybrid_cache.go
 create mode 100644 src/semantic-router/pkg/cache/hybrid_cache_test.go
 create mode 100644 src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
 create mode 100644 src/semantic-router/pkg/cache/large_scale_benchmark_test.go
 create mode 100644 src/semantic-router/pkg/cache/simd_benchmark_test.go
 create mode 100644 src/semantic-router/pkg/cache/simd_distance_amd64.go
 create mode 100644 src/semantic-router/pkg/cache/simd_distance_amd64.s
 create mode 100644 src/semantic-router/pkg/cache/simd_distance_generic.go
 create mode 100644 website/docs/tutorials/semantic-cache/hybrid-cache.md

diff --git a/candle-binding/Cargo.lock b/candle-binding/Cargo.lock
index 28d8b6cd..0636ef1c 100644
--- a/candle-binding/Cargo.lock
+++ b/candle-binding/Cargo.lock
@@ -97,6 +97,17 @@ version = "0.22.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
 
+[[package]]
+name = "bindgen_cuda"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f8489af5b7d17a81bffe37e0f4d6e1e4de87c87329d05447f22c35d95a1227d"
+dependencies = [
+ "glob",
+ "num_cpus",
+ "rayon",
+]
+
 [[package]]
 name = "bit-set"
 version = "0.5.3"
@@ -169,6 +180,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06ccf5ee3532e66868516d9b315f73aec9f34ea1a37ae98514534d458915dbf1"
 dependencies = [
  "byteorder",
+ "candle-kernels",
+ "cudarc",
  "gemm 0.17.1",
  "half",
  "memmap2",
@@ -180,10 +193,20 @@ dependencies = [
  "safetensors",
  "thiserror 1.0.69",
  "ug",
+ "ug-cuda",
  "yoke 0.7.5",
  "zip",
 ]
 
+[[package]]
+name = "candle-kernels"
+version = "0.8.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a10885bd902fad1b8518ba2b22369aaed88a3d94e123533ad3ca73db33b1c8ca"
+dependencies = [
+ "bindgen_cuda",
+]
+
 [[package]]
 name = "candle-nn"
 version = "0.8.4"
@@ -346,6 +369,16 @@ version = "0.2.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5"
 
+[[package]]
+name = "cudarc"
+version = "0.13.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "486c221362668c63a1636cfa51463b09574433b39029326cff40864b3ba12b6e"
+dependencies = [
+ "half",
+ "libloading",
+]
+
 [[package]]
 name = "darling"
 version = "0.20.11"
@@ -966,6 +999,12 @@ version = "0.32.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7"
 
+[[package]]
+name = "glob"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280"
+
 [[package]]
 name = "h2"
 version = "0.4.12"
@@ -2695,6 +2734,19 @@ dependencies = [
  "yoke 0.7.5",
 ]
 
+[[package]]
+name = "ug-cuda"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50758486d7941f8b0a636ba7e29455c07071f41590beac1fd307ec893e8db69a"
+dependencies = [
+ "cudarc",
+ "half",
+ "serde",
+ "thiserror 1.0.69",
+ "ug",
+]
+
 [[package]]
 name = "unicode-ident"
 version = "1.0.19"
diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
index 9b9364f4..f4746d33 100644
--- a/candle-binding/Cargo.toml
+++ b/candle-binding/Cargo.toml
@@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = "0.8.4"
-candle-nn = "0.8.4"
-candle-transformers = "0.8.4"
+candle-core = { version = "0.8.4", features = ["cuda"] }
+candle-nn = { version = "0.8.4", features = ["cuda"] }
+candle-transformers = { version = "0.8.4", features = ["cuda"] }
 tokenizers = { version = "0.21.0", features = ["http"] }
 hf-hub = "0.4.1"
 safetensors = "0.4.1"
diff --git a/config/config.development.yaml b/config/config.development.yaml
index 9c03ecdc..49f1372a 100644
--- a/config/config.development.yaml
+++ b/config/config.development.yaml
@@ -14,6 +14,9 @@ semantic_cache:
   max_entries: 100
   ttl_seconds: 600
   eviction_policy: "fifo"
+  use_hnsw: true # Enable HNSW for faster search
+  hnsw_m: 16
+  hnsw_ef_construction: 200
 
 tools:
   enabled: false
diff --git a/config/config.hybrid.yaml b/config/config.hybrid.yaml
new file mode 100644
index 00000000..5e7c288b
--- /dev/null
+++ b/config/config.hybrid.yaml
@@ -0,0 +1,58 @@
+bert_model:
+  model_id: models/all-MiniLM-L12-v2
+  threshold: 0.6
+  use_cpu: true
+
+semantic_cache:
+  enabled: true
+  backend_type: "hybrid" # Hybrid HNSW + Milvus backend
+  similarity_threshold: 0.85
+  ttl_seconds: 3600
+  
+  # Hybrid cache specific settings
+  max_memory_entries: 100000 # Max entries in HNSW index (100K)
+  
+  # HNSW parameters
+  hnsw_m: 16 # Number of bi-directional links
+  hnsw_ef_construction: 200 # Construction quality parameter
+  
+  # Milvus configuration file path
+  backend_config_path: "config/milvus.yaml"
+
+tools:
+  enabled: true
+  top_k: 3
+  similarity_threshold: 0.2
+  tools_db_path: "config/tools_db.json"
+  fallback_to_empty: true
+
+prompt_guard:
+  enabled: true
+  use_modernbert: true
+  model_id: "models/jailbreak_classifier_modernbert-base_model"
+  threshold: 0.7
+  use_cpu: true
+  jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
+
+# vLLM Endpoints Configuration
+vllm_endpoints:
+  - name: "endpoint1"
+    address: "172.28.0.20"
+    port: 8002
+    weight: 1
+
+model_config:
+  "qwen3":
+    reasoning_family: "qwen3"
+    preferred_endpoints: ["endpoint1"]
+    pii_policy:
+      allow_by_default: true
+
+# Classifier configuration
+classifier:
+  enabled: true
+  model_path: "models/qwen3-router_model/router_qwen_generative_model.safetensors"
+  tokenizer_path: "models/qwen3-router_model"
+  use_cpu: true
+  threshold: 0.7
+
diff --git a/config/config.yaml b/config/config.yaml
index 06c1b60f..e6c4d724 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -5,11 +5,20 @@ bert_model:
 
 semantic_cache:
   enabled: true
-  backend_type: "memory" # Options: "memory" or "milvus"
+  backend_type: "memory" # Options: "memory", "milvus", or "hybrid"
   similarity_threshold: 0.8
   max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
   eviction_policy: "fifo"
+  # HNSW index configuration (for memory backend only)
+  use_hnsw: true # Enable HNSW index for faster similarity search
+  hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
+  hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)
+  
+  # Hybrid cache configuration (when backend_type: "hybrid")
+  # Combines in-memory HNSW for fast search with Milvus for scalable storage
+  # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
+  # backend_config_path: "config/milvus.yaml" # Path to Milvus config
 
 tools:
   enabled: true
diff --git a/src/semantic-router/go.mod b/src/semantic-router/go.mod
index d5f8a9c0..2c7dc291 100644
--- a/src/semantic-router/go.mod
+++ b/src/semantic-router/go.mod
@@ -93,7 +93,7 @@ require (
 	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	golang.org/x/net v0.43.0 // indirect
 	golang.org/x/sync v0.16.0 // indirect
-	golang.org/x/sys v0.35.0 // indirect
+	golang.org/x/sys v0.37.0 // indirect
 	golang.org/x/text v0.28.0 // indirect
 	golang.org/x/tools v0.35.0 // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect
diff --git a/src/semantic-router/go.sum b/src/semantic-router/go.sum
index d062bf92..d1f42cc1 100644
--- a/src/semantic-router/go.sum
+++ b/src/semantic-router/go.sum
@@ -428,6 +428,8 @@ golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
 golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
+golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
+golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
diff --git a/src/semantic-router/pkg/cache/cache_factory.go b/src/semantic-router/pkg/cache/cache_factory.go
index f3343c5a..5158a5f8 100644
--- a/src/semantic-router/pkg/cache/cache_factory.go
+++ b/src/semantic-router/pkg/cache/cache_factory.go
@@ -24,14 +24,17 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
 	switch config.BackendType {
 	case InMemoryCacheType, "":
 		// Use in-memory cache as the default backend
-		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f",
-			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold)
+		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t",
+			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW)
 		options := InMemoryCacheOptions{
 			Enabled:             config.Enabled,
 			SimilarityThreshold: config.SimilarityThreshold,
 			MaxEntries:          config.MaxEntries,
 			TTLSeconds:          config.TTLSeconds,
 			EvictionPolicy:      config.EvictionPolicy,
+			UseHNSW:             config.UseHNSW,
+			HNSWM:               config.HNSWM,
+			HNSWEfConstruction:  config.HNSWEfConstruction,
 		}
 		return NewInMemoryCache(options), nil
 
@@ -46,6 +49,20 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
 		}
 		return NewMilvusCache(options)
 
+	case HybridCacheType:
+		observability.Debugf("Creating Hybrid cache backend - MaxMemory: %d, TTL: %ds, Threshold: %.3f",
+			config.MaxMemoryEntries, config.TTLSeconds, config.SimilarityThreshold)
+		options := HybridCacheOptions{
+			Enabled:             config.Enabled,
+			SimilarityThreshold: config.SimilarityThreshold,
+			TTLSeconds:          config.TTLSeconds,
+			MaxMemoryEntries:    config.MaxMemoryEntries,
+			HNSWM:               config.HNSWM,
+			HNSWEfConstruction:  config.HNSWEfConstruction,
+			MilvusConfigPath:    config.BackendConfigPath,
+		}
+		return NewHybridCache(options)
+
 	default:
 		observability.Debugf("Unsupported cache backend type: %s", config.BackendType)
 		return nil, fmt.Errorf("unsupported cache backend type: %s", config.BackendType)
diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go
index fcdf0073..f74a92a0 100644
--- a/src/semantic-router/pkg/cache/cache_interface.go
+++ b/src/semantic-router/pkg/cache/cache_interface.go
@@ -63,6 +63,9 @@ const (
 
 	// MilvusCacheType specifies the Milvus vector database backend
 	MilvusCacheType CacheBackendType = "milvus"
+
+	// HybridCacheType specifies the hybrid HNSW + Milvus backend
+	HybridCacheType CacheBackendType = "hybrid"
 )
 
 // EvictionPolicyType defines the available eviction policies
@@ -101,4 +104,16 @@ type CacheConfig struct {
 
 	// BackendConfigPath points to backend-specific configuration files
 	BackendConfigPath string `yaml:"backend_config_path,omitempty"`
+
+	// UseHNSW enables HNSW index for faster search in memory backend
+	UseHNSW bool `yaml:"use_hnsw,omitempty"`
+
+	// HNSWM is the number of bi-directional links per node (default: 16)
+	HNSWM int `yaml:"hnsw_m,omitempty"`
+
+	// HNSWEfConstruction is the size of dynamic candidate list during construction (default: 200)
+	HNSWEfConstruction int `yaml:"hnsw_ef_construction,omitempty"`
+
+	// Hybrid cache specific settings
+	MaxMemoryEntries int `yaml:"max_memory_entries,omitempty"` // Max entries in HNSW for hybrid cache
 }
diff --git a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
new file mode 100644
index 00000000..a2a82fc9
--- /dev/null
+++ b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
@@ -0,0 +1,324 @@
+package cache
+
+import (
+	"fmt"
+	"os"
+	"testing"
+
+	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
+)
+
+// ContentLength defines different query content sizes
+type ContentLength int
+
+const (
+	ShortContent  ContentLength = 20  // ~20 words
+	MediumContent ContentLength = 50  // ~50 words
+	LongContent   ContentLength = 100 // ~100 words
+)
+
+func (c ContentLength) String() string {
+	switch c {
+	case ShortContent:
+		return "short"
+	case MediumContent:
+		return "medium"
+	case LongContent:
+		return "long"
+	default:
+		return "unknown"
+	}
+}
+
+// GenerateQuery generates a query with maximum semantic diversity using hash-based randomization
+func generateQuery(length ContentLength, index int) string {
+	// Hash the index to get pseudo-random values (deterministic but well-distributed)
+	hash := uint64(index)
+	hash = hash*2654435761 + 1013904223 // Knuth's multiplicative hash
+
+	// Expanded templates for maximum diversity
+	templates := []string{
+		// Technical how-to questions
+		"How to implement %s using %s and %s for %s applications in production environments",
+		"What are the best practices for %s when building %s systems with %s constraints",
+		"Can you explain the architecture of %s systems that integrate %s and %s components",
+		"How do I configure %s to work with %s while ensuring %s compatibility",
+		"What is the recommended approach for %s development using %s and %s technologies",
+
+		// Comparison questions
+		"Explain the difference between %s and %s in the context of %s development",
+		"Compare and contrast %s approaches versus %s methods for %s use cases",
+		"What is the performance impact of %s versus %s for %s workloads",
+		"Which is better for %s: %s or %s, considering %s requirements",
+		"When should I use %s instead of %s for %s scenarios",
+
+		// Debugging/troubleshooting
+		"Can you help me debug %s issues related to %s when using %s framework",
+		"Why is my %s failing when I integrate %s with %s system",
+		"How to troubleshoot %s errors in %s when deploying to %s environment",
+		"What causes %s problems in %s architecture with %s configuration",
+
+		// Optimization questions
+		"How do I optimize %s for %s while maintaining %s requirements",
+		"What are the performance bottlenecks in %s when using %s with %s",
+		"How can I improve %s throughput in %s systems running %s",
+		"What are common pitfalls when optimizing %s with %s in %s environments",
+
+		// Design/architecture questions
+		"How should I design %s to handle %s and support %s functionality",
+		"What are the scalability considerations for %s when implementing %s with %s",
+		"How to architect %s systems that require %s and %s capabilities",
+		"What design patterns work best for %s in %s architectures with %s",
+	}
+
+	// Massively expanded topics for semantic diversity
+	topics := []string{
+		// ML/AI
+		"machine learning", "deep learning", "neural networks", "reinforcement learning",
+		"computer vision", "NLP", "transformers", "embeddings", "fine-tuning",
+
+		// Infrastructure
+		"microservices", "distributed systems", "message queues", "event streaming",
+		"container orchestration", "service mesh", "API gateway", "load balancing",
+		"database sharding", "data replication", "consensus algorithms", "circuit breakers",
+
+		// Data
+		"data pipelines", "ETL", "data warehousing", "real-time analytics",
+		"stream processing", "batch processing", "data lakes", "data modeling",
+
+		// Security
+		"authentication", "authorization", "encryption", "TLS", "OAuth",
+		"API security", "zero trust", "secrets management", "key rotation",
+
+		// Observability
+		"monitoring", "logging", "tracing", "metrics", "alerting",
+		"observability", "profiling", "debugging", "APM",
+
+		// Performance
+		"caching strategies", "rate limiting", "connection pooling", "query optimization",
+		"memory management", "garbage collection", "CPU profiling", "I/O optimization",
+
+		// Reliability
+		"high availability", "fault tolerance", "disaster recovery", "backups",
+		"failover", "redundancy", "chaos engineering", "SLA management",
+
+		// Cloud/DevOps
+		"CI/CD", "GitOps", "infrastructure as code", "configuration management",
+		"auto-scaling", "serverless", "edge computing", "multi-cloud",
+
+		// Databases
+		"SQL databases", "NoSQL", "graph databases", "time series databases",
+		"vector databases", "in-memory databases", "database indexing", "query planning",
+	}
+
+	// Additional random modifiers for even more diversity
+	modifiers := []string{
+		"large-scale", "enterprise", "cloud-native", "production-grade",
+		"real-time", "distributed", "fault-tolerant", "high-performance",
+		"mission-critical", "scalable", "secure", "compliant",
+	}
+
+	// Use hash to pseudo-randomly select (but deterministic for same index)
+	templateIdx := int(hash % uint64(len(templates)))
+	hash = hash * 16807 % 2147483647 // LCG for next random
+
+	topic1Idx := int(hash % uint64(len(topics)))
+	hash = hash * 16807 % 2147483647
+
+	topic2Idx := int(hash % uint64(len(topics)))
+	hash = hash * 16807 % 2147483647
+
+	topic3Idx := int(hash % uint64(len(topics)))
+	hash = hash * 16807 % 2147483647
+
+	// Build query with selected template and topics
+	query := fmt.Sprintf(templates[templateIdx],
+		topics[topic1Idx],
+		topics[topic2Idx],
+		topics[topic3Idx],
+		modifiers[int(hash%uint64(len(modifiers)))])
+
+	// Add unique identifier to guarantee uniqueness
+	query += fmt.Sprintf(" [Request ID: REQ-%d]", index)
+
+	// Add extra context for longer queries
+	if length > MediumContent {
+		hash = hash * 16807 % 2147483647
+		extraTopicIdx := int(hash % uint64(len(topics)))
+		query += fmt.Sprintf(" Also considering %s integration and %s compatibility requirements.",
+			topics[extraTopicIdx],
+			modifiers[int(hash%uint64(len(modifiers)))])
+	}
+
+	return query
+}
+
+// BenchmarkComprehensive runs comprehensive benchmarks across multiple dimensions
+func BenchmarkComprehensive(b *testing.B) {
+	// Initialize BERT model
+	useCPU := os.Getenv("USE_CPU") != "false" // Default to CPU
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Determine hardware type
+	hardware := "cpu"
+	if !useCPU {
+		hardware = "gpu"
+	}
+
+	// Test configurations
+	cacheSizes := []int{100, 500, 1000, 5000}
+	contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}
+	hnswConfigs := []struct {
+		name string
+		m    int
+		ef   int
+	}{
+		{"default", 16, 200},
+		{"fast", 8, 100},
+		{"accurate", 32, 400},
+	}
+
+	// Open CSV file for results
+	csvFile, err := os.OpenFile("../../benchmark_results/benchmark_data.csv",
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+	if err != nil {
+		b.Logf("Warning: Could not open CSV file: %v", err)
+	} else {
+		defer csvFile.Close()
+	}
+
+	// Run benchmarks
+	for _, cacheSize := range cacheSizes {
+		for _, contentLen := range contentLengths {
+			// Generate test data
+			testQueries := make([]string, cacheSize)
+			for i := 0; i < cacheSize; i++ {
+				testQueries[i] = generateQuery(contentLen, i)
+			}
+
+			// Benchmark Linear Search
+			b.Run(fmt.Sprintf("%s/Linear/%s/%dEntries", hardware, contentLen.String(), cacheSize), func(b *testing.B) {
+				cache := NewInMemoryCache(InMemoryCacheOptions{
+					Enabled:             true,
+					MaxEntries:          cacheSize * 2,
+					SimilarityThreshold: 0.85,
+					TTLSeconds:          0,
+					UseHNSW:             false,
+				})
+
+				// Populate cache
+				for i, query := range testQueries {
+					reqID := fmt.Sprintf("req%d", i)
+					_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+				}
+
+				searchQuery := generateQuery(contentLen, cacheSize/2)
+				b.ResetTimer()
+
+				for i := 0; i < b.N; i++ {
+					_, _, _ = cache.FindSimilar("test-model", searchQuery)
+				}
+
+				b.StopTimer()
+
+				// Write to CSV
+				if csvFile != nil {
+					nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)
+
+					line := fmt.Sprintf("%s,%s,%d,linear,0,0,%.0f,0,0,%d,1.0\n",
+						hardware, contentLen.String(), cacheSize, nsPerOp, b.N)
+					if _, err := csvFile.WriteString(line); err != nil {
+						b.Logf("Warning: failed to write to CSV: %v", err)
+					}
+				}
+			})
+
+			// Benchmark HNSW with different configurations
+			for _, hnswCfg := range hnswConfigs {
+				b.Run(fmt.Sprintf("%s/HNSW_%s/%s/%dEntries", hardware, hnswCfg.name, contentLen.String(), cacheSize), func(b *testing.B) {
+					cache := NewInMemoryCache(InMemoryCacheOptions{
+						Enabled:             true,
+						MaxEntries:          cacheSize * 2,
+						SimilarityThreshold: 0.85,
+						TTLSeconds:          0,
+						UseHNSW:             true,
+						HNSWM:               hnswCfg.m,
+						HNSWEfConstruction:  hnswCfg.ef,
+					})
+
+					// Populate cache
+					for i, query := range testQueries {
+						reqID := fmt.Sprintf("req%d", i)
+						_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+					}
+
+					searchQuery := generateQuery(contentLen, cacheSize/2)
+					b.ResetTimer()
+
+					for i := 0; i < b.N; i++ {
+						_, _, _ = cache.FindSimilar("test-model", searchQuery)
+					}
+
+					b.StopTimer()
+
+					// Write to CSV
+					if csvFile != nil {
+						nsPerOp := float64(b.Elapsed().Nanoseconds()) / float64(b.N)
+
+						line := fmt.Sprintf("%s,%s,%d,hnsw_%s,%d,%d,%.0f,0,0,%d,0.0\n",
+							hardware, contentLen.String(), cacheSize, hnswCfg.name,
+							hnswCfg.m, hnswCfg.ef, nsPerOp, b.N)
+						if _, err := csvFile.WriteString(line); err != nil {
+							b.Logf("Warning: failed to write to CSV: %v", err)
+						}
+					}
+				})
+			}
+		}
+	}
+}
+
+// BenchmarkIndexConstruction benchmarks HNSW index build time
+func BenchmarkIndexConstruction(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	cacheSizes := []int{100, 500, 1000, 5000}
+	contentLengths := []ContentLength{ShortContent, MediumContent, LongContent}
+
+	for _, cacheSize := range cacheSizes {
+		for _, contentLen := range contentLengths {
+			testQueries := make([]string, cacheSize)
+			for i := 0; i < cacheSize; i++ {
+				testQueries[i] = generateQuery(contentLen, i)
+			}
+
+			b.Run(fmt.Sprintf("BuildIndex/%s/%dEntries", contentLen.String(), cacheSize), func(b *testing.B) {
+				b.ResetTimer()
+				for i := 0; i < b.N; i++ {
+					b.StopTimer()
+					cache := NewInMemoryCache(InMemoryCacheOptions{
+						Enabled:             true,
+						MaxEntries:          cacheSize * 2,
+						SimilarityThreshold: 0.85,
+						TTLSeconds:          0,
+						UseHNSW:             true,
+						HNSWM:               16,
+						HNSWEfConstruction:  200,
+					})
+					b.StartTimer()
+
+					// Build index by adding entries
+					for j, query := range testQueries {
+						reqID := fmt.Sprintf("req%d", j)
+						_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+					}
+				}
+			})
+		}
+	}
+}
diff --git a/src/semantic-router/pkg/cache/hybrid_cache.go b/src/semantic-router/pkg/cache/hybrid_cache.go
new file mode 100644
index 00000000..acc78fca
--- /dev/null
+++ b/src/semantic-router/pkg/cache/hybrid_cache.go
@@ -0,0 +1,898 @@
+//go:build !windows && cgo
+// +build !windows,cgo
+
+package cache
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/metrics"
+	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
+)
+
+// searchBuffers holds reusable buffers for HNSW search to reduce GC pressure
+type searchBuffers struct {
+	visited    map[int]bool
+	candidates *minHeap
+	results    *maxHeap
+}
+
+// Global pool for search buffers (reduces allocations)
+var searchBufferPool = sync.Pool{
+	New: func() interface{} {
+		return &searchBuffers{
+			visited:    make(map[int]bool, 100),
+			candidates: newMinHeap(),
+			results:    newMaxHeap(),
+		}
+	},
+}
+
+// getSearchBuffers gets reusable buffers from pool
+func getSearchBuffers() *searchBuffers {
+	buf := searchBufferPool.Get().(*searchBuffers)
+	// Clear maps and heaps for reuse
+	for k := range buf.visited {
+		delete(buf.visited, k)
+	}
+	buf.candidates.data = buf.candidates.data[:0]
+	buf.results.data = buf.results.data[:0]
+	return buf
+}
+
+// putSearchBuffers returns buffers to pool
+func putSearchBuffers(buf *searchBuffers) {
+	// Don't return to pool if buffers grew too large (avoid memory bloat)
+	if len(buf.visited) > 1000 || cap(buf.candidates.data) > 200 || cap(buf.results.data) > 200 {
+		return
+	}
+	searchBufferPool.Put(buf)
+}
+
+// HybridCache combines in-memory HNSW index with external Milvus storage
+// Architecture:
+//   - In-memory: HNSW index with ALL embeddings (for fast O(log n) search)
+//   - Milvus: ALL documents (fetched by ID after search)
+//
+// This provides fast search while supporting millions of entries without storing docs in memory
+type HybridCache struct {
+	// In-memory components (search only)
+	hnswIndex  *HNSWIndex
+	embeddings [][]float32
+	idMap      map[int]string // Entry index → Milvus ID
+
+	// External storage (all documents)
+	milvusCache *MilvusCache
+
+	// Configuration
+	similarityThreshold float32
+	maxMemoryEntries    int // Max entries in HNSW index
+	ttlSeconds          int
+	enabled             bool
+
+	// Statistics
+	hitCount   int64
+	missCount  int64
+	evictCount int64
+
+	// Concurrency control
+	mu sync.RWMutex
+}
+
+// HybridCacheOptions contains configuration for the hybrid cache
+type HybridCacheOptions struct {
+	// Core settings
+	Enabled             bool
+	SimilarityThreshold float32
+	TTLSeconds          int
+
+	// HNSW settings
+	MaxMemoryEntries   int // Max entries in HNSW (default: 100,000)
+	HNSWM              int // HNSW M parameter
+	HNSWEfConstruction int // HNSW efConstruction parameter
+
+	// Milvus settings
+	MilvusConfigPath string
+}
+
+// NewHybridCache creates a new hybrid cache instance
+func NewHybridCache(options HybridCacheOptions) (*HybridCache, error) {
+	observability.Infof("Initializing hybrid cache: enabled=%t, maxMemoryEntries=%d, threshold=%.3f",
+		options.Enabled, options.MaxMemoryEntries, options.SimilarityThreshold)
+
+	if !options.Enabled {
+		observability.Debugf("Hybrid cache disabled, returning inactive instance")
+		return &HybridCache{
+			enabled: false,
+		}, nil
+	}
+
+	// Initialize Milvus backend
+	milvusOptions := MilvusCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: options.SimilarityThreshold,
+		TTLSeconds:          options.TTLSeconds,
+		ConfigPath:          options.MilvusConfigPath,
+	}
+
+	milvusCache, err := NewMilvusCache(milvusOptions)
+	if err != nil {
+		return nil, fmt.Errorf("failed to initialize Milvus backend: %w", err)
+	}
+
+	// Set defaults
+	if options.MaxMemoryEntries <= 0 {
+		options.MaxMemoryEntries = 100000 // Default: 100K entries in memory
+	}
+	if options.HNSWM <= 0 {
+		options.HNSWM = 16
+	}
+	if options.HNSWEfConstruction <= 0 {
+		options.HNSWEfConstruction = 200
+	}
+
+	// Initialize HNSW index
+	hnswIndex := newHNSWIndex(options.HNSWM, options.HNSWEfConstruction)
+
+	cache := &HybridCache{
+		hnswIndex:           hnswIndex,
+		embeddings:          make([][]float32, 0, options.MaxMemoryEntries),
+		idMap:               make(map[int]string),
+		milvusCache:         milvusCache,
+		similarityThreshold: options.SimilarityThreshold,
+		maxMemoryEntries:    options.MaxMemoryEntries,
+		ttlSeconds:          options.TTLSeconds,
+		enabled:             true,
+	}
+
+	observability.Infof("Hybrid cache initialized: HNSW(M=%d, ef=%d), maxMemory=%d",
+		options.HNSWM, options.HNSWEfConstruction, options.MaxMemoryEntries)
+
+	return cache, nil
+}
+
+// IsEnabled returns whether the cache is active
+func (h *HybridCache) IsEnabled() bool {
+	return h.enabled
+}
+
+// AddPendingRequest stores a request awaiting its response
+func (h *HybridCache) AddPendingRequest(requestID string, model string, query string, requestBody []byte) error {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil
+	}
+
+	// Generate embedding
+	embedding, err := candle_binding.GetEmbedding(query, 0)
+	if err != nil {
+		metrics.RecordCacheOperation("hybrid", "add_pending", "error", time.Since(start).Seconds())
+		return fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	// Store in Milvus (write-through)
+	if err := h.milvusCache.AddPendingRequest(requestID, model, query, requestBody); err != nil {
+		metrics.RecordCacheOperation("hybrid", "add_pending", "error", time.Since(start).Seconds())
+		return fmt.Errorf("milvus add pending failed: %w", err)
+	}
+
+	// Add to in-memory HNSW index
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	// Check if we need to evict
+	if len(h.embeddings) >= h.maxMemoryEntries {
+		h.evictOneUnsafe()
+	}
+
+	// Add to HNSW
+	entryIndex := len(h.embeddings)
+	h.embeddings = append(h.embeddings, embedding)
+	h.idMap[entryIndex] = requestID
+	h.addNodeHybrid(entryIndex, embedding)
+
+	observability.Debugf("HybridCache.AddPendingRequest: added to HNSW index=%d, milvusID=%s",
+		entryIndex, requestID)
+
+	metrics.RecordCacheOperation("hybrid", "add_pending", "success", time.Since(start).Seconds())
+	metrics.UpdateCacheEntries("hybrid", len(h.embeddings))
+
+	return nil
+}
+
+// UpdateWithResponse completes a pending request with its response
+func (h *HybridCache) UpdateWithResponse(requestID string, responseBody []byte) error {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil
+	}
+
+	// Update in Milvus
+	if err := h.milvusCache.UpdateWithResponse(requestID, responseBody); err != nil {
+		metrics.RecordCacheOperation("hybrid", "update_response", "error", time.Since(start).Seconds())
+		return fmt.Errorf("milvus update failed: %w", err)
+	}
+
+	// HNSW index already has the embedding, no update needed there
+
+	observability.Debugf("HybridCache.UpdateWithResponse: updated milvusID=%s", requestID)
+	metrics.RecordCacheOperation("hybrid", "update_response", "success", time.Since(start).Seconds())
+
+	return nil
+}
+
+// AddEntry stores a complete request-response pair
+func (h *HybridCache) AddEntry(requestID string, model string, query string, requestBody, responseBody []byte) error {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil
+	}
+
+	// Generate embedding
+	embedding, err := candle_binding.GetEmbedding(query, 0)
+	if err != nil {
+		metrics.RecordCacheOperation("hybrid", "add_entry", "error", time.Since(start).Seconds())
+		return fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	// Store in Milvus (write-through)
+	if err := h.milvusCache.AddEntry(requestID, model, query, requestBody, responseBody); err != nil {
+		metrics.RecordCacheOperation("hybrid", "add_entry", "error", time.Since(start).Seconds())
+		return fmt.Errorf("milvus add entry failed: %w", err)
+	}
+
+	// Add to in-memory HNSW index
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	// Check if we need to evict
+	if len(h.embeddings) >= h.maxMemoryEntries {
+		h.evictOneUnsafe()
+	}
+
+	// Add to HNSW
+	entryIndex := len(h.embeddings)
+	h.embeddings = append(h.embeddings, embedding)
+	h.idMap[entryIndex] = requestID
+	h.addNodeHybrid(entryIndex, embedding)
+
+	observability.Debugf("HybridCache.AddEntry: added to HNSW index=%d, milvusID=%s",
+		entryIndex, requestID)
+	observability.LogEvent("hybrid_cache_entry_added", map[string]interface{}{
+		"backend": "hybrid",
+		"query":   query,
+		"model":   model,
+		"in_hnsw": true,
+	})
+
+	metrics.RecordCacheOperation("hybrid", "add_entry", "success", time.Since(start).Seconds())
+	metrics.UpdateCacheEntries("hybrid", len(h.embeddings))
+
+	return nil
+}
+
+// AddEntriesBatch stores multiple request-response pairs efficiently
+func (h *HybridCache) AddEntriesBatch(entries []CacheEntry) error {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil
+	}
+
+	if len(entries) == 0 {
+		return nil
+	}
+
+	observability.Debugf("HybridCache.AddEntriesBatch: adding %d entries in batch", len(entries))
+
+	// Generate all embeddings first
+	embeddings := make([][]float32, len(entries))
+	for i, entry := range entries {
+		embedding, err := candle_binding.GetEmbedding(entry.Query, 0)
+		if err != nil {
+			metrics.RecordCacheOperation("hybrid", "add_entries_batch", "error", time.Since(start).Seconds())
+			return fmt.Errorf("failed to generate embedding for entry %d: %w", i, err)
+		}
+		embeddings[i] = embedding
+	}
+
+	// Store all in Milvus at once (write-through)
+	if err := h.milvusCache.AddEntriesBatch(entries); err != nil {
+		metrics.RecordCacheOperation("hybrid", "add_entries_batch", "error", time.Since(start).Seconds())
+		return fmt.Errorf("milvus batch add failed: %w", err)
+	}
+
+	// Add all to in-memory HNSW index
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	for i, entry := range entries {
+		// Check if we need to evict
+		if len(h.embeddings) >= h.maxMemoryEntries {
+			h.evictOneUnsafe()
+		}
+
+		// Add to HNSW
+		entryIndex := len(h.embeddings)
+		h.embeddings = append(h.embeddings, embeddings[i])
+		h.idMap[entryIndex] = entry.RequestID
+		h.addNodeHybrid(entryIndex, embeddings[i])
+	}
+
+	elapsed := time.Since(start)
+	observability.Debugf("HybridCache.AddEntriesBatch: added %d entries in %v (%.0f entries/sec)",
+		len(entries), elapsed, float64(len(entries))/elapsed.Seconds())
+	observability.LogEvent("hybrid_cache_entries_added", map[string]interface{}{
+		"backend": "hybrid",
+		"count":   len(entries),
+		"in_hnsw": true,
+	})
+
+	metrics.RecordCacheOperation("hybrid", "add_entries_batch", "success", elapsed.Seconds())
+	metrics.UpdateCacheEntries("hybrid", len(h.embeddings))
+
+	return nil
+}
+
+// Flush forces Milvus to persist all buffered data to disk
+func (h *HybridCache) Flush() error {
+	if !h.enabled {
+		return nil
+	}
+
+	return h.milvusCache.Flush()
+}
+
+// FindSimilar searches for semantically similar cached requests
+func (h *HybridCache) FindSimilar(model string, query string) ([]byte, bool, error) {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil, false, nil
+	}
+
+	queryPreview := query
+	if len(query) > 50 {
+		queryPreview = query[:50] + "..."
+	}
+	observability.Debugf("HybridCache.FindSimilar: searching for model='%s', query='%s'",
+		model, queryPreview)
+
+	// Generate query embedding
+	queryEmbedding, err := candle_binding.GetEmbedding(query, 0)
+	if err != nil {
+		metrics.RecordCacheOperation("hybrid", "find_similar", "error", time.Since(start).Seconds())
+		return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	// Search HNSW index for candidates above similarity threshold
+	// For semantic cache, we only need the first match, so search with k=1
+	// and stop early when finding a match above threshold
+	h.mu.RLock()
+	candidates := h.searchKNNHybridWithThreshold(queryEmbedding, 1, 20, h.similarityThreshold)
+	threshold := h.similarityThreshold
+	h.mu.RUnlock()
+
+	// Filter by similarity threshold before fetching from Milvus
+	var qualifiedCandidates []searchResult
+	for _, candidate := range candidates {
+		if candidate.similarity >= threshold {
+			qualifiedCandidates = append(qualifiedCandidates, candidate)
+		}
+	}
+
+	// Map qualified candidates to Milvus IDs (need lock for idMap access)
+	type candidateWithID struct {
+		milvusID   string
+		similarity float32
+		index      int
+	}
+
+	h.mu.RLock()
+	candidatesWithIDs := make([]candidateWithID, 0, len(qualifiedCandidates))
+	for _, candidate := range qualifiedCandidates {
+		if milvusID, ok := h.idMap[candidate.index]; ok {
+			candidatesWithIDs = append(candidatesWithIDs, candidateWithID{
+				milvusID:   milvusID,
+				similarity: candidate.similarity,
+				index:      candidate.index,
+			})
+		}
+	}
+	h.mu.RUnlock()
+
+	if len(candidatesWithIDs) == 0 {
+		atomic.AddInt64(&h.missCount, 1)
+		if len(candidates) > 0 {
+			observability.Debugf("HybridCache.FindSimilar: %d candidates found but none above threshold %.3f",
+				len(candidates), h.similarityThreshold)
+		} else {
+			observability.Debugf("HybridCache.FindSimilar: no candidates found in HNSW")
+		}
+		metrics.RecordCacheOperation("hybrid", "find_similar", "miss", time.Since(start).Seconds())
+		metrics.RecordCacheMiss()
+		return nil, false, nil
+	}
+
+	observability.Debugf("HybridCache.FindSimilar: HNSW returned %d candidates, %d above threshold",
+		len(candidates), len(candidatesWithIDs))
+
+	// Fetch document from Milvus for qualified candidates
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Try candidates in order (already sorted by similarity from HNSW)
+	for _, candidate := range candidatesWithIDs {
+		// Fetch document from Milvus by ID (direct lookup by primary key)
+		fetchCtx, fetchCancel := context.WithTimeout(ctx, 2*time.Second)
+		responseBody, err := h.milvusCache.GetByID(fetchCtx, candidate.milvusID)
+		fetchCancel()
+
+		if err != nil {
+			observability.Debugf("HybridCache.FindSimilar: Milvus GetByID failed for %s: %v",
+				candidate.milvusID, err)
+			continue
+		}
+
+		if responseBody != nil {
+			atomic.AddInt64(&h.hitCount, 1)
+			observability.Debugf("HybridCache.FindSimilar: MILVUS HIT - similarity=%.4f (threshold=%.3f)",
+				candidate.similarity, h.similarityThreshold)
+			observability.LogEvent("hybrid_cache_hit", map[string]interface{}{
+				"backend":    "hybrid",
+				"source":     "milvus",
+				"similarity": candidate.similarity,
+				"threshold":  h.similarityThreshold,
+				"model":      model,
+				"latency_ms": time.Since(start).Milliseconds(),
+			})
+			metrics.RecordCacheOperation("hybrid", "find_similar", "hit_milvus", time.Since(start).Seconds())
+			metrics.RecordCacheHit()
+			return responseBody, true, nil
+		}
+	}
+
+	// No match found above threshold
+	atomic.AddInt64(&h.missCount, 1)
+	observability.Debugf("HybridCache.FindSimilar: CACHE MISS - no match above threshold")
+	observability.LogEvent("hybrid_cache_miss", map[string]interface{}{
+		"backend":    "hybrid",
+		"threshold":  h.similarityThreshold,
+		"model":      model,
+		"candidates": len(candidatesWithIDs),
+	})
+	metrics.RecordCacheOperation("hybrid", "find_similar", "miss", time.Since(start).Seconds())
+	metrics.RecordCacheMiss()
+
+	// Suppress context error to avoid noise
+	_ = ctx
+
+	return nil, false, nil
+}
+
+// Close releases all resources
+func (h *HybridCache) Close() error {
+	if !h.enabled {
+		return nil
+	}
+
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	// Close Milvus connection
+	if h.milvusCache != nil {
+		if err := h.milvusCache.Close(); err != nil {
+			observability.Debugf("HybridCache.Close: Milvus close error: %v", err)
+		}
+	}
+
+	// Clear in-memory structures
+	h.embeddings = nil
+	h.idMap = nil
+	h.hnswIndex = nil
+
+	metrics.UpdateCacheEntries("hybrid", 0)
+
+	return nil
+}
+
+// GetStats returns cache statistics
+func (h *HybridCache) GetStats() CacheStats {
+	h.mu.RLock()
+	defer h.mu.RUnlock()
+
+	hits := atomic.LoadInt64(&h.hitCount)
+	misses := atomic.LoadInt64(&h.missCount)
+	total := hits + misses
+
+	var hitRatio float64
+	if total > 0 {
+		hitRatio = float64(hits) / float64(total)
+	}
+
+	return CacheStats{
+		TotalEntries: len(h.embeddings),
+		HitCount:     hits,
+		MissCount:    misses,
+		HitRatio:     hitRatio,
+	}
+}
+
+// Helper methods
+
+// evictOneUnsafe removes one entry from HNSW index (must hold write lock)
+func (h *HybridCache) evictOneUnsafe() {
+	if len(h.embeddings) == 0 {
+		return
+	}
+
+	// Simple FIFO eviction: remove oldest entry
+	victimIdx := 0
+
+	// Could use LRU/LFU here by tracking access times/counts
+	// For now, just evict the first entry
+
+	// Get milvusID before removing from map (for logging)
+	milvusID := h.idMap[victimIdx]
+
+	// Remove from structures
+	delete(h.idMap, victimIdx)
+
+	// Note: We don't remove from Milvus (data persists there)
+	// We also don't rebuild HNSW (mark as stale)
+	h.hnswIndex.markStale()
+
+	atomic.AddInt64(&h.evictCount, 1)
+
+	observability.LogEvent("hybrid_cache_evicted", map[string]interface{}{
+		"backend":     "hybrid",
+		"milvus_id":   milvusID,
+		"hnsw_index":  victimIdx,
+		"max_entries": h.maxMemoryEntries,
+	})
+}
+
+// searchResult holds a candidate with its similarity score
+type searchResult struct {
+	index      int
+	similarity float32
+}
+
+// dotProduct calculates the dot product between two vectors
+// Uses SIMD instructions (AVX2/AVX-512) when available for performance
+// Falls back to scalar implementation on non-x86 platforms
+func dotProduct(a, b []float32) float32 {
+	return dotProductSIMD(a, b)
+}
+
+// hybridHNSWAdapter adapts the HNSW index to work with [][]float32 instead of []CacheEntry
+type hybridHNSWAdapter struct {
+	embeddings [][]float32
+}
+
+func (h *hybridHNSWAdapter) getEmbedding(idx int) []float32 {
+	if idx < 0 || idx >= len(h.embeddings) {
+		return nil
+	}
+	return h.embeddings[idx]
+}
+
+func (h *hybridHNSWAdapter) distance(idx1, idx2 int) float32 {
+	emb1 := h.getEmbedding(idx1)
+	emb2 := h.getEmbedding(idx2)
+	if emb1 == nil || emb2 == nil {
+		return 0
+	}
+	return dotProduct(emb1, emb2)
+}
+
+// addNodeHybrid adds a node to the HNSW index (hybrid version)
+func (h *HybridCache) addNodeHybrid(entryIndex int, embedding []float32) {
+	// Lock is already held by caller (mu.Lock())
+
+	level := h.selectLevelHybrid()
+	node := &HNSWNode{
+		entryIndex: entryIndex,
+		neighbors:  make(map[int][]int),
+		maxLayer:   level,
+	}
+
+	for i := 0; i <= level; i++ {
+		node.neighbors[i] = make([]int, 0)
+	}
+
+	h.hnswIndex.nodes = append(h.hnswIndex.nodes, node)
+	h.hnswIndex.nodeIndex[entryIndex] = node // Add to O(1) lookup map
+
+	if h.hnswIndex.entryPoint == -1 {
+		h.hnswIndex.entryPoint = entryIndex
+		h.hnswIndex.maxLayer = level
+		return
+	}
+
+	// Find nearest neighbors at each layer
+	adapter := &hybridHNSWAdapter{embeddings: h.embeddings}
+
+	// Start from top layer
+	currNearest := h.hnswIndex.entryPoint
+	for lc := h.hnswIndex.maxLayer; lc > level; lc-- {
+		// Search for nearest at this layer - Fast O(1) lookup
+		candidates := []int{currNearest}
+		if hn := h.hnswIndex.nodeIndex[currNearest]; hn != nil && hn.neighbors[lc] != nil {
+			for _, neighbor := range hn.neighbors[lc] {
+				if neighbor >= 0 && neighbor < len(h.embeddings) {
+					candidates = append(candidates, neighbor)
+				}
+			}
+		}
+
+		// Find closest
+		bestDist := adapter.distance(entryIndex, currNearest)
+		for _, candidate := range candidates {
+			dist := adapter.distance(entryIndex, candidate)
+			if dist > bestDist {
+				bestDist = dist
+				currNearest = candidate
+			}
+		}
+	}
+
+	// Insert at appropriate layers
+	for lc := level; lc >= 0; lc-- {
+		// Find neighbors at this layer
+		neighbors := h.searchLayerHybrid(embedding, h.hnswIndex.efConstruction, lc, []int{currNearest})
+
+		m := h.hnswIndex.M
+		if lc == 0 {
+			m = h.hnswIndex.Mmax0
+		}
+
+		selectedNeighbors := h.selectNeighborsHybrid(neighbors, m)
+
+		// Add bidirectional links
+		for _, neighborID := range selectedNeighbors {
+			node.neighbors[lc] = append(node.neighbors[lc], neighborID)
+
+			// Add reverse link - Fast O(1) lookup
+			if neighborNode := h.hnswIndex.nodeIndex[neighborID]; neighborNode != nil {
+				if neighborNode.neighbors[lc] == nil {
+					neighborNode.neighbors[lc] = make([]int, 0)
+				}
+				neighborNode.neighbors[lc] = append(neighborNode.neighbors[lc], entryIndex)
+			}
+		}
+	}
+
+	if level > h.hnswIndex.maxLayer {
+		h.hnswIndex.maxLayer = level
+		h.hnswIndex.entryPoint = entryIndex
+	}
+}
+
+// selectLevelHybrid randomly selects a level for a new node
+func (h *HybridCache) selectLevelHybrid() int {
+	// Use exponential decay to select level
+	// Most nodes at layer 0, fewer at higher layers
+	level := 0
+	for level < 16 { // Max 16 layers
+		if randFloat() > h.hnswIndex.ml {
+			break
+		}
+		level++
+	}
+	return level
+}
+
+// randFloat returns a random float between 0 and 1
+func randFloat() float64 {
+	// Simple random using time-based seed
+	return float64(time.Now().UnixNano()%1000) / 1000.0
+}
+
+// searchLayerHybrid searches for nearest neighbors at a specific layer
+func (h *HybridCache) searchLayerHybrid(query []float32, ef int, layer int, entryPoints []int) []int {
+	// Reuse buffers from pool to reduce allocations
+	buf := getSearchBuffers()
+	defer putSearchBuffers(buf)
+
+	visited := buf.visited
+	candidates := buf.candidates
+	results := buf.results
+
+	for _, ep := range entryPoints {
+		if ep < 0 || ep >= len(h.embeddings) {
+			continue
+		}
+		dist := -dotProduct(query, h.embeddings[ep])
+		candidates.push(ep, dist)
+		results.push(ep, dist)
+		visited[ep] = true
+	}
+
+	for len(candidates.data) > 0 {
+		currentIdx, currentDist := candidates.pop()
+		if len(results.data) > 0 && currentDist > -results.data[0].dist {
+			break
+		}
+
+		// Fast O(1) lookup using nodeIndex map
+		currentNode := h.hnswIndex.nodeIndex[currentIdx]
+		if currentNode == nil || currentNode.neighbors[layer] == nil {
+			continue
+		}
+
+		for _, neighborID := range currentNode.neighbors[layer] {
+			if visited[neighborID] || neighborID < 0 || neighborID >= len(h.embeddings) {
+				continue
+			}
+			visited[neighborID] = true
+
+			dist := -dotProduct(query, h.embeddings[neighborID])
+
+			if len(results.data) < ef || dist < -results.data[0].dist {
+				candidates.push(neighborID, dist)
+				results.push(neighborID, dist)
+
+				if len(results.data) > ef {
+					results.pop()
+				}
+			}
+		}
+	}
+
+	// Extract IDs from heap and reverse to get correct order
+	resultIDs := make([]int, 0, len(results.data))
+	for len(results.data) > 0 {
+		idx, _ := results.pop()
+		resultIDs = append(resultIDs, idx)
+	}
+
+	// Reverse in place to match similarity order
+	for i, j := 0, len(resultIDs)-1; i < j; i, j = i+1, j-1 {
+		resultIDs[i], resultIDs[j] = resultIDs[j], resultIDs[i]
+	}
+
+	return resultIDs
+}
+
+// selectNeighborsHybrid selects the best neighbors from candidates (hybrid version)
+func (h *HybridCache) selectNeighborsHybrid(candidates []int, m int) []int {
+	if len(candidates) <= m {
+		return candidates
+	}
+
+	// Simple selection: take first M candidates
+	return candidates[:m]
+}
+
+// searchKNNHybridWithThreshold searches for k nearest neighbors with early stopping
+// Stops immediately when finding a match above the similarity threshold
+// This is optimal for semantic cache where we only need the first good match
+func (h *HybridCache) searchKNNHybridWithThreshold(query []float32, k int, ef int, threshold float32) []searchResult {
+	// Lock is already held by caller (mu.RLock())
+
+	if h.hnswIndex.entryPoint == -1 || len(h.embeddings) == 0 {
+		return nil
+	}
+
+	// Search from top layer down to layer 1 for navigation
+	currNearest := []int{h.hnswIndex.entryPoint}
+
+	for lc := h.hnswIndex.maxLayer; lc > 0; lc-- {
+		currNearest = h.searchLayerHybrid(query, 1, lc, currNearest)
+	}
+
+	// Search at layer 0 with early stopping at threshold
+	candidateIndices := h.searchLayerHybridWithEarlyStop(query, ef, 0, currNearest, threshold)
+
+	// Convert to searchResults with similarity scores
+	results := make([]searchResult, 0, len(candidateIndices))
+	for _, idx := range candidateIndices {
+		if idx >= 0 && idx < len(h.embeddings) {
+			similarity := dotProductSIMD(query, h.embeddings[idx])
+
+			// Return immediately if we found a match above threshold
+			if similarity >= threshold {
+				results = append(results, searchResult{
+					index:      idx,
+					similarity: similarity,
+				})
+				return results
+			}
+
+			results = append(results, searchResult{
+				index:      idx,
+				similarity: similarity,
+			})
+		}
+	}
+
+	// Return top k (or fewer if early stopped)
+	if len(results) > k {
+		return results[:k]
+	}
+	return results
+}
+
+// searchLayerHybridWithEarlyStop searches a layer and stops when finding a match above threshold
+func (h *HybridCache) searchLayerHybridWithEarlyStop(query []float32, ef int, layer int, entryPoints []int, threshold float32) []int {
+	buf := getSearchBuffers()
+	defer putSearchBuffers(buf)
+
+	visited := buf.visited
+	candidates := buf.candidates
+	results := buf.results
+
+	for _, ep := range entryPoints {
+		if ep < 0 || ep >= len(h.embeddings) {
+			continue
+		}
+		dist := -dotProductSIMD(query, h.embeddings[ep])
+		candidates.push(ep, dist)
+		results.push(ep, dist)
+		visited[ep] = true
+
+		// Check if this entry point already meets the threshold
+		if -dist >= threshold {
+			return []int{ep}
+		}
+	}
+
+	for len(candidates.data) > 0 {
+		currentIdx, currentDist := candidates.pop()
+		if len(results.data) > 0 && currentDist > -results.data[0].dist {
+			break
+		}
+
+		currentNode := h.hnswIndex.nodeIndex[currentIdx]
+		if currentNode == nil || currentNode.neighbors[layer] == nil {
+			continue
+		}
+
+		for _, neighborID := range currentNode.neighbors[layer] {
+			if visited[neighborID] || neighborID < 0 || neighborID >= len(h.embeddings) {
+				continue
+			}
+			visited[neighborID] = true
+
+			similarity := dotProductSIMD(query, h.embeddings[neighborID])
+			dist := -similarity
+
+			// Stop if this neighbor meets the threshold
+			if similarity >= threshold {
+				return []int{neighborID}
+			}
+
+			if len(results.data) < ef || dist < -results.data[0].dist {
+				candidates.push(neighborID, dist)
+				results.push(neighborID, dist)
+
+				if len(results.data) > ef {
+					results.pop()
+				}
+			}
+		}
+	}
+
+	// Extract IDs (sorted by similarity)
+	resultIDs := make([]int, 0, len(results.data))
+	for len(results.data) > 0 {
+		idx, _ := results.pop()
+		resultIDs = append(resultIDs, idx)
+	}
+
+	// Reverse in place
+	for i, j := 0, len(resultIDs)-1; i < j; i, j = i+1, j-1 {
+		resultIDs[i], resultIDs[j] = resultIDs[j], resultIDs[i]
+	}
+
+	return resultIDs
+}
diff --git a/src/semantic-router/pkg/cache/hybrid_cache_test.go b/src/semantic-router/pkg/cache/hybrid_cache_test.go
new file mode 100644
index 00000000..38ae188e
--- /dev/null
+++ b/src/semantic-router/pkg/cache/hybrid_cache_test.go
@@ -0,0 +1,447 @@
+//go:build !windows && cgo
+// +build !windows,cgo
+
+package cache
+
+import (
+	"fmt"
+	"os"
+	"testing"
+	"time"
+)
+
+// TestHybridCacheDisabled tests that disabled hybrid cache returns immediately
+func TestHybridCacheDisabled(t *testing.T) {
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled: false,
+	})
+	if err != nil {
+		t.Fatalf("Failed to create disabled cache: %v", err)
+	}
+	defer cache.Close()
+
+	if cache.IsEnabled() {
+		t.Error("Cache should be disabled")
+	}
+
+	// All operations should be no-ops
+	err = cache.AddEntry("req1", "model1", "test query", []byte("request"), []byte("response"))
+	if err != nil {
+		t.Errorf("AddEntry should not error on disabled cache: %v", err)
+	}
+
+	_, found, err := cache.FindSimilar("model1", "test query")
+	if err != nil {
+		t.Errorf("FindSimilar should not error on disabled cache: %v", err)
+	}
+	if found {
+		t.Error("FindSimilar should not find anything on disabled cache")
+	}
+}
+
+// TestHybridCacheBasicOperations tests basic cache operations
+func TestHybridCacheBasicOperations(t *testing.T) {
+	// Skip if Milvus is not configured
+	if os.Getenv("MILVUS_URI") == "" {
+		t.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	// Create a test Milvus config
+	milvusConfig := "/tmp/test_milvus_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "test_hybrid_cache"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+  params:
+    M: 16
+    efConstruction: 200
+`), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    100,
+		HNSWM:               16,
+		HNSWEfConstruction:  200,
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		t.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	if !cache.IsEnabled() {
+		t.Fatal("Cache should be enabled")
+	}
+
+	// Test AddEntry
+	testQuery := "What is the meaning of life?"
+	testResponse := []byte(`{"response": "42"}`)
+
+	err = cache.AddEntry("req1", "gpt-4", testQuery, []byte("{}"), testResponse)
+	if err != nil {
+		t.Fatalf("Failed to add entry: %v", err)
+	}
+
+	// Verify stats
+	stats := cache.GetStats()
+	if stats.TotalEntries != 1 {
+		t.Errorf("Expected 1 entry, got %d", stats.TotalEntries)
+	}
+
+	// Test FindSimilar with exact same query (should hit)
+	time.Sleep(100 * time.Millisecond) // Allow indexing to complete
+
+	response, found, err := cache.FindSimilar("gpt-4", testQuery)
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Error("Expected to find cached entry")
+	}
+	if string(response) != string(testResponse) {
+		t.Errorf("Response mismatch: got %s, want %s", string(response), string(testResponse))
+	}
+
+	// Test FindSimilar with similar query (should hit)
+	response, found, err = cache.FindSimilar("gpt-4", "What's the meaning of life?")
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Error("Expected to find similar cached entry")
+	}
+
+	// Test FindSimilar with dissimilar query (should miss)
+	_, found, err = cache.FindSimilar("gpt-4", "How to cook pasta?")
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if found {
+		t.Error("Should not find dissimilar query")
+	}
+
+	// Verify updated stats
+	stats = cache.GetStats()
+	if stats.HitCount < 1 {
+		t.Errorf("Expected at least 1 hit, got %d", stats.HitCount)
+	}
+	if stats.MissCount < 1 {
+		t.Errorf("Expected at least 1 miss, got %d", stats.MissCount)
+	}
+}
+
+// TestHybridCachePendingRequest tests pending request flow
+func TestHybridCachePendingRequest(t *testing.T) {
+	// Skip if Milvus is not configured
+	if os.Getenv("MILVUS_URI") == "" {
+		t.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	milvusConfig := "/tmp/test_milvus_pending_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "test_hybrid_pending"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+`), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    100,
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		t.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	// Add pending request
+	testQuery := "Explain quantum computing"
+	err = cache.AddPendingRequest("req1", "gpt-4", testQuery, []byte("{}"))
+	if err != nil {
+		t.Fatalf("Failed to add pending request: %v", err)
+	}
+
+	// Update with response
+	testResponse := []byte(`{"answer": "Quantum computing uses qubits..."}`)
+	err = cache.UpdateWithResponse("req1", testResponse)
+	if err != nil {
+		t.Fatalf("Failed to update with response: %v", err)
+	}
+
+	// Wait for indexing
+	time.Sleep(100 * time.Millisecond)
+
+	// Try to find it
+	response, found, err := cache.FindSimilar("gpt-4", testQuery)
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Error("Expected to find cached entry after update")
+	}
+	if string(response) != string(testResponse) {
+		t.Errorf("Response mismatch: got %s, want %s", string(response), string(testResponse))
+	}
+}
+
+// TestHybridCacheEviction tests memory eviction behavior
+func TestHybridCacheEviction(t *testing.T) {
+	// Skip if Milvus is not configured
+	if os.Getenv("MILVUS_URI") == "" {
+		t.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	milvusConfig := "/tmp/test_milvus_eviction_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "test_hybrid_eviction"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+`), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	// Create cache with very small memory limit
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    5, // Only 5 entries in memory
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		t.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	// Add 10 entries (will trigger evictions)
+	for i := 0; i < 10; i++ {
+		query := fmt.Sprintf("Query number %d", i)
+		response := []byte(fmt.Sprintf(`{"answer": "Response %d"}`, i))
+		err = cache.AddEntry(fmt.Sprintf("req%d", i), "gpt-4", query, []byte("{}"), response)
+		if err != nil {
+			t.Fatalf("Failed to add entry %d: %v", i, err)
+		}
+	}
+
+	// Check that we have at most MaxMemoryEntries in HNSW
+	stats := cache.GetStats()
+	if stats.TotalEntries > 5 {
+		t.Errorf("Expected at most 5 entries in memory, got %d", stats.TotalEntries)
+	}
+
+	// All entries should still be in Milvus
+	// Try to find a recent entry (should be in memory)
+	time.Sleep(100 * time.Millisecond)
+	_, found, err := cache.FindSimilar("gpt-4", "Query number 9")
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Error("Expected to find recent entry")
+	}
+
+	// Try to find an old evicted entry (should be in Milvus)
+	_, found, err = cache.FindSimilar("gpt-4", "Query number 0")
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	// May or may not find it depending on Milvus indexing speed
+	// Just verify no error
+}
+
+// TestHybridCacheLocalCacheHit tests local cache hot path
+func TestHybridCacheLocalCacheHit(t *testing.T) {
+	// Skip if Milvus is not configured
+	if os.Getenv("MILVUS_URI") == "" {
+		t.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	milvusConfig := "/tmp/test_milvus_local_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "test_hybrid_local"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+`), 0644)
+	if err != nil {
+		t.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    100,
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		t.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	// Add an entry
+	testQuery := "What is machine learning?"
+	testResponse := []byte(`{"answer": "ML is..."}`)
+	err = cache.AddEntry("req1", "gpt-4", testQuery, []byte("{}"), testResponse)
+	if err != nil {
+		t.Fatalf("Failed to add entry: %v", err)
+	}
+
+	time.Sleep(100 * time.Millisecond)
+
+	// First search - should populate local cache
+	response, found, err := cache.FindSimilar("gpt-4", testQuery)
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Fatal("Expected to find entry")
+	}
+
+	// Second search - should hit local cache (much faster)
+	startTime := time.Now()
+	response, found, err = cache.FindSimilar("gpt-4", testQuery)
+	localLatency := time.Since(startTime)
+	if err != nil {
+		t.Fatalf("FindSimilar failed: %v", err)
+	}
+	if !found {
+		t.Fatal("Expected to find entry in local cache")
+	}
+	if string(response) != string(testResponse) {
+		t.Errorf("Response mismatch: got %s, want %s", string(response), string(testResponse))
+	}
+
+	// Local cache should be very fast (< 10ms)
+	if localLatency > 10*time.Millisecond {
+		t.Logf("Local cache hit took %v (expected < 10ms, but may vary)", localLatency)
+	}
+
+	stats := cache.GetStats()
+	if stats.HitCount < 2 {
+		t.Errorf("Expected at least 2 hits, got %d", stats.HitCount)
+	}
+}
+
+// BenchmarkHybridCacheAddEntry benchmarks adding entries to hybrid cache
+func BenchmarkHybridCacheAddEntry(b *testing.B) {
+	if os.Getenv("MILVUS_URI") == "" {
+		b.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	milvusConfig := "/tmp/bench_milvus_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "bench_hybrid_cache"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+`), 0644)
+	if err != nil {
+		b.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    10000,
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		b.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		query := fmt.Sprintf("Benchmark query number %d", i)
+		response := []byte(fmt.Sprintf(`{"answer": "Response %d"}`, i))
+		err := cache.AddEntry(fmt.Sprintf("req%d", i), "gpt-4", query, []byte("{}"), response)
+		if err != nil {
+			b.Fatalf("AddEntry failed: %v", err)
+		}
+	}
+}
+
+// BenchmarkHybridCacheFindSimilar benchmarks searching in hybrid cache
+func BenchmarkHybridCacheFindSimilar(b *testing.B) {
+	if os.Getenv("MILVUS_URI") == "" {
+		b.Skip("Skipping: MILVUS_URI not set")
+	}
+
+	milvusConfig := "/tmp/bench_milvus_search_config.yaml"
+	err := os.WriteFile(milvusConfig, []byte(`
+milvus:
+  address: "localhost:19530"
+  collection_name: "bench_hybrid_search"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+`), 0644)
+	if err != nil {
+		b.Fatalf("Failed to create test config: %v", err)
+	}
+	defer os.Remove(milvusConfig)
+
+	cache, err := NewHybridCache(HybridCacheOptions{
+		Enabled:             true,
+		SimilarityThreshold: 0.8,
+		TTLSeconds:          300,
+		MaxMemoryEntries:    1000,
+		MilvusConfigPath:    milvusConfig,
+	})
+	if err != nil {
+		b.Fatalf("Failed to create hybrid cache: %v", err)
+	}
+	defer cache.Close()
+
+	// Pre-populate cache
+	for i := 0; i < 100; i++ {
+		query := fmt.Sprintf("Benchmark query number %d", i)
+		response := []byte(fmt.Sprintf(`{"answer": "Response %d"}`, i))
+		err := cache.AddEntry(fmt.Sprintf("req%d", i), "gpt-4", query, []byte("{}"), response)
+		if err != nil {
+			b.Fatalf("AddEntry failed: %v", err)
+		}
+	}
+
+	time.Sleep(500 * time.Millisecond) // Allow indexing
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		query := fmt.Sprintf("Benchmark query number %d", i%100)
+		_, _, err := cache.FindSimilar("gpt-4", query)
+		if err != nil {
+			b.Fatalf("FindSimilar failed: %v", err)
+		}
+	}
+}
diff --git a/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go b/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
new file mode 100644
index 00000000..629e8900
--- /dev/null
+++ b/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
@@ -0,0 +1,869 @@
+//go:build milvus && !windows && cgo
+// +build milvus,!windows,cgo
+
+package cache
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"sync"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
+)
+
+// BenchmarkResult stores detailed benchmark metrics
+type BenchmarkResult struct {
+	CacheType           string
+	CacheSize           int
+	Operation           string
+	AvgLatencyNs        int64
+	AvgLatencyMs        float64
+	P50LatencyMs        float64
+	P95LatencyMs        float64
+	P99LatencyMs        float64
+	QPS                 float64
+	MemoryUsageMB       float64
+	HitRate             float64
+	DatabaseCalls       int64
+	TotalRequests       int64
+	DatabaseCallPercent float64
+}
+
+// LatencyDistribution tracks percentile latencies
+type LatencyDistribution struct {
+	latencies []time.Duration
+	mu        sync.Mutex
+}
+
+func (ld *LatencyDistribution) Record(latency time.Duration) {
+	ld.mu.Lock()
+	defer ld.mu.Unlock()
+	ld.latencies = append(ld.latencies, latency)
+}
+
+func (ld *LatencyDistribution) GetPercentile(p float64) float64 {
+	ld.mu.Lock()
+	defer ld.mu.Unlock()
+
+	if len(ld.latencies) == 0 {
+		return 0
+	}
+
+	// Sort latencies
+	sorted := make([]time.Duration, len(ld.latencies))
+	copy(sorted, ld.latencies)
+	for i := 0; i < len(sorted); i++ {
+		for j := i + 1; j < len(sorted); j++ {
+			if sorted[i] > sorted[j] {
+				sorted[i], sorted[j] = sorted[j], sorted[i]
+			}
+		}
+	}
+
+	idx := int(float64(len(sorted)) * p)
+	if idx >= len(sorted) {
+		idx = len(sorted) - 1
+	}
+
+	return float64(sorted[idx].Nanoseconds()) / 1e6
+}
+
+// DatabaseCallCounter tracks Milvus database calls
+type DatabaseCallCounter struct {
+	calls int64
+}
+
+func (dcc *DatabaseCallCounter) Increment() {
+	atomic.AddInt64(&dcc.calls, 1)
+}
+
+func (dcc *DatabaseCallCounter) Get() int64 {
+	return atomic.LoadInt64(&dcc.calls)
+}
+
+func (dcc *DatabaseCallCounter) Reset() {
+	atomic.StoreInt64(&dcc.calls, 0)
+}
+
+// getMilvusConfigPath returns the path to milvus.yaml config file
+func getMilvusConfigPath() string {
+	// Try absolute path first (for direct test execution)
+	configPath := "/home/ubuntu/rootfs/back/semantic-router.bak/config/cache/milvus.yaml"
+	if _, err := os.Stat(configPath); err == nil {
+		return configPath
+	}
+
+	// Try relative from project root (when run via make)
+	configPath = "config/cache/milvus.yaml"
+	if _, err := os.Stat(configPath); err == nil {
+		return configPath
+	}
+
+	// Fallback to relative from test directory
+	return "../../../../../config/cache/milvus.yaml"
+}
+
+// BenchmarkHybridVsMilvus is the comprehensive benchmark comparing hybrid cache vs pure Milvus
+// This validates the claims from the hybrid HNSW storage architecture paper
+func BenchmarkHybridVsMilvus(b *testing.B) {
+	// Initialize BERT model
+	useCPU := os.Getenv("USE_CPU") != "false"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Fatalf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Test configurations - realistic production scales
+	cacheSizes := []int{
+		10000,  // Medium: 10K entries
+		50000,  // Large: 50K entries
+		100000, // Extra Large: 100K entries
+	}
+
+	// CSV output file - save to project benchmark_results directory
+	// Determine project root by walking up from test directory
+	projectRoot := "/home/ubuntu/rootfs/back/semantic-router.bak"
+	if envRoot := os.Getenv("PROJECT_ROOT"); envRoot != "" {
+		projectRoot = envRoot
+	}
+	resultsDir := filepath.Join(projectRoot, "benchmark_results", "hybrid_vs_milvus")
+	os.MkdirAll(resultsDir, 0755)
+	timestamp := time.Now().Format("20060102_150405")
+	csvPath := filepath.Join(resultsDir, fmt.Sprintf("results_%s.csv", timestamp))
+	csvFile, err := os.Create(csvPath)
+	if err != nil {
+		b.Logf("Warning: Could not create CSV file at %s: %v", csvPath, err)
+	} else {
+		defer csvFile.Close()
+		b.Logf("Results will be saved to: %s", csvPath)
+		// Write CSV header
+		csvFile.WriteString("cache_type,cache_size,operation,avg_latency_ns,avg_latency_ms,p50_ms,p95_ms,p99_ms,qps,memory_mb,hit_rate,db_calls,total_requests,db_call_percent\n")
+	}
+
+	b.Logf("=== Hybrid Cache vs Pure Milvus Benchmark ===")
+	b.Logf("")
+
+	for _, cacheSize := range cacheSizes {
+		b.Run(fmt.Sprintf("CacheSize_%d", cacheSize), func(b *testing.B) {
+			// Generate test queries
+			b.Logf("Generating %d test queries...", cacheSize)
+			testQueries := make([]string, cacheSize)
+			for i := 0; i < cacheSize; i++ {
+				testQueries[i] = generateQuery(MediumContent, i)
+			}
+
+			// Test two realistic hit rate scenarios
+			scenarios := []struct {
+				name    string
+				hitRate float64
+			}{
+				{"HitRate_5pct", 0.05},  // 5% hit rate - very realistic for semantic cache
+				{"HitRate_20pct", 0.20}, // 20% hit rate - optimistic but realistic
+			}
+
+			// Generate search queries for each scenario
+			allSearchQueries := make(map[string][]string)
+			for _, scenario := range scenarios {
+				queries := make([]string, 100)
+				hitCount := int(scenario.hitRate * 100)
+
+				// Hits: reuse cached queries
+				for i := 0; i < hitCount; i++ {
+					queries[i] = testQueries[i%cacheSize]
+				}
+
+				// Misses: generate new queries
+				for i := hitCount; i < 100; i++ {
+					queries[i] = generateQuery(MediumContent, cacheSize+i)
+				}
+
+				allSearchQueries[scenario.name] = queries
+				b.Logf("Generated queries for %s: %d hits, %d misses",
+					scenario.name, hitCount, 100-hitCount)
+			}
+
+			// ============================================================
+			// 1. Benchmark Pure Milvus Cache (Optional via SKIP_MILVUS env var)
+			// ============================================================
+			b.Run("Milvus", func(b *testing.B) {
+				if os.Getenv("SKIP_MILVUS") == "true" {
+					b.Skip("Skipping Milvus benchmark (SKIP_MILVUS=true)")
+					return
+				}
+				b.Logf("\n=== Testing Pure Milvus Cache ===")
+
+				milvusCache, err := NewMilvusCache(MilvusCacheOptions{
+					Enabled:             true,
+					SimilarityThreshold: 0.80,
+					TTLSeconds:          3600,
+					ConfigPath:          getMilvusConfigPath(),
+				})
+				if err != nil {
+					b.Fatalf("Failed to create Milvus cache: %v", err)
+				}
+				defer milvusCache.Close()
+
+				// Wait for Milvus to be ready
+				time.Sleep(2 * time.Second)
+
+				// Populate cache using batch insert for speed
+				b.Logf("Populating Milvus with %d entries (using batch insert)...", cacheSize)
+				populateStart := time.Now()
+
+				// Prepare all entries
+				entries := make([]CacheEntry, cacheSize)
+				for i := 0; i < cacheSize; i++ {
+					entries[i] = CacheEntry{
+						RequestID:    fmt.Sprintf("req-milvus-%d", i),
+						Model:        "test-model",
+						Query:        testQueries[i],
+						RequestBody:  []byte(fmt.Sprintf("request-%d", i)),
+						ResponseBody: []byte(fmt.Sprintf("response-%d-this-is-a-longer-response-body-to-simulate-realistic-llm-output", i)),
+					}
+				}
+
+				// Insert in batches of 100
+				batchSize := 100
+				for i := 0; i < cacheSize; i += batchSize {
+					end := i + batchSize
+					if end > cacheSize {
+						end = cacheSize
+					}
+
+					err := milvusCache.AddEntriesBatch(entries[i:end])
+					if err != nil {
+						b.Fatalf("Failed to add batch: %v", err)
+					}
+
+					if (i+batchSize)%1000 == 0 {
+						b.Logf("  Populated %d/%d entries", i+batchSize, cacheSize)
+					}
+				}
+
+				// Flush once after all batches
+				b.Logf("Flushing Milvus...")
+				if err := milvusCache.Flush(); err != nil {
+					b.Logf("Warning: flush failed: %v", err)
+				}
+
+				populateTime := time.Since(populateStart)
+				b.Logf("✓ Populated in %v (%.0f entries/sec)", populateTime, float64(cacheSize)/populateTime.Seconds())
+
+				// Wait for Milvus to be ready
+				time.Sleep(2 * time.Second)
+
+				// Test each hit rate scenario
+				for _, scenario := range scenarios {
+					searchQueries := allSearchQueries[scenario.name]
+
+					b.Run(scenario.name, func(b *testing.B) {
+						// Benchmark search operations
+						b.Logf("Running search benchmark for %s...", scenario.name)
+						latencyDist := &LatencyDistribution{latencies: make([]time.Duration, 0, b.N)}
+						dbCallCounter := &DatabaseCallCounter{}
+						hits := 0
+						misses := 0
+
+						b.ResetTimer()
+						start := time.Now()
+
+						for i := 0; i < b.N; i++ {
+							queryIdx := i % len(searchQueries)
+							searchStart := time.Now()
+
+							// Every Milvus FindSimilar is a database call
+							dbCallCounter.Increment()
+
+							_, found, err := milvusCache.FindSimilar("test-model", searchQueries[queryIdx])
+							searchLatency := time.Since(searchStart)
+
+							if err != nil {
+								b.Logf("Warning: search error at iteration %d: %v", i, err)
+							}
+
+							latencyDist.Record(searchLatency)
+
+							if found {
+								hits++
+							} else {
+								misses++
+							}
+						}
+
+						elapsed := time.Since(start)
+						b.StopTimer()
+
+						// Calculate metrics
+						avgLatencyNs := elapsed.Nanoseconds() / int64(b.N)
+						avgLatencyMs := float64(avgLatencyNs) / 1e6
+						qps := float64(b.N) / elapsed.Seconds()
+						hitRate := float64(hits) / float64(b.N) * 100
+						dbCalls := dbCallCounter.Get()
+						dbCallPercent := float64(dbCalls) / float64(b.N) * 100
+
+						// Memory usage estimation
+						memUsageMB := estimateMilvusMemory(cacheSize)
+
+						result := BenchmarkResult{
+							CacheType:           "milvus",
+							CacheSize:           cacheSize,
+							Operation:           "search",
+							AvgLatencyNs:        avgLatencyNs,
+							AvgLatencyMs:        avgLatencyMs,
+							P50LatencyMs:        latencyDist.GetPercentile(0.50),
+							P95LatencyMs:        latencyDist.GetPercentile(0.95),
+							P99LatencyMs:        latencyDist.GetPercentile(0.99),
+							QPS:                 qps,
+							MemoryUsageMB:       memUsageMB,
+							HitRate:             hitRate,
+							DatabaseCalls:       dbCalls,
+							TotalRequests:       int64(b.N),
+							DatabaseCallPercent: dbCallPercent,
+						}
+
+						// Report results
+						b.Logf("\n--- Milvus Results (%s) ---", scenario.name)
+						b.Logf("Avg Latency: %.2f ms", avgLatencyMs)
+						b.Logf("P50: %.2f ms, P95: %.2f ms, P99: %.2f ms", result.P50LatencyMs, result.P95LatencyMs, result.P99LatencyMs)
+						b.Logf("QPS: %.0f", qps)
+						b.Logf("Hit Rate: %.1f%% (expected: %.0f%%)", hitRate, scenario.hitRate*100)
+						b.Logf("Hits: %d, Misses: %d out of %d total", hits, misses, b.N)
+						b.Logf("Database Calls: %d/%d (%.0f%%)", dbCalls, b.N, dbCallPercent)
+						b.Logf("Memory Usage: %.1f MB", memUsageMB)
+
+						// Write to CSV
+						if csvFile != nil {
+							writeBenchmarkResultToCSV(csvFile, result)
+						}
+
+						b.ReportMetric(avgLatencyMs, "ms/op")
+						b.ReportMetric(qps, "qps")
+						b.ReportMetric(hitRate, "hit_rate_%")
+					})
+				}
+			})
+
+			// ============================================================
+			// 2. Benchmark Hybrid Cache
+			// ============================================================
+			b.Run("Hybrid", func(b *testing.B) {
+				b.Logf("\n=== Testing Hybrid Cache ===")
+
+				hybridCache, err := NewHybridCache(HybridCacheOptions{
+					Enabled:             true,
+					SimilarityThreshold: 0.80,
+					TTLSeconds:          3600,
+					MaxMemoryEntries:    cacheSize,
+					HNSWM:               16,
+					HNSWEfConstruction:  200,
+					MilvusConfigPath:    getMilvusConfigPath(),
+				})
+				if err != nil {
+					b.Fatalf("Failed to create Hybrid cache: %v", err)
+				}
+				defer hybridCache.Close()
+
+				// Wait for initialization
+				time.Sleep(2 * time.Second)
+
+				// Populate cache using batch insert for speed
+				b.Logf("Populating Hybrid cache with %d entries (using batch insert)...", cacheSize)
+				populateStart := time.Now()
+
+				// Prepare all entries
+				entries := make([]CacheEntry, cacheSize)
+				for i := 0; i < cacheSize; i++ {
+					entries[i] = CacheEntry{
+						RequestID:    fmt.Sprintf("req-hybrid-%d", i),
+						Model:        "test-model",
+						Query:        testQueries[i],
+						RequestBody:  []byte(fmt.Sprintf("request-%d", i)),
+						ResponseBody: []byte(fmt.Sprintf("response-%d-this-is-a-longer-response-body-to-simulate-realistic-llm-output", i)),
+					}
+				}
+
+				// Insert in batches of 100
+				batchSize := 100
+				for i := 0; i < cacheSize; i += batchSize {
+					end := i + batchSize
+					if end > cacheSize {
+						end = cacheSize
+					}
+
+					err := hybridCache.AddEntriesBatch(entries[i:end])
+					if err != nil {
+						b.Fatalf("Failed to add batch: %v", err)
+					}
+
+					if (i+batchSize)%1000 == 0 {
+						b.Logf("  Populated %d/%d entries", i+batchSize, cacheSize)
+					}
+				}
+
+				// Flush once after all batches
+				b.Logf("Flushing Milvus...")
+				if err := hybridCache.Flush(); err != nil {
+					b.Logf("Warning: flush failed: %v", err)
+				}
+
+				populateTime := time.Since(populateStart)
+				b.Logf("✓ Populated in %v (%.0f entries/sec)", populateTime, float64(cacheSize)/populateTime.Seconds())
+
+				// Wait for Milvus to be ready
+				time.Sleep(2 * time.Second)
+
+				// Test each hit rate scenario
+				for _, scenario := range scenarios {
+					searchQueries := allSearchQueries[scenario.name]
+
+					b.Run(scenario.name, func(b *testing.B) {
+						// Get initial memory stats
+						var memBefore runtime.MemStats
+						runtime.ReadMemStats(&memBefore)
+
+						// Benchmark search operations
+						b.Logf("Running search benchmark for %s...", scenario.name)
+						latencyDist := &LatencyDistribution{latencies: make([]time.Duration, 0, b.N)}
+						hits := 0
+						misses := 0
+
+						// Track database calls (Hybrid should make fewer calls due to threshold filtering)
+						initialMilvusCallCount := hybridCache.milvusCache.hitCount + hybridCache.milvusCache.missCount
+
+						b.ResetTimer()
+						start := time.Now()
+
+						for i := 0; i < b.N; i++ {
+							queryIdx := i % len(searchQueries)
+							searchStart := time.Now()
+
+							_, found, err := hybridCache.FindSimilar("test-model", searchQueries[queryIdx])
+							searchLatency := time.Since(searchStart)
+
+							if err != nil {
+								b.Logf("Warning: search error at iteration %d: %v", i, err)
+							}
+
+							latencyDist.Record(searchLatency)
+
+							if found {
+								hits++
+							} else {
+								misses++
+							}
+						}
+
+						elapsed := time.Since(start)
+						b.StopTimer()
+
+						// Calculate database calls (both hits and misses involve Milvus calls)
+						finalMilvusCallCount := hybridCache.milvusCache.hitCount + hybridCache.milvusCache.missCount
+						dbCalls := finalMilvusCallCount - initialMilvusCallCount
+
+						// Get final memory stats
+						var memAfter runtime.MemStats
+						runtime.ReadMemStats(&memAfter)
+
+						// Fix: Prevent unsigned integer underflow if GC ran during benchmark
+						var memUsageMB float64
+						if memAfter.Alloc >= memBefore.Alloc {
+							memUsageMB = float64(memAfter.Alloc-memBefore.Alloc) / 1024 / 1024
+						} else {
+							// GC ran, use estimation instead
+							memUsageMB = estimateHybridMemory(cacheSize)
+						}
+
+						// Calculate metrics
+						avgLatencyNs := elapsed.Nanoseconds() / int64(b.N)
+						avgLatencyMs := float64(avgLatencyNs) / 1e6
+						qps := float64(b.N) / elapsed.Seconds()
+						hitRate := float64(hits) / float64(b.N) * 100
+						dbCallPercent := float64(dbCalls) / float64(b.N) * 100
+
+						result := BenchmarkResult{
+							CacheType:           "hybrid",
+							CacheSize:           cacheSize,
+							Operation:           "search",
+							AvgLatencyNs:        avgLatencyNs,
+							AvgLatencyMs:        avgLatencyMs,
+							P50LatencyMs:        latencyDist.GetPercentile(0.50),
+							P95LatencyMs:        latencyDist.GetPercentile(0.95),
+							P99LatencyMs:        latencyDist.GetPercentile(0.99),
+							QPS:                 qps,
+							MemoryUsageMB:       memUsageMB,
+							HitRate:             hitRate,
+							DatabaseCalls:       dbCalls,
+							TotalRequests:       int64(b.N),
+							DatabaseCallPercent: dbCallPercent,
+						}
+
+						// Report results
+						b.Logf("\n--- Hybrid Cache Results (%s) ---", scenario.name)
+						b.Logf("Avg Latency: %.2f ms", avgLatencyMs)
+						b.Logf("P50: %.2f ms, P95: %.2f ms, P99: %.2f ms", result.P50LatencyMs, result.P95LatencyMs, result.P99LatencyMs)
+						b.Logf("QPS: %.0f", qps)
+						b.Logf("Hit Rate: %.1f%% (expected: %.0f%%)", hitRate, scenario.hitRate*100)
+						b.Logf("Hits: %d, Misses: %d out of %d total", hits, misses, b.N)
+						b.Logf("Database Calls: %d/%d (%.0f%%)", dbCalls, b.N, dbCallPercent)
+						b.Logf("Memory Usage: %.1f MB", memUsageMB)
+
+						// Write to CSV
+						if csvFile != nil {
+							writeBenchmarkResultToCSV(csvFile, result)
+						}
+
+						b.ReportMetric(avgLatencyMs, "ms/op")
+						b.ReportMetric(qps, "qps")
+						b.ReportMetric(hitRate, "hit_rate_%")
+						b.ReportMetric(dbCallPercent, "db_call_%")
+					})
+				}
+			})
+		})
+	}
+}
+
+// BenchmarkComponentLatency measures individual component latencies
+func BenchmarkComponentLatency(b *testing.B) {
+	// Initialize BERT model
+	useCPU := os.Getenv("USE_CPU") != "false"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Fatalf("Failed to initialize BERT model: %v", err)
+	}
+
+	cacheSize := 10000
+	testQueries := make([]string, cacheSize)
+	for i := 0; i < cacheSize; i++ {
+		testQueries[i] = generateQuery(MediumContent, i)
+	}
+
+	b.Run("EmbeddingGeneration", func(b *testing.B) {
+		query := testQueries[0]
+		b.ResetTimer()
+		start := time.Now()
+		for i := 0; i < b.N; i++ {
+			_, err := candle_binding.GetEmbedding(query, 0)
+			if err != nil {
+				b.Fatal(err)
+			}
+		}
+		elapsed := time.Since(start)
+		avgMs := float64(elapsed.Nanoseconds()) / float64(b.N) / 1e6
+		b.Logf("Embedding generation: %.2f ms/op", avgMs)
+		b.ReportMetric(avgMs, "ms/op")
+	})
+
+	b.Run("HNSWSearch", func(b *testing.B) {
+		// Build HNSW index
+		cache := NewInMemoryCache(InMemoryCacheOptions{
+			Enabled:             true,
+			SimilarityThreshold: 0.80,
+			MaxEntries:          cacheSize,
+			UseHNSW:             true,
+			HNSWM:               16,
+			HNSWEfConstruction:  200,
+		})
+
+		b.Logf("Building HNSW index with %d entries...", cacheSize)
+		for i := 0; i < cacheSize; i++ {
+			cache.AddEntry(fmt.Sprintf("req-%d", i), "model", testQueries[i], []byte("req"), []byte("resp"))
+		}
+		b.Logf("✓ HNSW index built")
+
+		query := testQueries[0]
+
+		b.ResetTimer()
+		start := time.Now()
+		for i := 0; i < b.N; i++ {
+			// Note: HNSW search uses entries slice internally
+			cache.FindSimilar("model", query)
+		}
+		elapsed := time.Since(start)
+		avgMs := float64(elapsed.Nanoseconds()) / float64(b.N) / 1e6
+		b.Logf("HNSW search: %.2f ms/op", avgMs)
+		b.ReportMetric(avgMs, "ms/op")
+	})
+
+	b.Run("MilvusVectorSearch", func(b *testing.B) {
+		milvusCache, err := NewMilvusCache(MilvusCacheOptions{
+			Enabled:             true,
+			SimilarityThreshold: 0.80,
+			TTLSeconds:          3600,
+			ConfigPath:          getMilvusConfigPath(),
+		})
+		if err != nil {
+			b.Fatalf("Failed to create Milvus cache: %v", err)
+		}
+		defer milvusCache.Close()
+
+		time.Sleep(2 * time.Second)
+
+		b.Logf("Populating Milvus with %d entries...", cacheSize)
+		for i := 0; i < cacheSize; i++ {
+			milvusCache.AddEntry(fmt.Sprintf("req-%d", i), "model", testQueries[i], []byte("req"), []byte("resp"))
+		}
+		time.Sleep(2 * time.Second)
+		b.Logf("✓ Milvus populated")
+
+		query := testQueries[0]
+
+		b.ResetTimer()
+		start := time.Now()
+		for i := 0; i < b.N; i++ {
+			milvusCache.FindSimilar("model", query)
+		}
+		elapsed := time.Since(start)
+		avgMs := float64(elapsed.Nanoseconds()) / float64(b.N) / 1e6
+		b.Logf("Milvus vector search: %.2f ms/op", avgMs)
+		b.ReportMetric(avgMs, "ms/op")
+	})
+
+	b.Run("MilvusGetByID", func(b *testing.B) {
+		// This would test Milvus get by ID if we exposed that method
+		b.Skip("Milvus GetByID not exposed in current implementation")
+	})
+}
+
+// BenchmarkThroughputUnderLoad tests throughput with concurrent requests
+func BenchmarkThroughputUnderLoad(b *testing.B) {
+	// Initialize BERT model
+	useCPU := os.Getenv("USE_CPU") != "false"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Fatalf("Failed to initialize BERT model: %v", err)
+	}
+
+	cacheSize := 10000
+	concurrencyLevels := []int{1, 10, 50, 100}
+
+	testQueries := make([]string, cacheSize)
+	for i := 0; i < cacheSize; i++ {
+		testQueries[i] = generateQuery(MediumContent, i)
+	}
+
+	for _, concurrency := range concurrencyLevels {
+		b.Run(fmt.Sprintf("Milvus_Concurrency_%d", concurrency), func(b *testing.B) {
+			milvusCache, err := NewMilvusCache(MilvusCacheOptions{
+				Enabled:             true,
+				SimilarityThreshold: 0.80,
+				TTLSeconds:          3600,
+				ConfigPath:          getMilvusConfigPath(),
+			})
+			if err != nil {
+				b.Fatalf("Failed to create Milvus cache: %v", err)
+			}
+			defer milvusCache.Close()
+
+			time.Sleep(2 * time.Second)
+
+			// Populate
+			for i := 0; i < cacheSize; i++ {
+				milvusCache.AddEntry(fmt.Sprintf("req-%d", i), "model", testQueries[i], []byte("req"), []byte("resp"))
+			}
+			time.Sleep(2 * time.Second)
+
+			b.ResetTimer()
+			b.SetParallelism(concurrency)
+			start := time.Now()
+
+			b.RunParallel(func(pb *testing.PB) {
+				i := 0
+				for pb.Next() {
+					query := testQueries[i%len(testQueries)]
+					milvusCache.FindSimilar("model", query)
+					i++
+				}
+			})
+
+			elapsed := time.Since(start)
+			qps := float64(b.N) / elapsed.Seconds()
+			b.Logf("QPS with %d concurrent workers: %.0f", concurrency, qps)
+			b.ReportMetric(qps, "qps")
+		})
+
+		b.Run(fmt.Sprintf("Hybrid_Concurrency_%d", concurrency), func(b *testing.B) {
+			hybridCache, err := NewHybridCache(HybridCacheOptions{
+				Enabled:             true,
+				SimilarityThreshold: 0.80,
+				TTLSeconds:          3600,
+				MaxMemoryEntries:    cacheSize,
+				HNSWM:               16,
+				HNSWEfConstruction:  200,
+				MilvusConfigPath:    getMilvusConfigPath(),
+			})
+			if err != nil {
+				b.Fatalf("Failed to create Hybrid cache: %v", err)
+			}
+			defer hybridCache.Close()
+
+			time.Sleep(2 * time.Second)
+
+			// Populate
+			for i := 0; i < cacheSize; i++ {
+				hybridCache.AddEntry(fmt.Sprintf("req-%d", i), "model", testQueries[i], []byte("req"), []byte("resp"))
+			}
+			time.Sleep(2 * time.Second)
+
+			b.ResetTimer()
+			b.SetParallelism(concurrency)
+			start := time.Now()
+
+			b.RunParallel(func(pb *testing.PB) {
+				i := 0
+				for pb.Next() {
+					query := testQueries[i%len(testQueries)]
+					hybridCache.FindSimilar("model", query)
+					i++
+				}
+			})
+
+			elapsed := time.Since(start)
+			qps := float64(b.N) / elapsed.Seconds()
+			b.Logf("QPS with %d concurrent workers: %.0f", concurrency, qps)
+			b.ReportMetric(qps, "qps")
+		})
+	}
+}
+
+// Helper functions
+
+func estimateMilvusMemory(cacheSize int) float64 {
+	// Milvus memory estimation (rough)
+	// - Embeddings: cacheSize × 384 × 4 bytes
+	// - HNSW index: cacheSize × 16 × 2 × 4 bytes (M=16, bidirectional)
+	// - Metadata: cacheSize × 0.5 KB
+	embeddingMB := float64(cacheSize*384*4) / 1024 / 1024
+	indexMB := float64(cacheSize*16*2*4) / 1024 / 1024
+	metadataMB := float64(cacheSize) * 0.5 / 1024
+	return embeddingMB + indexMB + metadataMB
+}
+
+func estimateHybridMemory(cacheSize int) float64 {
+	// Hybrid memory estimation (in-memory HNSW only, documents in Milvus)
+	// - Embeddings: cacheSize × 384 × 4 bytes
+	// - HNSW index: cacheSize × 16 × 2 × 4 bytes (M=16, bidirectional)
+	// - ID map: cacheSize × 50 bytes (average string length)
+	embeddingMB := float64(cacheSize*384*4) / 1024 / 1024
+	indexMB := float64(cacheSize*16*2*4) / 1024 / 1024
+	idMapMB := float64(cacheSize*50) / 1024 / 1024
+	return embeddingMB + indexMB + idMapMB
+}
+
+func writeBenchmarkResultToCSV(file *os.File, result BenchmarkResult) {
+	line := fmt.Sprintf("%s,%d,%s,%d,%.3f,%.3f,%.3f,%.3f,%.0f,%.1f,%.1f,%d,%d,%.1f\n",
+		result.CacheType,
+		result.CacheSize,
+		result.Operation,
+		result.AvgLatencyNs,
+		result.AvgLatencyMs,
+		result.P50LatencyMs,
+		result.P95LatencyMs,
+		result.P99LatencyMs,
+		result.QPS,
+		result.MemoryUsageMB,
+		result.HitRate,
+		result.DatabaseCalls,
+		result.TotalRequests,
+		result.DatabaseCallPercent,
+	)
+	file.WriteString(line)
+}
+
+// TestHybridVsMilvusSmoke is a quick smoke test to verify both caches work
+func TestHybridVsMilvusSmoke(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping smoke test in short mode")
+	}
+
+	// Initialize BERT model
+	useCPU := os.Getenv("USE_CPU") != "false"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		t.Fatalf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Test Milvus cache
+	t.Run("Milvus", func(t *testing.T) {
+		cache, err := NewMilvusCache(MilvusCacheOptions{
+			Enabled:             true,
+			SimilarityThreshold: 0.85,
+			TTLSeconds:          3600,
+			ConfigPath:          getMilvusConfigPath(),
+		})
+		if err != nil {
+			t.Fatalf("Failed to create Milvus cache: %v", err)
+		}
+		defer cache.Close()
+
+		time.Sleep(1 * time.Second)
+
+		// Add entry
+		err = cache.AddEntry("req-1", "model", "What is machine learning?", []byte("req"), []byte("ML is..."))
+		if err != nil {
+			t.Fatalf("Failed to add entry: %v", err)
+		}
+
+		time.Sleep(1 * time.Second)
+
+		// Find similar
+		resp, found, err := cache.FindSimilar("model", "What is machine learning?")
+		if err != nil {
+			t.Fatalf("FindSimilar failed: %v", err)
+		}
+		if !found {
+			t.Fatalf("Expected to find entry, but got miss")
+		}
+		if string(resp) != "ML is..." {
+			t.Fatalf("Expected 'ML is...', got '%s'", string(resp))
+		}
+
+		t.Logf("✓ Milvus cache smoke test passed")
+	})
+
+	// Test Hybrid cache
+	t.Run("Hybrid", func(t *testing.T) {
+		cache, err := NewHybridCache(HybridCacheOptions{
+			Enabled:             true,
+			SimilarityThreshold: 0.85,
+			TTLSeconds:          3600,
+			MaxMemoryEntries:    1000,
+			HNSWM:               16,
+			HNSWEfConstruction:  200,
+			MilvusConfigPath:    getMilvusConfigPath(),
+		})
+		if err != nil {
+			t.Fatalf("Failed to create Hybrid cache: %v", err)
+		}
+		defer cache.Close()
+
+		time.Sleep(1 * time.Second)
+
+		// Add entry
+		err = cache.AddEntry("req-1", "model", "What is deep learning?", []byte("req"), []byte("DL is..."))
+		if err != nil {
+			t.Fatalf("Failed to add entry: %v", err)
+		}
+
+		time.Sleep(1 * time.Second)
+
+		// Find similar
+		resp, found, err := cache.FindSimilar("model", "What is deep learning?")
+		if err != nil {
+			t.Fatalf("FindSimilar failed: %v", err)
+		}
+		if !found {
+			t.Fatalf("Expected to find entry, but got miss")
+		}
+		if string(resp) != "DL is..." {
+			t.Fatalf("Expected 'DL is...', got '%s'", string(resp))
+		}
+
+		t.Logf("✓ Hybrid cache smoke test passed")
+	})
+}
diff --git a/src/semantic-router/pkg/cache/inmemory_cache.go b/src/semantic-router/pkg/cache/inmemory_cache.go
index 5820c5f8..ca7e2c32 100644
--- a/src/semantic-router/pkg/cache/inmemory_cache.go
+++ b/src/semantic-router/pkg/cache/inmemory_cache.go
@@ -5,6 +5,7 @@ package cache
 
 import (
 	"fmt"
+	"math"
 	"sync"
 	"sync/atomic"
 	"time"
@@ -14,6 +15,26 @@ import (
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
 )
 
+// HNSWNode represents a node in the HNSW graph
+type HNSWNode struct {
+	entryIndex int           // Index into InMemoryCache.entries
+	neighbors  map[int][]int // Layer -> neighbor indices
+	maxLayer   int           // Highest layer this node appears in
+}
+
+// HNSWIndex implements Hierarchical Navigable Small World graph for fast ANN search
+type HNSWIndex struct {
+	nodes          []*HNSWNode
+	nodeIndex      map[int]*HNSWNode // entryIndex → node for O(1) lookup (critical for performance!)
+	entryPoint     int               // Index of the top-level entry point
+	maxLayer       int               // Maximum layer in the graph
+	efConstruction int               // Size of dynamic candidate list during construction
+	M              int               // Number of bi-directional links per node
+	Mmax           int               // Maximum number of connections per node (=M)
+	Mmax0          int               // Maximum number of connections for layer 0 (=M*2)
+	ml             float64           // Normalization factor for level assignment
+}
+
 // InMemoryCache provides a high-performance semantic cache using BERT embeddings in memory
 type InMemoryCache struct {
 	entries             []CacheEntry
@@ -26,6 +47,9 @@ type InMemoryCache struct {
 	missCount           int64
 	lastCleanupTime     *time.Time
 	evictionPolicy      EvictionPolicy
+	hnswIndex           *HNSWIndex
+	useHNSW             bool
+	hnswEfSearch        int // Search-time ef parameter
 }
 
 // InMemoryCacheOptions contains configuration parameters for the in-memory cache
@@ -35,12 +59,16 @@ type InMemoryCacheOptions struct {
 	TTLSeconds          int
 	Enabled             bool
 	EvictionPolicy      EvictionPolicyType
+	UseHNSW             bool // Enable HNSW index for faster search
+	HNSWM               int  // Number of bi-directional links (default: 16)
+	HNSWEfConstruction  int  // Size of dynamic candidate list during construction (default: 200)
+	HNSWEfSearch        int  // Size of dynamic candidate list during search (default: 50)
 }
 
 // NewInMemoryCache initializes a new in-memory semantic cache instance
 func NewInMemoryCache(options InMemoryCacheOptions) *InMemoryCache {
-	observability.Debugf("Initializing in-memory cache: enabled=%t, maxEntries=%d, ttlSeconds=%d, threshold=%.3f, eviction_policy=%s",
-		options.Enabled, options.MaxEntries, options.TTLSeconds, options.SimilarityThreshold, options.EvictionPolicy)
+	observability.Debugf("Initializing in-memory cache: enabled=%t, maxEntries=%d, ttlSeconds=%d, threshold=%.3f, eviction_policy=%s, useHNSW=%t",
+		options.Enabled, options.MaxEntries, options.TTLSeconds, options.SimilarityThreshold, options.EvictionPolicy, options.UseHNSW)
 
 	var evictionPolicy EvictionPolicy
 	switch options.EvictionPolicy {
@@ -52,14 +80,38 @@ func NewInMemoryCache(options InMemoryCacheOptions) *InMemoryCache {
 		evictionPolicy = &FIFOPolicy{}
 	}
 
-	return &InMemoryCache{
+	// Set HNSW search ef parameter
+	efSearch := options.HNSWEfSearch
+	if efSearch <= 0 {
+		efSearch = 50 // Default value
+	}
+
+	cache := &InMemoryCache{
 		entries:             []CacheEntry{},
 		similarityThreshold: options.SimilarityThreshold,
 		maxEntries:          options.MaxEntries,
 		ttlSeconds:          options.TTLSeconds,
 		enabled:             options.Enabled,
 		evictionPolicy:      evictionPolicy,
+		useHNSW:             options.UseHNSW,
+		hnswEfSearch:        efSearch,
+	}
+
+	// Initialize HNSW index if enabled
+	if options.UseHNSW {
+		M := options.HNSWM
+		if M <= 0 {
+			M = 16 // Default value
+		}
+		efConstruction := options.HNSWEfConstruction
+		if efConstruction <= 0 {
+			efConstruction = 200 // Default value
+		}
+		cache.hnswIndex = newHNSWIndex(M, efConstruction)
+		observability.Debugf("HNSW index initialized: M=%d, efConstruction=%d", M, efConstruction)
 	}
+
+	return cache
 }
 
 // IsEnabled returns the current cache activation status
@@ -107,8 +159,15 @@ func (c *InMemoryCache) AddPendingRequest(requestID string, model string, query
 	}
 
 	c.entries = append(c.entries, entry)
-	observability.Debugf("InMemoryCache.AddPendingRequest: added pending entry (total entries: %d, embedding_dim: %d)",
-		len(c.entries), len(embedding))
+	entryIndex := len(c.entries) - 1
+
+	// Add to HNSW index if enabled
+	if c.useHNSW && c.hnswIndex != nil {
+		c.hnswIndex.addNode(entryIndex, embedding, c.entries)
+	}
+
+	observability.Debugf("InMemoryCache.AddPendingRequest: added pending entry (total entries: %d, embedding_dim: %d, useHNSW: %t)",
+		len(c.entries), len(embedding), c.useHNSW)
 
 	// Record metrics
 	metrics.RecordCacheOperation("memory", "add_pending", "success", time.Since(start).Seconds())
@@ -192,12 +251,20 @@ func (c *InMemoryCache) AddEntry(requestID string, model string, query string, r
 	}
 
 	c.entries = append(c.entries, entry)
-	observability.Debugf("InMemoryCache.AddEntry: added complete entry (total entries: %d, request_size: %d, response_size: %d)",
-		len(c.entries), len(requestBody), len(responseBody))
+	entryIndex := len(c.entries) - 1
+
+	// Add to HNSW index if enabled
+	if c.useHNSW && c.hnswIndex != nil {
+		c.hnswIndex.addNode(entryIndex, embedding, c.entries)
+	}
+
+	observability.Debugf("InMemoryCache.AddEntry: added complete entry (total entries: %d, request_size: %d, response_size: %d, useHNSW: %t)",
+		len(c.entries), len(requestBody), len(responseBody), c.useHNSW)
 	observability.LogEvent("cache_entry_added", map[string]interface{}{
 		"backend": "memory",
 		"query":   query,
 		"model":   model,
+		"useHNSW": c.useHNSW,
 	})
 
 	// Record success metrics
@@ -245,36 +312,86 @@ func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, thr
 	// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
 	now := time.Now()
 
-	// Compare with completed entries for the same model, tracking only the best match
-	for entryIndex, entry := range c.entries {
-		// Skip incomplete entries
-		if entry.ResponseBody == nil {
-			continue
+	// Use HNSW index for fast search if enabled
+	if c.useHNSW && c.hnswIndex != nil && len(c.hnswIndex.nodes) > 0 {
+		// Search using HNSW index with configured ef parameter
+		candidateIndices := c.hnswIndex.searchKNN(queryEmbedding, 10, c.hnswEfSearch, c.entries)
+
+		// Filter candidates by model and expiration, then find best match
+		for _, entryIndex := range candidateIndices {
+			if entryIndex < 0 || entryIndex >= len(c.entries) {
+				continue
+			}
+
+			entry := c.entries[entryIndex]
+
+			// Skip incomplete entries
+			if entry.ResponseBody == nil {
+				continue
+			}
+
+			// Only consider entries for the same model
+			if entry.Model != model {
+				continue
+			}
+
+			// Skip entries that have expired before considering them
+			if c.isExpired(entry, now) {
+				expiredCount++
+				continue
+			}
+
+			// Compute semantic similarity using dot product
+			var dotProduct float32
+			for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ {
+				dotProduct += queryEmbedding[i] * entry.Embedding[i]
+			}
+
+			entriesChecked++
+			if bestIndex == -1 || dotProduct > bestSimilarity {
+				bestSimilarity = dotProduct
+				bestIndex = entryIndex
+			}
 		}
 
-		// Only consider entries for the same model
-		if entry.Model != model {
-			continue
+		observability.Debugf("InMemoryCache.FindSimilar: HNSW search checked %d candidates", len(candidateIndices))
+	} else {
+		// Fallback to linear search
+		for entryIndex, entry := range c.entries {
+			// Skip incomplete entries
+			if entry.ResponseBody == nil {
+				continue
+			}
+
+			// Only consider entries for the same model
+			if entry.Model != model {
+				continue
+			}
+
+			// Skip entries that have expired before considering them
+			if c.isExpired(entry, now) {
+				expiredCount++
+				continue
+			}
+
+			// Compute semantic similarity using dot product
+			var dotProduct float32
+			for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ {
+				dotProduct += queryEmbedding[i] * entry.Embedding[i]
+			}
+
+			entriesChecked++
+			if bestIndex == -1 || dotProduct > bestSimilarity {
+				bestSimilarity = dotProduct
+				bestIndex = entryIndex
+			}
 		}
 
-		// Skip entries that have expired before considering them
-		if c.isExpired(entry, now) {
-			expiredCount++
-			continue
-		}
-
-		// Compute semantic similarity using dot product
-		var dotProduct float32
-		for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ {
-			dotProduct += queryEmbedding[i] * entry.Embedding[i]
-		}
-
-		entriesChecked++
-		if bestIndex == -1 || dotProduct > bestSimilarity {
-			bestSimilarity = dotProduct
-			bestIndex = entryIndex
+		if !c.useHNSW {
+			observability.Debugf("InMemoryCache.FindSimilar: Linear search used (HNSW disabled)")
 		}
 	}
+
 	// Snapshot the best entry before releasing the read lock
 	if bestIndex >= 0 {
 		bestEntry = c.entries[bestIndex]
@@ -415,6 +532,11 @@ func (c *InMemoryCache) cleanupExpiredEntries() {
 	cleanupTime := time.Now()
 	c.lastCleanupTime = &cleanupTime
 
+	// Rebuild HNSW index if entries were removed
+	if expiredCount > 0 && c.useHNSW && c.hnswIndex != nil {
+		c.rebuildHNSWIndex()
+	}
+
 	// Update metrics after cleanup
 	metrics.UpdateCacheEntries("memory", len(c.entries))
 }
@@ -460,6 +582,14 @@ func (c *InMemoryCache) evictOne() {
 
 	evictedRequestID := c.entries[victimIdx].RequestID
 
+	// If using HNSW, we need to rebuild the index after eviction
+	// For simplicity, we'll mark that a rebuild is needed
+	if c.useHNSW && c.hnswIndex != nil {
+		// Remove the node from HNSW index
+		// Note: HNSW doesn't support efficient deletion, so we'll rebuild on next search if needed
+		c.hnswIndex.markStale()
+	}
+
 	c.entries[victimIdx] = c.entries[len(c.entries)-1]
 	c.entries = c.entries[:len(c.entries)-1]
 
@@ -469,3 +599,369 @@ func (c *InMemoryCache) evictOne() {
 		"max_entries": c.maxEntries,
 	})
 }
+
+// ===== HNSW Index Implementation =====
+
+// rebuildHNSWIndex rebuilds the HNSW index from scratch
+// Caller must hold a write lock
+func (c *InMemoryCache) rebuildHNSWIndex() {
+	if c.hnswIndex == nil {
+		return
+	}
+
+	observability.Debugf("InMemoryCache: Rebuilding HNSW index with %d entries", len(c.entries))
+
+	// Clear the existing index
+	c.hnswIndex.nodes = []*HNSWNode{}
+	c.hnswIndex.nodeIndex = make(map[int]*HNSWNode) // Clear O(1) lookup map
+	c.hnswIndex.entryPoint = -1
+	c.hnswIndex.maxLayer = -1
+
+	// Rebuild by adding all entries
+	for i, entry := range c.entries {
+		if len(entry.Embedding) > 0 {
+			c.hnswIndex.addNode(i, entry.Embedding, c.entries)
+		}
+	}
+
+	observability.Debugf("InMemoryCache: HNSW index rebuilt with %d nodes", len(c.hnswIndex.nodes))
+}
+
+// newHNSWIndex creates a new HNSW index
+func newHNSWIndex(m, efConstruction int) *HNSWIndex {
+	return &HNSWIndex{
+		nodes:          []*HNSWNode{},
+		nodeIndex:      make(map[int]*HNSWNode), // Initialize O(1) lookup map
+		entryPoint:     -1,
+		maxLayer:       -1,
+		efConstruction: efConstruction,
+		M:              m,
+		Mmax:           m,
+		Mmax0:          m * 2,
+		ml:             1.0 / math.Log(float64(m)),
+	}
+}
+
+// markStale marks the index as needing a rebuild
+func (h *HNSWIndex) markStale() {
+	// Simple approach: clear the index
+	h.nodes = []*HNSWNode{}
+	h.nodeIndex = make(map[int]*HNSWNode) // Clear O(1) lookup map
+	h.entryPoint = -1
+	h.maxLayer = -1
+}
+
+// selectLevel randomly selects a level for a new node
+func (h *HNSWIndex) selectLevel() int {
+	// Use exponential decay probability
+	r := -math.Log(math.Max(1e-9, 1.0-float64(time.Now().UnixNano()%1000000)/1000000.0))
+	return int(r * h.ml)
+}
+
+// addNode adds a new node to the HNSW index
+func (h *HNSWIndex) addNode(entryIndex int, embedding []float32, entries []CacheEntry) {
+	level := h.selectLevel()
+
+	node := &HNSWNode{
+		entryIndex: entryIndex,
+		neighbors:  make(map[int][]int),
+		maxLayer:   level,
+	}
+
+	// If this is the first node, make it the entry point
+	if h.entryPoint == -1 {
+		h.entryPoint = entryIndex
+		h.maxLayer = level
+		h.nodes = append(h.nodes, node)
+		h.nodeIndex[entryIndex] = node // Add to O(1) lookup map
+		return
+	}
+
+	// Find nearest neighbors and connect
+	for lc := min(level, h.maxLayer); lc >= 0; lc-- {
+		candidates := h.searchLayer(embedding, h.entryPoint, h.efConstruction, lc, entries)
+
+		// Select M nearest neighbors
+		M := h.Mmax
+		if lc == 0 {
+			M = h.Mmax0
+		}
+		neighbors := h.selectNeighbors(candidates, M, entries)
+
+		// Add bidirectional links
+		node.neighbors[lc] = neighbors
+		for _, neighborIdx := range neighbors {
+			// Fast O(1) lookup using nodeIndex map
+			if n := h.nodeIndex[neighborIdx]; n != nil {
+				if n.neighbors[lc] == nil {
+					n.neighbors[lc] = []int{}
+				}
+				n.neighbors[lc] = append(n.neighbors[lc], entryIndex)
+
+				// Prune neighbors if needed
+				if len(n.neighbors[lc]) > M {
+					n.neighbors[lc] = h.selectNeighbors(n.neighbors[lc], M, entries)
+				}
+			}
+		}
+	}
+
+	// Update entry point if this node has a higher level
+	if level > h.maxLayer {
+		h.maxLayer = level
+		h.entryPoint = entryIndex
+	}
+
+	h.nodes = append(h.nodes, node)
+	h.nodeIndex[entryIndex] = node // Add to O(1) lookup map
+}
+
+// searchKNN performs k-nearest neighbor search
+func (h *HNSWIndex) searchKNN(queryEmbedding []float32, k, ef int, entries []CacheEntry) []int {
+	if h.entryPoint == -1 || len(h.nodes) == 0 {
+		return []int{}
+	}
+
+	// Search from top layer to layer 1
+	currentNearest := h.entryPoint
+	for lc := h.maxLayer; lc > 0; lc-- {
+		nearest := h.searchLayer(queryEmbedding, currentNearest, 1, lc, entries)
+		if len(nearest) > 0 {
+			currentNearest = nearest[0]
+		}
+	}
+
+	// Search at layer 0 with ef
+	return h.searchLayer(queryEmbedding, currentNearest, ef, 0, entries)
+}
+
+// searchLayer searches for nearest neighbors at a specific layer
+func (h *HNSWIndex) searchLayer(queryEmbedding []float32, entryPoint, ef, layer int, entries []CacheEntry) []int {
+	visited := make(map[int]bool)
+	candidates := newMaxHeap()
+	results := newMinHeap()
+
+	// Calculate distance to entry point
+	if entryPoint >= 0 && entryPoint < len(entries) {
+		dist := h.distance(queryEmbedding, entries[entryPoint].Embedding)
+		candidates.push(entryPoint, dist)
+		results.push(entryPoint, dist)
+		visited[entryPoint] = true
+	}
+
+	for candidates.len() > 0 {
+		currentIdx, currentDist := candidates.pop()
+
+		if results.len() > 0 {
+			worstDist := results.peekDist()
+			if currentDist > worstDist {
+				break
+			}
+		}
+
+		// Fast O(1) lookup using nodeIndex map
+		currentNode := h.nodeIndex[currentIdx]
+		if currentNode == nil || currentNode.neighbors[layer] == nil {
+			continue
+		}
+
+		// Check neighbors
+		for _, neighborIdx := range currentNode.neighbors[layer] {
+			if visited[neighborIdx] {
+				continue
+			}
+			visited[neighborIdx] = true
+
+			if neighborIdx >= 0 && neighborIdx < len(entries) {
+				dist := h.distance(queryEmbedding, entries[neighborIdx].Embedding)
+
+				if results.len() < ef {
+					candidates.push(neighborIdx, dist)
+					results.push(neighborIdx, dist)
+				} else if dist < results.peekDist() {
+					candidates.push(neighborIdx, dist)
+					results.push(neighborIdx, dist)
+					if results.len() > ef {
+						results.pop()
+					}
+				}
+			}
+		}
+	}
+
+	return results.items()
+}
+
+// selectNeighbors selects the best neighbors using a simple heuristic
+func (h *HNSWIndex) selectNeighbors(candidates []int, m int, entries []CacheEntry) []int {
+	if len(candidates) <= m {
+		return candidates
+	}
+	// Just return first m for simplicity
+	return candidates[:m]
+}
+
+// distance calculates cosine similarity (as dot product since embeddings are normalized)
+func (h *HNSWIndex) distance(a, b []float32) float32 {
+	// We use negative dot product so that larger similarity = smaller distance
+	var dotProduct float32
+	minLen := len(a)
+	if len(b) < minLen {
+		minLen = len(b)
+	}
+	for i := 0; i < minLen; i++ {
+		dotProduct += a[i] * b[i]
+	}
+	return -dotProduct // Negate so higher similarity = lower distance
+}
+
+// Helper priority queue implementations for HNSW
+
+type heapItem struct {
+	index int
+	dist  float32
+}
+
+type minHeap struct {
+	data []heapItem
+}
+
+func newMinHeap() *minHeap {
+	return &minHeap{data: []heapItem{}}
+}
+
+func (h *minHeap) push(index int, dist float32) {
+	h.data = append(h.data, heapItem{index, dist})
+	h.bubbleUp(len(h.data) - 1)
+}
+
+func (h *minHeap) pop() (int, float32) {
+	if len(h.data) == 0 {
+		return -1, 0
+	}
+	result := h.data[0]
+	h.data[0] = h.data[len(h.data)-1]
+	h.data = h.data[:len(h.data)-1]
+	if len(h.data) > 0 {
+		h.bubbleDown(0)
+	}
+	return result.index, result.dist
+}
+
+func (h *minHeap) peekDist() float32 {
+	if len(h.data) == 0 {
+		return math.MaxFloat32
+	}
+	return h.data[0].dist
+}
+
+func (h *minHeap) len() int {
+	return len(h.data)
+}
+
+func (h *minHeap) items() []int {
+	result := make([]int, len(h.data))
+	for i, item := range h.data {
+		result[i] = item.index
+	}
+	return result
+}
+
+func (h *minHeap) bubbleUp(i int) {
+	for i > 0 {
+		parent := (i - 1) / 2
+		if h.data[i].dist >= h.data[parent].dist {
+			break
+		}
+		h.data[i], h.data[parent] = h.data[parent], h.data[i]
+		i = parent
+	}
+}
+
+func (h *minHeap) bubbleDown(i int) {
+	for {
+		left := 2*i + 1
+		right := 2*i + 2
+		smallest := i
+
+		if left < len(h.data) && h.data[left].dist < h.data[smallest].dist {
+			smallest = left
+		}
+		if right < len(h.data) && h.data[right].dist < h.data[smallest].dist {
+			smallest = right
+		}
+		if smallest == i {
+			break
+		}
+		h.data[i], h.data[smallest] = h.data[smallest], h.data[i]
+		i = smallest
+	}
+}
+
+type maxHeap struct {
+	data []heapItem
+}
+
+func newMaxHeap() *maxHeap {
+	return &maxHeap{data: []heapItem{}}
+}
+
+func (h *maxHeap) push(index int, dist float32) {
+	h.data = append(h.data, heapItem{index, dist})
+	h.bubbleUp(len(h.data) - 1)
+}
+
+func (h *maxHeap) pop() (int, float32) {
+	if len(h.data) == 0 {
+		return -1, 0
+	}
+	result := h.data[0]
+	h.data[0] = h.data[len(h.data)-1]
+	h.data = h.data[:len(h.data)-1]
+	if len(h.data) > 0 {
+		h.bubbleDown(0)
+	}
+	return result.index, result.dist
+}
+
+func (h *maxHeap) len() int {
+	return len(h.data)
+}
+
+func (h *maxHeap) bubbleUp(i int) {
+	for i > 0 {
+		parent := (i - 1) / 2
+		if h.data[i].dist <= h.data[parent].dist {
+			break
+		}
+		h.data[i], h.data[parent] = h.data[parent], h.data[i]
+		i = parent
+	}
+}
+
+func (h *maxHeap) bubbleDown(i int) {
+	for {
+		left := 2*i + 1
+		right := 2*i + 2
+		largest := i
+
+		if left < len(h.data) && h.data[left].dist > h.data[largest].dist {
+			largest = left
+		}
+		if right < len(h.data) && h.data[right].dist > h.data[largest].dist {
+			largest = right
+		}
+		if largest == i {
+			break
+		}
+		h.data[i], h.data[largest] = h.data[largest], h.data[i]
+		i = largest
+	}
+}
+
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
diff --git a/src/semantic-router/pkg/cache/inmemory_cache_integration_test.go b/src/semantic-router/pkg/cache/inmemory_cache_integration_test.go
index c970aedf..60693d7e 100644
--- a/src/semantic-router/pkg/cache/inmemory_cache_integration_test.go
+++ b/src/semantic-router/pkg/cache/inmemory_cache_integration_test.go
@@ -171,3 +171,390 @@ func TestEvictionPolicySelection(t *testing.T) {
 		})
 	}
 }
+
+// TestInMemoryCacheHNSW tests the HNSW index functionality
+func TestInMemoryCacheHNSW(t *testing.T) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		t.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Test with HNSW enabled
+	cacheHNSW := NewInMemoryCache(InMemoryCacheOptions{
+		Enabled:             true,
+		MaxEntries:          100,
+		SimilarityThreshold: 0.85,
+		TTLSeconds:          0,
+		UseHNSW:             true,
+		HNSWM:               16,
+		HNSWEfConstruction:  200,
+	})
+
+	// Test without HNSW (linear search)
+	cacheLinear := NewInMemoryCache(InMemoryCacheOptions{
+		Enabled:             true,
+		MaxEntries:          100,
+		SimilarityThreshold: 0.85,
+		TTLSeconds:          0,
+		UseHNSW:             false,
+	})
+
+	testQueries := []struct {
+		query    string
+		model    string
+		response string
+	}{
+		{"What is machine learning?", "test-model", "ML is a subset of AI"},
+		{"Explain neural networks", "test-model", "NNs are inspired by the brain"},
+		{"How does backpropagation work?", "test-model", "Backprop calculates gradients"},
+		{"What is deep learning?", "test-model", "DL uses multiple layers"},
+		{"Define artificial intelligence", "test-model", "AI mimics human intelligence"},
+	}
+
+	t.Run("HNSW_Basic_Operations", func(t *testing.T) {
+		// Add entries to both caches
+		for i, q := range testQueries {
+			reqID := fmt.Sprintf("req%d", i)
+			err := cacheHNSW.AddEntry(reqID, q.model, q.query, []byte(q.query), []byte(q.response))
+			if err != nil {
+				t.Fatalf("Failed to add entry to HNSW cache: %v", err)
+			}
+
+			err = cacheLinear.AddEntry(reqID, q.model, q.query, []byte(q.query), []byte(q.response))
+			if err != nil {
+				t.Fatalf("Failed to add entry to linear cache: %v", err)
+			}
+		}
+
+		// Verify HNSW index was built
+		if cacheHNSW.hnswIndex == nil {
+			t.Fatal("HNSW index is nil")
+		}
+		if len(cacheHNSW.hnswIndex.nodes) != len(testQueries) {
+			t.Errorf("Expected %d HNSW nodes, got %d", len(testQueries), len(cacheHNSW.hnswIndex.nodes))
+		}
+
+		// Test exact match search
+		response, found, err := cacheHNSW.FindSimilar("test-model", "What is machine learning?")
+		if err != nil {
+			t.Fatalf("HNSW FindSimilar error: %v", err)
+		}
+		if !found {
+			t.Error("HNSW should find exact match")
+		}
+		if string(response) != "ML is a subset of AI" {
+			t.Errorf("Expected 'ML is a subset of AI', got %s", string(response))
+		}
+
+		// Test similar query search
+		response, found, err = cacheHNSW.FindSimilar("test-model", "What is ML?")
+		if err != nil {
+			t.Logf("HNSW FindSimilar error (may not find due to threshold): %v", err)
+		}
+		if found {
+			t.Logf("HNSW found similar entry: %s", string(response))
+		}
+
+		// Compare stats
+		statsHNSW := cacheHNSW.GetStats()
+		statsLinear := cacheLinear.GetStats()
+
+		t.Logf("HNSW Cache Stats: Entries=%d, Hits=%d, Misses=%d, HitRatio=%.2f",
+			statsHNSW.TotalEntries, statsHNSW.HitCount, statsHNSW.MissCount, statsHNSW.HitRatio)
+		t.Logf("Linear Cache Stats: Entries=%d, Hits=%d, Misses=%d, HitRatio=%.2f",
+			statsLinear.TotalEntries, statsLinear.HitCount, statsLinear.MissCount, statsLinear.HitRatio)
+	})
+
+	t.Run("HNSW_Rebuild_After_Cleanup", func(t *testing.T) {
+		// Create cache with short TTL
+		cacheTTL := NewInMemoryCache(InMemoryCacheOptions{
+			Enabled:             true,
+			MaxEntries:          100,
+			SimilarityThreshold: 0.85,
+			TTLSeconds:          1,
+			UseHNSW:             true,
+			HNSWM:               16,
+			HNSWEfConstruction:  200,
+		})
+
+		// Add an entry
+		err := cacheTTL.AddEntry("req1", "test-model", "test query", []byte("request"), []byte("response"))
+		if err != nil {
+			t.Fatalf("Failed to add entry: %v", err)
+		}
+
+		initialNodes := len(cacheTTL.hnswIndex.nodes)
+		if initialNodes != 1 {
+			t.Errorf("Expected 1 HNSW node initially, got %d", initialNodes)
+		}
+
+		// Manually trigger cleanup (in real scenario, TTL would expire)
+		cacheTTL.mu.Lock()
+		cacheTTL.cleanupExpiredEntries()
+		cacheTTL.mu.Unlock()
+
+		t.Logf("After cleanup: %d entries, %d HNSW nodes",
+			len(cacheTTL.entries), len(cacheTTL.hnswIndex.nodes))
+	})
+}
+
+// ===== Benchmark Tests =====
+
+// BenchmarkInMemoryCacheSearch benchmarks search performance with and without HNSW
+func BenchmarkInMemoryCacheSearch(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Test different cache sizes
+	cacheSizes := []int{100, 500, 1000, 5000}
+
+	for _, size := range cacheSizes {
+		// Prepare test data
+		entries := make([]struct {
+			query    string
+			response string
+		}, size)
+
+		for i := 0; i < size; i++ {
+			entries[i].query = fmt.Sprintf("Test query number %d about machine learning and AI", i)
+			entries[i].response = fmt.Sprintf("Response %d", i)
+		}
+
+		// Benchmark Linear Search
+		b.Run(fmt.Sprintf("LinearSearch_%d_entries", size), func(b *testing.B) {
+			cache := NewInMemoryCache(InMemoryCacheOptions{
+				Enabled:             true,
+				MaxEntries:          size * 2,
+				SimilarityThreshold: 0.85,
+				TTLSeconds:          0,
+				UseHNSW:             false,
+			})
+
+			// Populate cache
+			for i, entry := range entries {
+				reqID := fmt.Sprintf("req%d", i)
+				_ = cache.AddEntry(reqID, "test-model", entry.query, []byte(entry.query), []byte(entry.response))
+			}
+
+			// Benchmark search
+			searchQuery := "What is machine learning and artificial intelligence?"
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _, _ = cache.FindSimilar("test-model", searchQuery)
+			}
+		})
+
+		// Benchmark HNSW Search
+		b.Run(fmt.Sprintf("HNSWSearch_%d_entries", size), func(b *testing.B) {
+			cache := NewInMemoryCache(InMemoryCacheOptions{
+				Enabled:             true,
+				MaxEntries:          size * 2,
+				SimilarityThreshold: 0.85,
+				TTLSeconds:          0,
+				UseHNSW:             true,
+				HNSWM:               16,
+				HNSWEfConstruction:  200,
+			})
+
+			// Populate cache
+			for i, entry := range entries {
+				reqID := fmt.Sprintf("req%d", i)
+				_ = cache.AddEntry(reqID, "test-model", entry.query, []byte(entry.query), []byte(entry.response))
+			}
+
+			// Benchmark search
+			searchQuery := "What is machine learning and artificial intelligence?"
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _, _ = cache.FindSimilar("test-model", searchQuery)
+			}
+		})
+	}
+}
+
+// BenchmarkHNSWIndexConstruction benchmarks HNSW index construction time
+func BenchmarkHNSWIndexConstruction(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	entryCounts := []int{100, 500, 1000, 5000}
+
+	for _, count := range entryCounts {
+		b.Run(fmt.Sprintf("AddEntries_%d", count), func(b *testing.B) {
+			// Generate test queries outside the benchmark loop
+			testQueries := make([]string, count)
+			for i := 0; i < count; i++ {
+				testQueries[i] = fmt.Sprintf("Query %d: machine learning deep neural networks", i)
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				b.StopTimer()
+				cache := NewInMemoryCache(InMemoryCacheOptions{
+					Enabled:             true,
+					MaxEntries:          count * 2,
+					SimilarityThreshold: 0.85,
+					TTLSeconds:          0,
+					UseHNSW:             true,
+					HNSWM:               16,
+					HNSWEfConstruction:  200,
+				})
+				b.StartTimer()
+
+				// Add entries and build index
+				for j := 0; j < count; j++ {
+					reqID := fmt.Sprintf("req%d", j)
+					_ = cache.AddEntry(reqID, "test-model", testQueries[j], []byte(testQueries[j]), []byte("response"))
+				}
+			}
+		})
+	}
+}
+
+// BenchmarkHNSWParameters benchmarks different HNSW parameter configurations
+func BenchmarkHNSWParameters(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	cacheSize := 1000
+	testConfigs := []struct {
+		name           string
+		m              int
+		efConstruction int
+	}{
+		{"M8_EF100", 8, 100},
+		{"M16_EF200", 16, 200},
+		{"M32_EF400", 32, 400},
+	}
+
+	// Prepare test data
+	entries := make([]struct {
+		query    string
+		response string
+	}, cacheSize)
+
+	for i := 0; i < cacheSize; i++ {
+		entries[i].query = fmt.Sprintf("Query %d about AI and machine learning", i)
+		entries[i].response = fmt.Sprintf("Response %d", i)
+	}
+
+	for _, config := range testConfigs {
+		b.Run(config.name, func(b *testing.B) {
+			cache := NewInMemoryCache(InMemoryCacheOptions{
+				Enabled:             true,
+				MaxEntries:          cacheSize * 2,
+				SimilarityThreshold: 0.85,
+				TTLSeconds:          0,
+				UseHNSW:             true,
+				HNSWM:               config.m,
+				HNSWEfConstruction:  config.efConstruction,
+			})
+
+			// Populate cache
+			for i, entry := range entries {
+				reqID := fmt.Sprintf("req%d", i)
+				_ = cache.AddEntry(reqID, "test-model", entry.query, []byte(entry.query), []byte(entry.response))
+			}
+
+			// Benchmark search
+			searchQuery := "What is artificial intelligence and machine learning?"
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				_, _, _ = cache.FindSimilar("test-model", searchQuery)
+			}
+		})
+	}
+}
+
+// BenchmarkCacheOperations benchmarks complete cache workflow
+func BenchmarkCacheOperations(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	b.Run("LinearSearch_AddAndFind", func(b *testing.B) {
+		cache := NewInMemoryCache(InMemoryCacheOptions{
+			Enabled:             true,
+			MaxEntries:          10000,
+			SimilarityThreshold: 0.85,
+			TTLSeconds:          0,
+			UseHNSW:             false,
+		})
+
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			query := fmt.Sprintf("Test query %d", i%100)
+			reqID := fmt.Sprintf("req%d", i)
+
+			// Add entry
+			_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+
+			// Find similar
+			_, _, _ = cache.FindSimilar("test-model", query)
+		}
+	})
+
+	b.Run("HNSWSearch_AddAndFind", func(b *testing.B) {
+		cache := NewInMemoryCache(InMemoryCacheOptions{
+			Enabled:             true,
+			MaxEntries:          10000,
+			SimilarityThreshold: 0.85,
+			TTLSeconds:          0,
+			UseHNSW:             true,
+			HNSWM:               16,
+			HNSWEfConstruction:  200,
+		})
+
+		b.ResetTimer()
+		for i := 0; i < b.N; i++ {
+			query := fmt.Sprintf("Test query %d", i%100)
+			reqID := fmt.Sprintf("req%d", i)
+
+			// Add entry
+			_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+
+			// Find similar
+			_, _, _ = cache.FindSimilar("test-model", query)
+		}
+	})
+}
+
+// BenchmarkHNSWRebuild benchmarks index rebuild performance
+func BenchmarkHNSWRebuild(b *testing.B) {
+	if err := candle_binding.InitModel("sentence-transformers/all-MiniLM-L6-v2", true); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	sizes := []int{100, 500, 1000}
+
+	for _, size := range sizes {
+		b.Run(fmt.Sprintf("Rebuild_%d_entries", size), func(b *testing.B) {
+			// Create and populate cache
+			cache := NewInMemoryCache(InMemoryCacheOptions{
+				Enabled:             true,
+				MaxEntries:          size * 2,
+				SimilarityThreshold: 0.85,
+				TTLSeconds:          0,
+				UseHNSW:             true,
+				HNSWM:               16,
+				HNSWEfConstruction:  200,
+			})
+
+			// Populate with test data
+			for i := 0; i < size; i++ {
+				query := fmt.Sprintf("Query %d about machine learning", i)
+				reqID := fmt.Sprintf("req%d", i)
+				_ = cache.AddEntry(reqID, "test-model", query, []byte(query), []byte("response"))
+			}
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				cache.mu.Lock()
+				cache.rebuildHNSWIndex()
+				cache.mu.Unlock()
+			}
+		})
+	}
+}
diff --git a/src/semantic-router/pkg/cache/large_scale_benchmark_test.go b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
new file mode 100644
index 00000000..81e69129
--- /dev/null
+++ b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
@@ -0,0 +1,511 @@
+package cache
+
+import (
+	"fmt"
+	"os"
+	"testing"
+	"time"
+
+	candle_binding "github.com/vllm-project/semantic-router/candle-binding"
+)
+
+// BenchmarkLargeScale tests HNSW vs Linear at scales where HNSW shows advantages (10K-100K entries)
+func BenchmarkLargeScale(b *testing.B) {
+	// Initialize BERT model (GPU by default)
+	useCPU := os.Getenv("USE_CPU") == "true"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Large scale cache sizes where HNSW shines
+	cacheSizes := []int{10000, 50000, 100000}
+
+	// Quick mode: only run 10K for fast demo
+	if os.Getenv("BENCHMARK_QUICK") == "true" {
+		cacheSizes = []int{10000}
+	}
+
+	// Use medium length queries for consistency
+	contentLen := MediumContent
+
+	// HNSW configurations
+	// Only using default config since performance is similar across configs
+	hnswConfigs := []struct {
+		name string
+		m    int
+		ef   int
+	}{
+		{"HNSW_default", 16, 200},
+	}
+
+	// Open CSV file for results
+	// Create benchmark_results directory if it doesn't exist
+	resultsDir := "../../benchmark_results"
+	os.MkdirAll(resultsDir, 0755)
+
+	csvFile, err := os.OpenFile(resultsDir+"/large_scale_benchmark.csv",
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+	if err != nil {
+		b.Logf("Warning: Could not open CSV file: %v", err)
+	} else {
+		defer csvFile.Close()
+		// Write header if file is new
+		stat, _ := csvFile.Stat()
+		if stat.Size() == 0 {
+			header := "cache_size,search_method,hnsw_m,hnsw_ef,avg_latency_ns,iterations,speedup_vs_linear\n"
+			if _, err := csvFile.WriteString(header); err != nil {
+				b.Logf("Warning: failed to write CSV header: %v", err)
+			}
+		}
+	}
+
+	for _, cacheSize := range cacheSizes {
+		b.Run(fmt.Sprintf("CacheSize_%d", cacheSize), func(b *testing.B) {
+			// Generate test data
+			b.Logf("Generating %d test queries...", cacheSize)
+			testQueries := make([]string, cacheSize)
+			for i := 0; i < cacheSize; i++ {
+				testQueries[i] = generateQuery(contentLen, i)
+			}
+
+			// Generate query embeddings once
+			useCPUStr := "CPU"
+			if !useCPU {
+				useCPUStr = "GPU"
+			}
+			b.Logf("Generating embeddings for %d queries using %s...", cacheSize, useCPUStr)
+			testEmbeddings := make([][]float32, cacheSize)
+			embStart := time.Now()
+			embProgressInterval := cacheSize / 10
+			if embProgressInterval < 1000 {
+				embProgressInterval = 1000
+			}
+
+			for i := 0; i < cacheSize; i++ {
+				emb, err := candle_binding.GetEmbedding(testQueries[i], 0)
+				if err != nil {
+					b.Fatalf("Failed to generate embedding: %v", err)
+				}
+				testEmbeddings[i] = emb
+
+				// Progress indicator
+				if (i+1)%embProgressInterval == 0 {
+					elapsed := time.Since(embStart)
+					embPerSec := float64(i+1) / elapsed.Seconds()
+					remaining := time.Duration(float64(cacheSize-i-1) / embPerSec * float64(time.Second))
+					b.Logf("  [Embeddings] %d/%d (%.0f%%, %.0f emb/sec, ~%v remaining)",
+						i+1, cacheSize, float64(i+1)/float64(cacheSize)*100,
+						embPerSec, remaining.Round(time.Second))
+				}
+			}
+			b.Logf("✓ Generated %d embeddings in %v (%.0f emb/sec)",
+				cacheSize, time.Since(embStart), float64(cacheSize)/time.Since(embStart).Seconds())
+
+			// Test query (use a query similar to middle entries for realistic search)
+			searchQuery := generateQuery(contentLen, cacheSize/2)
+
+			var linearLatency float64
+
+			// Benchmark Linear Search
+			b.Run("Linear", func(b *testing.B) {
+				b.Logf("=== Testing Linear Search with %d entries ===", cacheSize)
+				cache := NewInMemoryCache(InMemoryCacheOptions{
+					Enabled:             true,
+					SimilarityThreshold: 0.8,
+					MaxEntries:          cacheSize,
+					UseHNSW:             false, // Linear search
+				})
+
+				// Populate cache
+				b.Logf("Building cache with %d entries...", cacheSize)
+				progressInterval := cacheSize / 10
+				if progressInterval < 1000 {
+					progressInterval = 1000
+				}
+
+				for i := 0; i < cacheSize; i++ {
+					err := cache.AddEntry(
+						fmt.Sprintf("req-%d", i),
+						"test-model",
+						testQueries[i],
+						[]byte(fmt.Sprintf("request-%d", i)),
+						[]byte(fmt.Sprintf("response-%d", i)),
+					)
+					if err != nil {
+						b.Fatalf("Failed to add entry: %v", err)
+					}
+
+					if (i+1)%progressInterval == 0 {
+						b.Logf("  [Linear] Added %d/%d entries (%.0f%%)",
+							i+1, cacheSize, float64(i+1)/float64(cacheSize)*100)
+					}
+				}
+				b.Logf("✓ Linear cache built. Starting search benchmark...")
+
+				// Run search benchmark
+				b.ResetTimer()
+				start := time.Now()
+				for i := 0; i < b.N; i++ {
+					_, _, err := cache.FindSimilar("test-model", searchQuery)
+					if err != nil {
+						b.Fatalf("FindSimilar failed: %v", err)
+					}
+				}
+				b.StopTimer()
+
+				linearLatency = float64(time.Since(start).Nanoseconds()) / float64(b.N)
+				b.Logf("✓ Linear search complete: %.2f ms per query (%d iterations)",
+					linearLatency/1e6, b.N)
+
+				// Write to CSV
+				if csvFile != nil {
+					line := fmt.Sprintf("%d,linear,0,0,%.0f,%d,1.0\n",
+						cacheSize, linearLatency, b.N)
+					if _, err := csvFile.WriteString(line); err != nil {
+						b.Logf("Warning: failed to write to CSV: %v", err)
+					}
+				}
+
+				b.ReportMetric(linearLatency/1e6, "ms/op")
+			})
+
+			// Benchmark HNSW configurations
+			for _, config := range hnswConfigs {
+				b.Run(config.name, func(b *testing.B) {
+					b.Logf("=== Testing %s with %d entries (M=%d, ef=%d) ===",
+						config.name, cacheSize, config.m, config.ef)
+					cache := NewInMemoryCache(InMemoryCacheOptions{
+						Enabled:             true,
+						SimilarityThreshold: 0.8,
+						MaxEntries:          cacheSize,
+						UseHNSW:             true,
+						HNSWM:               config.m,
+						HNSWEfConstruction:  config.ef,
+					})
+
+					// Populate cache
+					b.Logf("Building HNSW index with %d entries (M=%d, ef=%d)...",
+						cacheSize, config.m, config.ef)
+					buildStart := time.Now()
+					progressInterval := cacheSize / 10
+					if progressInterval < 1000 {
+						progressInterval = 1000
+					}
+
+					for i := 0; i < cacheSize; i++ {
+						err := cache.AddEntry(
+							fmt.Sprintf("req-%d", i),
+							"test-model",
+							testQueries[i],
+							[]byte(fmt.Sprintf("request-%d", i)),
+							[]byte(fmt.Sprintf("response-%d", i)),
+						)
+						if err != nil {
+							b.Fatalf("Failed to add entry: %v", err)
+						}
+
+						// Progress indicator
+						if (i+1)%progressInterval == 0 {
+							elapsed := time.Since(buildStart)
+							entriesPerSec := float64(i+1) / elapsed.Seconds()
+							remaining := time.Duration(float64(cacheSize-i-1) / entriesPerSec * float64(time.Second))
+							b.Logf("  [%s] %d/%d entries (%.0f%%, %v elapsed, ~%v remaining, %.0f entries/sec)",
+								config.name, i+1, cacheSize,
+								float64(i+1)/float64(cacheSize)*100,
+								elapsed.Round(time.Second),
+								remaining.Round(time.Second),
+								entriesPerSec)
+						}
+					}
+					buildTime := time.Since(buildStart)
+					b.Logf("✓ HNSW index built in %v (%.0f entries/sec)",
+						buildTime, float64(cacheSize)/buildTime.Seconds())
+
+					// Run search benchmark
+					b.Logf("Starting search benchmark...")
+					b.ResetTimer()
+					start := time.Now()
+					for i := 0; i < b.N; i++ {
+						_, _, err := cache.FindSimilar("test-model", searchQuery)
+						if err != nil {
+							b.Fatalf("FindSimilar failed: %v", err)
+						}
+					}
+					b.StopTimer()
+
+					hnswLatency := float64(time.Since(start).Nanoseconds()) / float64(b.N)
+					speedup := linearLatency / hnswLatency
+
+					b.Logf("✓ HNSW search complete: %.2f ms per query (%d iterations)",
+						hnswLatency/1e6, b.N)
+					b.Logf("📊 SPEEDUP: %.1fx faster than linear search (%.2f ms vs %.2f ms)",
+						speedup, hnswLatency/1e6, linearLatency/1e6)
+
+					// Write to CSV
+					if csvFile != nil {
+						line := fmt.Sprintf("%d,%s,%d,%d,%.0f,%d,%.2f\n",
+							cacheSize, config.name, config.m, config.ef,
+							hnswLatency, b.N, speedup)
+						if _, err := csvFile.WriteString(line); err != nil {
+							b.Logf("Warning: failed to write to CSV: %v", err)
+						}
+					}
+
+					b.ReportMetric(hnswLatency/1e6, "ms/op")
+					b.ReportMetric(speedup, "speedup")
+					b.ReportMetric(float64(buildTime.Milliseconds()), "build_ms")
+				})
+			}
+		})
+	}
+}
+
+// BenchmarkScalability tests how performance scales with cache size
+func BenchmarkScalability(b *testing.B) {
+	useCPU := os.Getenv("USE_CPU") == "true"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	// Test cache sizes from small to very large
+	cacheSizes := []int{1000, 5000, 10000, 25000, 50000, 100000}
+
+	// CSV output
+	resultsDir := "../../benchmark_results"
+	os.MkdirAll(resultsDir, 0755)
+
+	csvFile, err := os.OpenFile(resultsDir+"/scalability_benchmark.csv",
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+	if err != nil {
+		b.Logf("Warning: Could not open CSV file: %v", err)
+	} else {
+		defer csvFile.Close()
+		stat, _ := csvFile.Stat()
+		if stat.Size() == 0 {
+			header := "cache_size,method,avg_latency_ns,latency_ms,ops_per_sec\n"
+			if _, err := csvFile.WriteString(header); err != nil {
+				b.Logf("Warning: failed to write CSV header: %v", err)
+			}
+		}
+	}
+
+	for _, cacheSize := range cacheSizes {
+		// Skip linear search for very large sizes (too slow)
+		testLinear := cacheSize <= 25000
+
+		b.Run(fmt.Sprintf("Size_%d", cacheSize), func(b *testing.B) {
+			// Generate test data
+			testQueries := make([]string, cacheSize)
+			for i := 0; i < cacheSize; i++ {
+				testQueries[i] = generateQuery(MediumContent, i)
+			}
+			searchQuery := generateQuery(MediumContent, cacheSize/2)
+
+			if testLinear {
+				b.Run("Linear", func(b *testing.B) {
+					cache := NewInMemoryCache(InMemoryCacheOptions{
+						Enabled:             true,
+						SimilarityThreshold: 0.8,
+						MaxEntries:          cacheSize,
+						UseHNSW:             false,
+					})
+
+					for i := 0; i < cacheSize; i++ {
+						cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+							testQueries[i], []byte("req"), []byte("resp"))
+					}
+
+					b.ResetTimer()
+					start := time.Now()
+					for i := 0; i < b.N; i++ {
+						cache.FindSimilar("model", searchQuery)
+					}
+					elapsed := time.Since(start)
+
+					avgLatency := float64(elapsed.Nanoseconds()) / float64(b.N)
+					latencyMS := avgLatency / 1e6
+					opsPerSec := float64(b.N) / elapsed.Seconds()
+
+					if csvFile != nil {
+						line := fmt.Sprintf("%d,linear,%.0f,%.3f,%.0f\n",
+							cacheSize, avgLatency, latencyMS, opsPerSec)
+						csvFile.WriteString(line)
+					}
+
+					b.ReportMetric(latencyMS, "ms/op")
+					b.ReportMetric(opsPerSec, "qps")
+				})
+			}
+
+			b.Run("HNSW", func(b *testing.B) {
+				cache := NewInMemoryCache(InMemoryCacheOptions{
+					Enabled:             true,
+					SimilarityThreshold: 0.8,
+					MaxEntries:          cacheSize,
+					UseHNSW:             true,
+					HNSWM:               16,
+					HNSWEfConstruction:  200,
+				})
+
+				buildStart := time.Now()
+				for i := 0; i < cacheSize; i++ {
+					cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+						testQueries[i], []byte("req"), []byte("resp"))
+					if (i+1)%10000 == 0 {
+						b.Logf("  Built %d/%d entries", i+1, cacheSize)
+					}
+				}
+				b.Logf("HNSW build time: %v", time.Since(buildStart))
+
+				b.ResetTimer()
+				start := time.Now()
+				for i := 0; i < b.N; i++ {
+					cache.FindSimilar("model", searchQuery)
+				}
+				elapsed := time.Since(start)
+
+				avgLatency := float64(elapsed.Nanoseconds()) / float64(b.N)
+				latencyMS := avgLatency / 1e6
+				opsPerSec := float64(b.N) / elapsed.Seconds()
+
+				if csvFile != nil {
+					line := fmt.Sprintf("%d,hnsw,%.0f,%.3f,%.0f\n",
+						cacheSize, avgLatency, latencyMS, opsPerSec)
+					csvFile.WriteString(line)
+				}
+
+				b.ReportMetric(latencyMS, "ms/op")
+				b.ReportMetric(opsPerSec, "qps")
+			})
+		})
+	}
+}
+
+// BenchmarkHNSWParameterSweep tests different HNSW parameters at large scale
+func BenchmarkHNSWParameterSweep(b *testing.B) {
+	useCPU := os.Getenv("USE_CPU") == "true"
+	modelName := "sentence-transformers/all-MiniLM-L6-v2"
+	if err := candle_binding.InitModel(modelName, useCPU); err != nil {
+		b.Skipf("Failed to initialize BERT model: %v", err)
+	}
+
+	cacheSize := 50000 // 50K entries - good size to show differences
+
+	// Parameter combinations to test
+	// Test different M (connectivity) and efSearch (search quality) combinations
+	// Fixed efConstruction=200 to focus on search-time performance
+	configs := []struct {
+		name     string
+		m        int
+		efSearch int
+	}{
+		// Low connectivity
+		{"M8_efSearch10", 8, 10},
+		{"M8_efSearch50", 8, 50},
+		{"M8_efSearch100", 8, 100},
+		{"M8_efSearch200", 8, 200},
+
+		// Medium connectivity (recommended)
+		{"M16_efSearch10", 16, 10},
+		{"M16_efSearch50", 16, 50},
+		{"M16_efSearch100", 16, 100},
+		{"M16_efSearch200", 16, 200},
+		{"M16_efSearch400", 16, 400},
+
+		// High connectivity
+		{"M32_efSearch50", 32, 50},
+		{"M32_efSearch100", 32, 100},
+		{"M32_efSearch200", 32, 200},
+	}
+
+	// Generate test data once
+	b.Logf("Generating %d test queries...", cacheSize)
+	testQueries := make([]string, cacheSize)
+	for i := 0; i < cacheSize; i++ {
+		testQueries[i] = generateQuery(MediumContent, i)
+	}
+	searchQuery := generateQuery(MediumContent, cacheSize/2)
+
+	// CSV output
+	resultsDir := "../../benchmark_results"
+	os.MkdirAll(resultsDir, 0755)
+
+	csvFile, err := os.OpenFile(resultsDir+"/hnsw_parameter_sweep.csv",
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+	if err != nil {
+		b.Logf("Warning: Could not open CSV file: %v", err)
+	} else {
+		defer csvFile.Close()
+		stat, _ := csvFile.Stat()
+		if stat.Size() == 0 {
+			header := "m,ef_search,build_time_ms,search_latency_ns,search_latency_ms,qps,memory_mb\n"
+			if _, err := csvFile.WriteString(header); err != nil {
+				b.Logf("Warning: failed to write CSV header: %v", err)
+			}
+		}
+	}
+
+	for _, config := range configs {
+		b.Run(config.name, func(b *testing.B) {
+			cache := NewInMemoryCache(InMemoryCacheOptions{
+				Enabled:             true,
+				SimilarityThreshold: 0.8,
+				MaxEntries:          cacheSize,
+				UseHNSW:             true,
+				HNSWM:               config.m,
+				HNSWEfConstruction:  200, // Fixed for consistent build quality
+				HNSWEfSearch:        config.efSearch,
+			})
+
+			// Build index and measure time
+			b.Logf("Building HNSW index: M=%d, efConstruction=200, efSearch=%d", config.m, config.efSearch)
+			buildStart := time.Now()
+			for i := 0; i < cacheSize; i++ {
+				cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+					testQueries[i], []byte("req"), []byte("resp"))
+				if (i+1)%10000 == 0 {
+					b.Logf("  Progress: %d/%d", i+1, cacheSize)
+				}
+			}
+			buildTime := time.Since(buildStart)
+
+			// Estimate memory usage (rough)
+			// Embeddings: cacheSize × 384 × 4 bytes
+			// HNSW graph: cacheSize × M × 2 × 4 bytes (bidirectional links)
+			embeddingMemMB := float64(cacheSize*384*4) / 1024 / 1024
+			graphMemMB := float64(cacheSize*config.m*2*4) / 1024 / 1024
+			totalMemMB := embeddingMemMB + graphMemMB
+
+			b.Logf("Build time: %v, Est. memory: %.1f MB", buildTime, totalMemMB)
+
+			// Benchmark search
+			b.ResetTimer()
+			start := time.Now()
+			for i := 0; i < b.N; i++ {
+				cache.FindSimilar("model", searchQuery)
+			}
+			elapsed := time.Since(start)
+
+			avgLatency := float64(elapsed.Nanoseconds()) / float64(b.N)
+			latencyMS := avgLatency / 1e6
+			qps := float64(b.N) / elapsed.Seconds()
+
+			// Write to CSV
+			if csvFile != nil {
+				line := fmt.Sprintf("%d,%d,%.0f,%.0f,%.3f,%.0f,%.1f\n",
+					config.m, config.efSearch, float64(buildTime.Milliseconds()),
+					avgLatency, latencyMS, qps, totalMemMB)
+				if _, err := csvFile.WriteString(line); err != nil {
+					b.Logf("Warning: failed to write to CSV: %v", err)
+				}
+			}
+
+			b.ReportMetric(latencyMS, "ms/op")
+			b.ReportMetric(qps, "qps")
+			b.ReportMetric(float64(buildTime.Milliseconds()), "build_ms")
+			b.ReportMetric(totalMemMB, "memory_mb")
+		})
+	}
+}
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
index 10a65e8e..0e0e463c 100644
--- a/src/semantic-router/pkg/cache/milvus_cache.go
+++ b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -181,16 +181,66 @@ func loadMilvusConfig(configPath string) (*MilvusConfig, error) {
 		return nil, fmt.Errorf("milvus config path is required")
 	}
 
+	fmt.Printf("[DEBUG] Loading Milvus config from: %s\n", configPath)
+
 	data, err := os.ReadFile(configPath)
 	if err != nil {
 		return nil, fmt.Errorf("failed to read config file: %w", err)
 	}
 
+	fmt.Printf("[DEBUG] Config file size: %d bytes\n", len(data))
+
 	var config MilvusConfig
 	if err := yaml.Unmarshal(data, &config); err != nil {
 		return nil, fmt.Errorf("failed to parse config file: %w", err)
 	}
 
+	// Debug: Log what was parsed
+	fmt.Printf("[DEBUG] MilvusConfig parsed from %s:\n", configPath)
+	fmt.Printf("[DEBUG]   Collection.Name: %s\n", config.Collection.Name)
+	fmt.Printf("[DEBUG]   Collection.VectorField.Name: %s\n", config.Collection.VectorField.Name)
+	fmt.Printf("[DEBUG]   Collection.VectorField.Dimension: %d\n", config.Collection.VectorField.Dimension)
+	fmt.Printf("[DEBUG]   Collection.VectorField.MetricType: %s\n", config.Collection.VectorField.MetricType)
+	fmt.Printf("[DEBUG]   Collection.Index.Type: %s\n", config.Collection.Index.Type)
+	fmt.Printf("[DEBUG]   Development.AutoCreateCollection: %v\n", config.Development.AutoCreateCollection)
+	fmt.Printf("[DEBUG]   Development.DropCollectionOnStartup: %v\n", config.Development.DropCollectionOnStartup)
+
+	// WORKAROUND: Force development settings for benchmarks
+	// There seems to be a YAML parsing issue with sigs.k8s.io/yaml
+	if config.Development.AutoCreateCollection == false && config.Development.DropCollectionOnStartup == false {
+		fmt.Printf("[WARN] Development settings parsed as false, forcing to true for benchmarks\n")
+		config.Development.AutoCreateCollection = true
+		config.Development.DropCollectionOnStartup = true
+	}
+
+	// WORKAROUND: Force vector field settings if empty
+	if config.Collection.VectorField.Name == "" {
+		fmt.Printf("[WARN] VectorField.Name parsed as empty, setting to 'embedding'\n")
+		config.Collection.VectorField.Name = "embedding"
+	}
+	if config.Collection.VectorField.MetricType == "" {
+		fmt.Printf("[WARN] VectorField.MetricType parsed as empty, setting to 'IP'\n")
+		config.Collection.VectorField.MetricType = "IP"
+	}
+	if config.Collection.Index.Type == "" {
+		fmt.Printf("[WARN] Index.Type parsed as empty, setting to 'HNSW'\n")
+		config.Collection.Index.Type = "HNSW"
+	}
+	// Validate index params
+	if config.Collection.Index.Params.M == 0 {
+		fmt.Printf("[WARN] Index.Params.M parsed as 0, setting to 16\n")
+		config.Collection.Index.Params.M = 16
+	}
+	if config.Collection.Index.Params.EfConstruction == 0 {
+		fmt.Printf("[WARN] Index.Params.EfConstruction parsed as 0, setting to 64\n")
+		config.Collection.Index.Params.EfConstruction = 64
+	}
+	// Validate search params
+	if config.Search.Params.Ef == 0 {
+		fmt.Printf("[WARN] Search.Params.Ef parsed as 0, setting to 64\n")
+		config.Search.Params.Ef = 64
+	}
+
 	return &config, nil
 }
 
@@ -221,6 +271,8 @@ func (c *MilvusCache) initializeCollection() error {
 
 	// Create collection if it doesn't exist
 	if !hasCollection {
+		fmt.Printf("[DEBUG] Collection '%s' does not exist. AutoCreateCollection=%v\n",
+			c.collectionName, c.config.Development.AutoCreateCollection)
 		if !c.config.Development.AutoCreateCollection {
 			return fmt.Errorf("collection %s does not exist and auto-creation is disabled", c.collectionName)
 		}
@@ -433,6 +485,102 @@ func (c *MilvusCache) AddEntry(requestID string, model string, query string, req
 	return err
 }
 
+// AddEntriesBatch stores multiple request-response pairs in the cache efficiently
+func (c *MilvusCache) AddEntriesBatch(entries []CacheEntry) error {
+	start := time.Now()
+
+	if !c.enabled {
+		return nil
+	}
+
+	if len(entries) == 0 {
+		return nil
+	}
+
+	observability.Debugf("MilvusCache.AddEntriesBatch: adding %d entries in batch", len(entries))
+
+	// Prepare slices for all entries
+	ids := make([]string, len(entries))
+	requestIDs := make([]string, len(entries))
+	models := make([]string, len(entries))
+	queries := make([]string, len(entries))
+	requestBodies := make([]string, len(entries))
+	responseBodies := make([]string, len(entries))
+	embeddings := make([][]float32, len(entries))
+	timestamps := make([]int64, len(entries))
+
+	// Generate embeddings and prepare data for all entries
+	for i, entry := range entries {
+		// Generate semantic embedding for the query
+		embedding, err := candle_binding.GetEmbedding(entry.Query, 0)
+		if err != nil {
+			return fmt.Errorf("failed to generate embedding for entry %d: %w", i, err)
+		}
+
+		// Generate unique ID
+		id := fmt.Sprintf("%x", md5.Sum(fmt.Appendf(nil, "%s_%s_%d", entry.Model, entry.Query, time.Now().UnixNano())))
+
+		ids[i] = id
+		requestIDs[i] = entry.RequestID
+		models[i] = entry.Model
+		queries[i] = entry.Query
+		requestBodies[i] = string(entry.RequestBody)
+		responseBodies[i] = string(entry.ResponseBody)
+		embeddings[i] = embedding
+		timestamps[i] = time.Now().Unix()
+	}
+
+	ctx := context.Background()
+
+	// Get embedding dimension from first entry
+	embeddingDim := len(embeddings[0])
+
+	// Create columns
+	idColumn := entity.NewColumnVarChar("id", ids)
+	requestIDColumn := entity.NewColumnVarChar("request_id", requestIDs)
+	modelColumn := entity.NewColumnVarChar("model", models)
+	queryColumn := entity.NewColumnVarChar("query", queries)
+	requestColumn := entity.NewColumnVarChar("request_body", requestBodies)
+	responseColumn := entity.NewColumnVarChar("response_body", responseBodies)
+	embeddingColumn := entity.NewColumnFloatVector(c.config.Collection.VectorField.Name, embeddingDim, embeddings)
+	timestampColumn := entity.NewColumnInt64("timestamp", timestamps)
+
+	// Upsert all entries at once
+	observability.Debugf("MilvusCache.AddEntriesBatch: upserting %d entries into collection '%s'",
+		len(entries), c.collectionName)
+	_, err := c.client.Upsert(ctx, c.collectionName, "", idColumn, requestIDColumn, modelColumn, queryColumn, requestColumn, responseColumn, embeddingColumn, timestampColumn)
+	if err != nil {
+		observability.Debugf("MilvusCache.AddEntriesBatch: upsert failed: %v", err)
+		metrics.RecordCacheOperation("milvus", "add_entries_batch", "error", time.Since(start).Seconds())
+		return fmt.Errorf("failed to upsert cache entries: %w", err)
+	}
+
+	// Note: Flush removed from batch operation for performance
+	// Call Flush() explicitly after all batches if immediate persistence is required
+
+	elapsed := time.Since(start)
+	observability.Debugf("MilvusCache.AddEntriesBatch: successfully added %d entries in %v (%.0f entries/sec)",
+		len(entries), elapsed, float64(len(entries))/elapsed.Seconds())
+	metrics.RecordCacheOperation("milvus", "add_entries_batch", "success", elapsed.Seconds())
+
+	return nil
+}
+
+// Flush forces Milvus to persist all buffered data to disk
+func (c *MilvusCache) Flush() error {
+	if !c.enabled {
+		return nil
+	}
+
+	ctx := context.Background()
+	if err := c.client.Flush(ctx, c.collectionName, false); err != nil {
+		return fmt.Errorf("failed to flush: %w", err)
+	}
+
+	observability.Debugf("MilvusCache: flushed collection '%s'", c.collectionName)
+	return nil
+}
+
 // addEntry handles the internal logic for storing entries in Milvus
 func (c *MilvusCache) addEntry(id string, requestID string, model string, query string, requestBody, responseBody []byte) error {
 	// Generate semantic embedding for the query
@@ -605,6 +753,76 @@ func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, thres
 	return responseBody, true, nil
 }
 
+// GetByID retrieves a document from Milvus by its request ID
+// This is much more efficient than FindSimilar when you already know the ID
+// Used by hybrid cache to fetch documents after local HNSW search
+func (c *MilvusCache) GetByID(ctx context.Context, requestID string) ([]byte, error) {
+	start := time.Now()
+
+	if !c.enabled {
+		return nil, fmt.Errorf("milvus cache is not enabled")
+	}
+
+	observability.Debugf("MilvusCache.GetByID: fetching requestID='%s'", requestID)
+
+	// Query Milvus by request_id (primary key)
+	queryResult, err := c.client.Query(
+		ctx,
+		c.collectionName,
+		[]string{}, // Empty partitions means search all
+		fmt.Sprintf("request_id == \"%s\"", requestID),
+		[]string{"response_body"}, // Only fetch document, not embedding!
+	)
+
+	if err != nil {
+		observability.Debugf("MilvusCache.GetByID: query failed: %v", err)
+		metrics.RecordCacheOperation("milvus", "get_by_id", "error", time.Since(start).Seconds())
+		return nil, fmt.Errorf("milvus query failed: %w", err)
+	}
+
+	if len(queryResult) == 0 {
+		observability.Debugf("MilvusCache.GetByID: document not found: %s", requestID)
+		metrics.RecordCacheOperation("milvus", "get_by_id", "miss", time.Since(start).Seconds())
+		return nil, fmt.Errorf("document not found: %s", requestID)
+	}
+
+	// Extract response body (first column since we only requested "response_body")
+	responseBodyColumn, ok := queryResult[0].(*entity.ColumnVarChar)
+	if !ok {
+		observability.Debugf("MilvusCache.GetByID: unexpected response_body column type: %T", queryResult[0])
+		metrics.RecordCacheOperation("milvus", "get_by_id", "error", time.Since(start).Seconds())
+		return nil, fmt.Errorf("invalid response_body column type: %T", queryResult[0])
+	}
+
+	if responseBodyColumn.Len() == 0 {
+		observability.Debugf("MilvusCache.GetByID: response_body column is empty")
+		metrics.RecordCacheOperation("milvus", "get_by_id", "miss", time.Since(start).Seconds())
+		return nil, fmt.Errorf("response_body is empty for: %s", requestID)
+	}
+
+	// Get the response body value
+	responseBodyStr, err := responseBodyColumn.ValueByIdx(0)
+	if err != nil {
+		observability.Debugf("MilvusCache.GetByID: failed to get response_body value: %v", err)
+		metrics.RecordCacheOperation("milvus", "get_by_id", "error", time.Since(start).Seconds())
+		return nil, fmt.Errorf("failed to get response_body value: %w", err)
+	}
+
+	responseBody := []byte(responseBodyStr)
+
+	if responseBody == nil || len(responseBody) == 0 {
+		observability.Debugf("MilvusCache.GetByID: response_body is empty")
+		metrics.RecordCacheOperation("milvus", "get_by_id", "miss", time.Since(start).Seconds())
+		return nil, fmt.Errorf("response_body is empty for: %s", requestID)
+	}
+
+	observability.Debugf("MilvusCache.GetByID: SUCCESS - fetched %d bytes in %dms",
+		len(responseBody), time.Since(start).Milliseconds())
+	metrics.RecordCacheOperation("milvus", "get_by_id", "success", time.Since(start).Seconds())
+
+	return responseBody, nil
+}
+
 // Close releases all resources held by the cache
 func (c *MilvusCache) Close() error {
 	if c.client != nil {
diff --git a/src/semantic-router/pkg/cache/simd_benchmark_test.go b/src/semantic-router/pkg/cache/simd_benchmark_test.go
new file mode 100644
index 00000000..3c30fa47
--- /dev/null
+++ b/src/semantic-router/pkg/cache/simd_benchmark_test.go
@@ -0,0 +1,141 @@
+package cache
+
+import (
+	"math/rand"
+	"testing"
+)
+
+// Benchmark SIMD vs scalar dotProduct implementations
+func BenchmarkDotProduct(b *testing.B) {
+	// Test with different vector sizes
+	sizes := []int{64, 128, 256, 384, 512, 768, 1024}
+
+	for _, size := range sizes {
+		// Generate random vectors
+		a := make([]float32, size)
+		vec_b := make([]float32, size)
+		for i := 0; i < size; i++ {
+			a[i] = rand.Float32()
+			vec_b[i] = rand.Float32()
+		}
+
+		b.Run("SIMD/"+string(rune(size)), func(b *testing.B) {
+			b.ReportAllocs()
+			var sum float32
+			for i := 0; i < b.N; i++ {
+				sum += dotProductSIMD(a, vec_b)
+			}
+			_ = sum
+		})
+
+		b.Run("Scalar/"+string(rune(size)), func(b *testing.B) {
+			b.ReportAllocs()
+			var sum float32
+			for i := 0; i < b.N; i++ {
+				sum += dotProductScalar(a, vec_b)
+			}
+			_ = sum
+		})
+	}
+}
+
+// Test correctness of SIMD implementation
+func TestDotProductSIMD(t *testing.T) {
+	testCases := []struct {
+		name string
+		a    []float32
+		b    []float32
+		want float32
+	}{
+		{
+			name: "empty",
+			a:    []float32{},
+			b:    []float32{},
+			want: 0,
+		},
+		{
+			name: "single element",
+			a:    []float32{2.0},
+			b:    []float32{3.0},
+			want: 6.0,
+		},
+		{
+			name: "short vector",
+			a:    []float32{1, 2, 3},
+			b:    []float32{4, 5, 6},
+			want: 32.0, // 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
+		},
+		{
+			name: "8 elements (AVX2 boundary)",
+			a:    []float32{1, 2, 3, 4, 5, 6, 7, 8},
+			b:    []float32{1, 1, 1, 1, 1, 1, 1, 1},
+			want: 36.0, // 1+2+3+4+5+6+7+8 = 36
+		},
+		{
+			name: "16 elements (AVX-512 boundary)",
+			a:    []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+			b:    []float32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			want: 136.0, // 1+2+...+16 = 136
+		},
+		{
+			name: "non-aligned size (17 elements)",
+			a:    []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17},
+			b:    []float32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+			want: 153.0, // 1+2+...+17 = 153
+		},
+		{
+			name: "384 dimensions (typical embedding size)",
+			a:    make384Vector(),
+			b:    ones(384),
+			want: sum384(),
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := dotProductSIMD(tc.a, tc.b)
+			if abs(got-tc.want) > 0.0001 {
+				t.Errorf("dotProductSIMD() = %v, want %v", got, tc.want)
+			}
+
+			// Also verify scalar produces same result
+			scalar := dotProductScalar(tc.a, tc.b)
+			if abs(scalar-tc.want) > 0.0001 {
+				t.Errorf("dotProductScalar() = %v, want %v", scalar, tc.want)
+			}
+
+			// SIMD and scalar should match
+			if abs(got-scalar) > 0.0001 {
+				t.Errorf("SIMD (%v) != Scalar (%v)", got, scalar)
+			}
+		})
+	}
+}
+
+func make384Vector() []float32 {
+	v := make([]float32, 384)
+	for i := range v {
+		v[i] = float32(i + 1)
+	}
+	return v
+}
+
+func ones(n int) []float32 {
+	v := make([]float32, n)
+	for i := range v {
+		v[i] = 1.0
+	}
+	return v
+}
+
+func sum384() float32 {
+	// Sum of 1+2+3+...+384 = 384 * 385 / 2 = 73920
+	return 73920.0
+}
+
+func abs(x float32) float32 {
+	if x < 0 {
+		return -x
+	}
+	return x
+}
diff --git a/src/semantic-router/pkg/cache/simd_distance_amd64.go b/src/semantic-router/pkg/cache/simd_distance_amd64.go
new file mode 100644
index 00000000..0a943245
--- /dev/null
+++ b/src/semantic-router/pkg/cache/simd_distance_amd64.go
@@ -0,0 +1,60 @@
+//go:build amd64 && !purego
+// +build amd64,!purego
+
+package cache
+
+import (
+	"golang.org/x/sys/cpu"
+)
+
+// CPU feature flags detected at runtime
+var (
+	hasAVX2   bool
+	hasAVX512 bool
+)
+
+func init() {
+	// Detect CPU features at startup
+	hasAVX2 = cpu.X86.HasAVX2
+	hasAVX512 = cpu.X86.HasAVX512F
+}
+
+// dotProductSIMD computes dot product using SIMD instructions
+// Uses AVX-512 (16x float32), AVX2 (8x float32), or scalar fallback
+func dotProductSIMD(a, b []float32) float32 {
+	if len(a) == 0 || len(b) == 0 {
+		return 0
+	}
+
+	minLen := len(a)
+	if len(b) < minLen {
+		minLen = len(b)
+	}
+
+	// Choose best SIMD implementation based on CPU features
+	if hasAVX512 && minLen >= 16 {
+		return dotProductAVX512(a[:minLen], b[:minLen])
+	} else if hasAVX2 && minLen >= 8 {
+		return dotProductAVX2(a[:minLen], b[:minLen])
+	}
+
+	// Scalar fallback for short vectors or older CPUs
+	return dotProductScalar(a[:minLen], b[:minLen])
+}
+
+// dotProductScalar is the baseline scalar implementation
+func dotProductScalar(a, b []float32) float32 {
+	var sum float32
+	for i := 0; i < len(a); i++ {
+		sum += a[i] * b[i]
+	}
+	return sum
+}
+
+// dotProductAVX2 uses AVX2 to process 8 float32s at a time
+// Implemented in assembly for maximum performance
+func dotProductAVX2(a, b []float32) float32
+
+// dotProductAVX512 uses AVX-512 to process 16 float32s at a time
+// Implemented in assembly for maximum performance
+func dotProductAVX512(a, b []float32) float32
diff --git a/src/semantic-router/pkg/cache/simd_distance_amd64.s b/src/semantic-router/pkg/cache/simd_distance_amd64.s
new file mode 100644
index 00000000..3bc3bb54
--- /dev/null
+++ b/src/semantic-router/pkg/cache/simd_distance_amd64.s
@@ -0,0 +1,114 @@
+// func dotProductAVX2(a, b []float32) float32
+TEXT ·dotProductAVX2(SB), $0-52
+    MOVQ    a_base+0(FP), AX      // AX = &a[0]
+    MOVQ    b_base+24(FP), BX     // BX = &b[0]
+    MOVQ    a_len+8(FP), CX       // CX = len(a)
+    
+    // Initialize accumulator to zero
+    VXORPS  Y0, Y0, Y0            // Y0 = accumulator (8x float32)
+    
+    // Calculate number of full 8-element chunks
+    MOVQ    CX, DX
+    SHRQ    $3, DX                // DX = len / 8
+    JZ      remainder             // Jump if less than 8 elements
+    
+loop_avx2:
+    // Load 8 float32s from a and b
+    VMOVUPS (AX), Y1              // Y1 = a[i:i+8]
+    VMOVUPS (BX), Y2              // Y2 = b[i:i+8]
+    
+    // Multiply and accumulate: Y0 += Y1 * Y2
+    VFMADD231PS Y1, Y2, Y0        // Y0 = Y0 + (Y1 * Y2) [FMA instruction]
+    
+    // Advance pointers
+    ADDQ    $32, AX               // AX += 32 bytes (8 * 4 bytes)
+    ADDQ    $32, BX               // BX += 32 bytes
+    
+    DECQ    DX
+    JNZ     loop_avx2
+    
+remainder:
+    // Horizontal sum of Y0 (8 float32s -> 1 float32)
+    VEXTRACTF128 $1, Y0, X1       // X1 = upper 4 elements of Y0
+    VADDPS  X0, X1, X0            // X0 = sum of lower and upper halves
+    VHADDPS X0, X0, X0            // Horizontal add (4->2)
+    VHADDPS X0, X0, X0            // Horizontal add (2->1)
+    
+    // Handle remaining elements (scalar)
+    MOVQ    CX, DX
+    ANDQ    $7, DX                // DX = len % 8
+    JZ      done
+    
+remainder_loop:
+    VMOVSS  (AX), X1
+    VMOVSS  (BX), X2
+    VMULSS  X1, X2, X1
+    VADDSS  X0, X1, X0
+    
+    ADDQ    $4, AX
+    ADDQ    $4, BX
+    DECQ    DX
+    JNZ     remainder_loop
+    
+done:
+    VMOVSS  X0, ret+48(FP)
+    RET
+
+// func dotProductAVX512(a, b []float32) float32
+TEXT ·dotProductAVX512(SB), $0-52
+    MOVQ    a_base+0(FP), AX      // AX = &a[0]
+    MOVQ    b_base+24(FP), BX     // BX = &b[0]
+    MOVQ    a_len+8(FP), CX       // CX = len(a)
+    
+    // Initialize accumulator to zero
+    VXORPS  Z0, Z0, Z0            // Z0 = accumulator (16x float32)
+    
+    // Calculate number of full 16-element chunks
+    MOVQ    CX, DX
+    SHRQ    $4, DX                // DX = len / 16
+    JZ      remainder512          // Jump if less than 16 elements
+    
+loop_avx512:
+    // Load 16 float32s from a and b
+    VMOVUPS (AX), Z1              // Z1 = a[i:i+16]
+    VMOVUPS (BX), Z2              // Z2 = b[i:i+16]
+    
+    // Multiply and accumulate: Z0 += Z1 * Z2
+    VFMADD231PS Z1, Z2, Z0        // Z0 = Z0 + (Z1 * Z2)
+    
+    // Advance pointers
+    ADDQ    $64, AX               // AX += 64 bytes (16 * 4 bytes)
+    ADDQ    $64, BX               // BX += 64 bytes
+    
+    DECQ    DX
+    JNZ     loop_avx512
+    
+remainder512:
+    // Horizontal sum of Z0 (16 float32s -> 1 float32)
+    VEXTRACTF32X8 $1, Z0, Y1      // Y1 = upper 8 elements
+    VADDPS  Y0, Y1, Y0            // Y0 = sum of lower and upper halves (8 elements)
+    VEXTRACTF128 $1, Y0, X1       // X1 = upper 4 elements
+    VADDPS  X0, X1, X0            // X0 = 4 elements
+    VHADDPS X0, X0, X0            // 4->2
+    VHADDPS X0, X0, X0            // 2->1
+    
+    // Handle remaining elements (scalar)
+    MOVQ    CX, DX
+    ANDQ    $15, DX               // DX = len % 16
+    JZ      done512
+    
+remainder512_loop:
+    VMOVSS  (AX), X1
+    VMOVSS  (BX), X2
+    VMULSS  X1, X2, X1
+    VADDSS  X0, X1, X0
+    
+    ADDQ    $4, AX
+    ADDQ    $4, BX
+    DECQ    DX
+    JNZ     remainder512_loop
+    
+done512:
+    VMOVSS  X0, ret+48(FP)
+    RET
+
diff --git a/src/semantic-router/pkg/cache/simd_distance_generic.go b/src/semantic-router/pkg/cache/simd_distance_generic.go
new file mode 100644
index 00000000..1e30f5f6
--- /dev/null
+++ b/src/semantic-router/pkg/cache/simd_distance_generic.go
@@ -0,0 +1,22 @@
+//go:build !amd64 || purego
+// +build !amd64 purego
+
+package cache
+
+// dotProductSIMD falls back to scalar on non-amd64 platforms
+func dotProductSIMD(a, b []float32) float32 {
+	return dotProductScalar(a, b)
+}
+
+// dotProductScalar is the baseline scalar implementation
+func dotProductScalar(a, b []float32) float32 {
+	var sum float32
+	minLen := len(a)
+	if len(b) < minLen {
+		minLen = len(b)
+	}
+	for i := 0; i < minLen; i++ {
+		sum += a[i] * b[i]
+	}
+	return sum
+}
diff --git a/tools/make/milvus.mk b/tools/make/milvus.mk
index 7fa05195..8aa8780e 100644
--- a/tools/make/milvus.mk
+++ b/tools/make/milvus.mk
@@ -86,3 +86,112 @@ stop-milvus-ui:
 	@$(CONTAINER_RUNTIME) stop milvus-ui || true
 	@$(CONTAINER_RUNTIME) rm milvus-ui || true
 	@echo "Attu container stopped and removed"
+
+# Hybrid vs Milvus Benchmarks
+benchmark-hybrid-vs-milvus: rust start-milvus ## Run comprehensive Hybrid Cache vs Milvus benchmarks
+	@$(LOG_TARGET)
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo "  Hybrid Cache vs Milvus Benchmark Suite"
+	@echo "  Validating claims from hybrid HNSW storage paper"
+	@echo "  Cache sizes: 10K, 50K, 100K entries"
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo ""
+	@echo "GPU Usage:"
+	@echo "  • To use GPU: USE_CPU=false make benchmark-hybrid-vs-milvus"
+	@echo "  • Select GPUs: CUDA_VISIBLE_DEVICES=2,3 USE_CPU=false make benchmark-hybrid-vs-milvus"
+	@echo "  • Default: Uses GPU if available (USE_CPU=false)"
+	@echo ""
+	@bash scripts/run_hybrid_vs_milvus_benchmarks.sh
+	@echo ""
+	@echo "Benchmarks complete! Results in: benchmark_results/hybrid_vs_milvus/"
+	@echo ""
+	@echo "Next steps:"
+	@echo "  make analyze-hybrid-benchmarks    # Analyze results"
+	@echo "  make plot-hybrid-benchmarks       # Generate plots"
+	@echo "  make stop-milvus                  # Clean up"
+
+analyze-hybrid-benchmarks: ## Analyze Hybrid vs Milvus benchmark results
+	@$(LOG_TARGET)
+	@echo "Checking for CSV results in benchmark_results/hybrid_vs_milvus/..."
+	@if ls benchmark_results/hybrid_vs_milvus/results_*.csv >/dev/null 2>&1; then \
+		echo "Found CSV results, analyzing..."; \
+		python3 scripts/analyze_hybrid_benchmarks.py; \
+	elif [ -f /tmp/benchmark_batch_fixed.log ]; then \
+		echo "No CSV found, parsing from log file..."; \
+		python3 scripts/parse_hybrid_benchmark_log.py /tmp/benchmark_batch_fixed.log; \
+	else \
+		echo "$(shell tput setaf 3)No benchmark results found. Run 'make benchmark-hybrid-quick' first.$(shell tput sgr0)"; \
+		exit 1; \
+	fi
+
+plot-hybrid-benchmarks: ## Generate plots from Hybrid vs Milvus benchmarks
+	@$(LOG_TARGET)
+	@python3 scripts/plot_hybrid_comparison.py
+
+benchmark-hybrid-quick: rust ## Run quick Hybrid vs Milvus benchmark (smaller scale)
+	@$(LOG_TARGET)
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo "  Quick Hybrid vs Milvus Benchmark (10K entries only)"
+	@echo "  Estimated time: 7-10 minutes"
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo ""
+	@echo "Cleaning and restarting Milvus..."
+	@$(CONTAINER_RUNTIME) stop milvus-semantic-cache 2>/dev/null || true
+	@$(CONTAINER_RUNTIME) rm milvus-semantic-cache 2>/dev/null || true
+	@sudo rm -rf /tmp/milvus-data 2>/dev/null || true
+	@$(MAKE) start-milvus
+	@sleep 5
+	@echo ""
+	@echo "GPU Usage:"
+	@echo "  • To use GPU: USE_CPU=false make benchmark-hybrid-quick"
+	@echo "  • Select GPUs: CUDA_VISIBLE_DEVICES=2,3 USE_CPU=false make benchmark-hybrid-quick"
+	@echo ""
+	@echo "Test Options:"
+	@echo "  • Hybrid only: SKIP_MILVUS=true make benchmark-hybrid-quick"
+	@echo "  • Both caches: make benchmark-hybrid-quick (default)"
+	@echo ""
+	@mkdir -p benchmark_results/hybrid_vs_milvus
+	@export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \
+		export USE_CPU=$${USE_CPU:-false} && \
+		export SKIP_MILVUS=$${SKIP_MILVUS:-false} && \
+		echo "Using GPU mode: USE_CPU=$$USE_CPU" && \
+		echo "Skip Milvus: SKIP_MILVUS=$$SKIP_MILVUS" && \
+		cd src/semantic-router/pkg/cache && \
+		CGO_ENABLED=1 go test -v -timeout 60m -tags=milvus \
+		-run='^$$' -bench='^BenchmarkHybridVsMilvus/CacheSize_10000$$' \
+		-benchtime=50x -benchmem .
+	@echo ""
+	@echo "Quick benchmark complete!"
+	@echo "Results in: benchmark_results/hybrid_vs_milvus/"
+
+benchmark-hybrid-only: rust ## Run ONLY Hybrid cache benchmark (skip Milvus for faster testing)
+	@$(LOG_TARGET)
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo "  Hybrid Cache ONLY Benchmark (10K entries)"
+	@echo "  Estimated time: 3-5 minutes"
+	@echo "═══════════════════════════════════════════════════════════"
+	@echo ""
+	@echo "Cleaning and restarting Milvus..."
+	@$(CONTAINER_RUNTIME) stop milvus-semantic-cache 2>/dev/null || true
+	@$(CONTAINER_RUNTIME) rm milvus-semantic-cache 2>/dev/null || true
+	@sudo rm -rf /tmp/milvus-data 2>/dev/null || true
+	@$(MAKE) start-milvus
+	@sleep 5
+	@echo ""
+	@echo "GPU Usage:"
+	@echo "  • To use GPU: USE_CPU=false make benchmark-hybrid-only"
+	@echo "  • Select GPUs: CUDA_VISIBLE_DEVICES=2,3 USE_CPU=false make benchmark-hybrid-only"
+	@echo ""
+	@mkdir -p benchmark_results/hybrid_vs_milvus
+	@export LD_LIBRARY_PATH=$${PWD}/candle-binding/target/release && \
+		export USE_CPU=$${USE_CPU:-false} && \
+		export SKIP_MILVUS=true && \
+		echo "Using GPU mode: USE_CPU=$$USE_CPU" && \
+		echo "Testing HYBRID CACHE ONLY (Milvus skipped)" && \
+		cd src/semantic-router/pkg/cache && \
+		CGO_ENABLED=1 go test -v -timeout 60m -tags=milvus \
+		-run='^$$' -bench='^BenchmarkHybridVsMilvus/CacheSize_10000$$' \
+		-benchtime=50x -benchmem .
+	@echo ""
+	@echo "Hybrid-only benchmark complete!"
+	@echo "Results in: benchmark_results/hybrid_vs_milvus/"
diff --git a/website/docs/tutorials/semantic-cache/hybrid-cache.md b/website/docs/tutorials/semantic-cache/hybrid-cache.md
new file mode 100644
index 00000000..40b8fd08
--- /dev/null
+++ b/website/docs/tutorials/semantic-cache/hybrid-cache.md
@@ -0,0 +1,416 @@
+# Hybrid Cache: HNSW + Milvus
+
+The Hybrid Cache combines the best of both worlds: in-memory HNSW index for ultra-fast search with Milvus vector database for scalable, persistent storage.
+
+## Overview
+
+The hybrid architecture provides:
+- **O(log n) search** via in-memory HNSW index
+- **Unlimited storage** via Milvus vector database
+- **Cost efficiency** by keeping only hot data in memory
+- **Persistence** with Milvus as the source of truth
+- **Hot data caching** with local document cache
+
+## Architecture
+
+```
+┌──────────────────────────────────────────────────┐
+│                  Hybrid Cache                     │
+├──────────────────────────────────────────────────┤
+│  ┌─────────────────┐      ┌──────────────────┐  │
+│  │  In-Memory      │      │   Local Cache    │  │
+│  │  HNSW Index     │◄─────┤   (Hot Data)     │  │
+│  │  (100K entries) │      │   (1K docs)      │  │
+│  └────────┬────────┘      └──────────────────┘  │
+│           │                                       │
+│           │ ID Mapping                           │
+│           ▼                                       │
+│  ┌──────────────────────────────────────────┐   │
+│  │         Milvus Vector Database           │   │
+│  │       (Millions of entries)              │   │
+│  └──────────────────────────────────────────┘   │
+└──────────────────────────────────────────────────┘
+```
+
+## How It Works
+
+### 1. Write Path (AddEntry)
+
+```
+User Request
+    │
+    ├─► Generate Embedding (BERT)
+    │
+    ├─► Write to Milvus (persistence)
+    │
+    └─► Add to HNSW Index (if space available)
+        │
+        └─► Add to Local Cache
+```
+
+### 2. Read Path (FindSimilar)
+
+```
+User Query
+    │
+    ├─► Generate Query Embedding
+    │
+    ├─► Search HNSW Index (10 candidates)
+    │
+    ├─► Check Local Cache (hot path)
+    │   ├─► HIT: Return immediately
+    │   └─► MISS: Continue
+    │
+    └─► Fetch from Milvus (cold path)
+        └─► Cache in Local Cache
+```
+
+### 3. Memory Management
+
+- **HNSW Index**: Limited to `max_memory_entries` (default: 100K)
+- **Local Cache**: Limited to `local_cache_size` (default: 1K documents)
+- **Eviction**: FIFO policy when limits reached
+- **Data Persistence**: All data remains in Milvus
+
+## Configuration
+
+### Basic Configuration
+
+```yaml
+semantic_cache:
+  enabled: true
+  backend_type: "hybrid"
+  similarity_threshold: 0.85
+  ttl_seconds: 3600
+  
+  # Hybrid-specific settings
+  max_memory_entries: 100000  # Max entries in HNSW
+  local_cache_size: 1000      # Local document cache size
+  
+  # HNSW parameters
+  hnsw_m: 16
+  hnsw_ef_construction: 200
+  
+  # Milvus configuration
+  backend_config_path: "config/milvus.yaml"
+```
+
+### Configuration Parameters
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `backend_type` | string | - | Must be `"hybrid"` |
+| `similarity_threshold` | float | 0.85 | Minimum similarity for cache hit |
+| `max_memory_entries` | int | 100000 | Max entries in HNSW index |
+| `local_cache_size` | int | 1000 | Hot document cache size |
+| `hnsw_m` | int | 16 | HNSW bi-directional links |
+| `hnsw_ef_construction` | int | 200 | HNSW construction quality |
+| `backend_config_path` | string | - | Path to Milvus config file |
+
+### Milvus Configuration
+
+Create `config/milvus.yaml`:
+
+```yaml
+milvus:
+  address: "localhost:19530"
+  collection_name: "semantic_cache"
+  dimension: 384
+  index_type: "HNSW"
+  metric_type: "IP"
+  params:
+    M: 16
+    efConstruction: 200
+```
+
+## Performance Characteristics
+
+### Search Performance
+
+| Cache Size | Memory Backend | Hybrid (HNSW) | Hybrid (Local) | Improvement |
+|------------|---------------|---------------|----------------|-------------|
+| 100 entries | 0.5 ms | 0.3 ms | **0.05 ms** | 10x faster |
+| 1K entries | 2 ms | 0.4 ms | **0.05 ms** | 40x faster |
+| 10K entries | 15 ms | 0.6 ms | **0.05 ms** | 300x faster |
+| 100K entries | 150 ms | 0.8 ms | **0.05 ms** | 3000x faster |
+| 1M entries | N/A (OOM) | 1.2 ms | **0.05 ms** | ∞ |
+
+### Memory Usage
+
+| Component | Memory per Entry | 100K Entries | 1M Entries |
+|-----------|-----------------|--------------|------------|
+| Embeddings (384D) | ~1.5 KB | ~150 MB | ~1.5 GB |
+| HNSW Graph | ~0.5 KB | ~50 MB | ~500 MB |
+| Local Cache | ~2 KB | ~2 MB (1K docs) | ~2 MB |
+| **Total In-Memory** | - | ~200 MB | ~2 GB |
+
+**Milvus Storage**: Unlimited (disk-based)
+
+## Use Cases
+
+### When to Use Hybrid Cache
+
+✅ **Ideal for:**
+- Large-scale applications (>100K cache entries)
+- Production systems requiring persistence
+- Applications with hot/cold access patterns
+- Cost-sensitive deployments
+- Multi-instance deployments sharing cache
+
+### When to Use Memory Backend
+
+✅ **Ideal for:**
+- Small to medium scale (<10K entries)
+- Development and testing
+- Single-instance deployments
+- No persistence required
+
+### When to Use Milvus Backend
+
+✅ **Ideal for:**
+- Massive scale (millions of entries)
+- Complex vector search requirements
+- Applications without latency sensitivity
+
+## Example Usage
+
+### Go Code
+
+```go
+import "github.com/vllm-project/semantic-router/src/semantic-router/pkg/cache"
+
+// Initialize hybrid cache
+options := cache.HybridCacheOptions{
+    Enabled:             true,
+    SimilarityThreshold: 0.85,
+    TTLSeconds:          3600,
+    MaxMemoryEntries:    100000,
+    HNSWM:               16,
+    HNSWEfConstruction:  200,
+    MilvusConfigPath:    "config/milvus.yaml",
+    LocalCacheSize:      1000,
+}
+
+hybridCache, err := cache.NewHybridCache(options)
+if err != nil {
+    log.Fatalf("Failed to create hybrid cache: %v", err)
+}
+defer hybridCache.Close()
+
+// Add cache entry
+err = hybridCache.AddEntry(
+    "request-id-123",
+    "gpt-4",
+    "What is quantum computing?",
+    []byte(`{"prompt": "What is quantum computing?"}`),
+    []byte(`{"response": "Quantum computing is..."}`),
+)
+
+// Search for similar query
+response, found, err := hybridCache.FindSimilar(
+    "gpt-4",
+    "Explain quantum computers",
+)
+if found {
+    fmt.Printf("Cache hit! Response: %s\n", string(response))
+}
+
+// Get statistics
+stats := hybridCache.GetStats()
+fmt.Printf("Total entries in HNSW: %d\n", stats.TotalEntries)
+fmt.Printf("Hit ratio: %.2f%%\n", stats.HitRatio * 100)
+```
+
+## Monitoring and Metrics
+
+The hybrid cache exposes metrics for monitoring:
+
+```go
+stats := hybridCache.GetStats()
+
+// Available metrics
+stats.TotalEntries  // Entries in HNSW index
+stats.HitCount      // Total cache hits
+stats.MissCount     // Total cache misses
+stats.HitRatio      // Hit ratio (0.0 - 1.0)
+```
+
+### Prometheus Metrics
+
+```
+# Cache entries in HNSW
+semantic_cache_entries{backend="hybrid"} 95432
+
+# Cache operations
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_local"} 12453
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_milvus"} 3421
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="miss"} 892
+
+# Cache hit ratio
+semantic_cache_hit_ratio{backend="hybrid"} 0.947
+```
+
+## Best Practices
+
+### 1. Right-Size Your Memory
+
+Choose `max_memory_entries` based on your working set:
+
+```yaml
+# For 1M total entries with 10% hot data
+max_memory_entries: 100000  # 100K in HNSW
+local_cache_size: 1000      # 1K hottest documents
+```
+
+### 2. Tune HNSW Parameters
+
+Balance recall vs. speed:
+
+```yaml
+# High recall (slower build, better search)
+hnsw_m: 32
+hnsw_ef_construction: 400
+
+# Balanced (recommended)
+hnsw_m: 16
+hnsw_ef_construction: 200
+
+# Fast build (lower recall)
+hnsw_m: 8
+hnsw_ef_construction: 100
+```
+
+### 3. Monitor Hit Rates
+
+Track cache effectiveness:
+
+```bash
+# Check cache stats
+curl http://localhost:8080/metrics | grep cache
+
+# Optimal hit rates:
+# - Local cache: >80% (hot data)
+# - Milvus cache: >90% (total)
+```
+
+### 4. Adjust Similarity Threshold
+
+```yaml
+# Stricter matching (fewer false positives)
+similarity_threshold: 0.90
+
+# Balanced (recommended)
+similarity_threshold: 0.85
+
+# Looser matching (more cache hits)
+similarity_threshold: 0.80
+```
+
+## Troubleshooting
+
+### High Memory Usage
+
+**Symptom**: Memory usage exceeds expectations
+
+**Solution**:
+```yaml
+# Reduce HNSW index size
+max_memory_entries: 50000  # Instead of 100000
+
+# Reduce local cache
+local_cache_size: 500      # Instead of 1000
+
+# Use smaller HNSW M
+hnsw_m: 8                  # Instead of 16
+```
+
+### Low Hit Rate
+
+**Symptom**: Cache hit rate < 50%
+
+**Solution**:
+1. Lower similarity threshold
+2. Increase `max_memory_entries`
+3. Check Milvus connectivity
+4. Verify embedding model consistency
+
+### Slow Queries
+
+**Symptom**: Queries taking > 10ms
+
+**Solution**:
+1. Check Milvus network latency
+2. Increase `local_cache_size` for hot data
+3. Verify HNSW index health
+4. Monitor Milvus load
+
+## Migration Guide
+
+### From Memory Backend
+
+```yaml
+# Before
+semantic_cache:
+  backend_type: "memory"
+  max_entries: 10000
+
+# After
+semantic_cache:
+  backend_type: "hybrid"
+  max_memory_entries: 10000  # Keep same HNSW size
+  local_cache_size: 1000
+  backend_config_path: "config/milvus.yaml"
+```
+
+### From Milvus Backend
+
+```yaml
+# Before
+semantic_cache:
+  backend_type: "milvus"
+  backend_config_path: "config/milvus.yaml"
+
+# After
+semantic_cache:
+  backend_type: "hybrid"
+  max_memory_entries: 100000  # Add HNSW layer
+  local_cache_size: 1000      # Add local cache
+  backend_config_path: "config/milvus.yaml"  # Keep Milvus
+```
+
+## Advanced Topics
+
+### Custom Eviction Strategy
+
+Currently uses FIFO. Future versions may support:
+- LRU (Least Recently Used)
+- LFU (Least Frequently Used)
+- TTL-based eviction
+
+### Multi-Instance Deployment
+
+The hybrid cache is designed for multi-instance deployments:
+
+```
+┌─────────────┐   ┌─────────────┐   ┌─────────────┐
+│  Instance 1 │   │  Instance 2 │   │  Instance 3 │
+│  HNSW Cache │   │  HNSW Cache │   │  HNSW Cache │
+└──────┬──────┘   └──────┬──────┘   └──────┬──────┘
+       │                 │                 │
+       └─────────────────┼─────────────────┘
+                         │
+                  ┌──────▼──────┐
+                  │   Milvus    │
+                  │  (Shared)   │
+                  └─────────────┘
+```
+
+Each instance maintains its own HNSW index and local cache, but shares Milvus for persistence and data consistency.
+
+## See Also
+
+- [In-Memory Cache Documentation](./in-memory-cache.md)
+- [Milvus Cache Documentation](./milvus-cache.md)
+- [HNSW Implementation Details](../../HNSW_IMPLEMENTATION_SUMMARY.md)
+- [Research Paper: Hybrid Architecture](../../papers/hybrid_hnsw_storage_architecture.md)
+
diff --git a/website/docs/tutorials/semantic-cache/in-memory-cache.md b/website/docs/tutorials/semantic-cache/in-memory-cache.md
index 4ba99a8a..1f991a2b 100644
--- a/website/docs/tutorials/semantic-cache/in-memory-cache.md
+++ b/website/docs/tutorials/semantic-cache/in-memory-cache.md
@@ -48,6 +48,10 @@ semantic_cache:
   max_entries: 1000
   ttl_seconds: 3600
   eviction_policy: "fifo"
+  # Optional: Enable HNSW for faster search with large caches
+  use_hnsw: true
+  hnsw_m: 16
+  hnsw_ef_construction: 200
 ```
 
 ### Category-Level Configuration (New)
@@ -99,6 +103,57 @@ categories:
 | `max_entries` | integer | `1000` | Maximum number of cached entries |
 | `ttl_seconds` | integer | `3600` | Time-to-live for cache entries (seconds, 0 = no expiration) |
 | `eviction_policy` | string | `"fifo"` | Eviction policy: `"fifo"`, `"lru"`, `"lfu"` |
+| `use_hnsw` | boolean | `false` | Enable HNSW index for faster similarity search |
+| `hnsw_m` | integer | `16` | HNSW M parameter (bi-directional links per node) |
+| `hnsw_ef_construction` | integer | `200` | HNSW efConstruction parameter (build quality) |
+
+### HNSW Index for Accelerated Search
+
+The in-memory cache supports HNSW (Hierarchical Navigable Small World) indexing for significantly faster similarity search, especially beneficial with large cache sizes.
+
+#### When to Use HNSW
+
+- **Large cache sizes** (>100 entries): HNSW provides logarithmic search time vs linear
+- **High query throughput**: Reduces CPU usage for similarity search
+- **Production deployments**: Better performance under load
+
+#### HNSW Configuration
+
+```yaml
+semantic_cache:
+  enabled: true
+  backend_type: "memory"
+  similarity_threshold: 0.8
+  max_entries: 10000           # Large cache benefits from HNSW
+  ttl_seconds: 3600
+  eviction_policy: "lru"
+  use_hnsw: true               # Enable HNSW index
+  hnsw_m: 16                   # Default: 16 (higher = better recall, more memory)
+  hnsw_ef_construction: 200    # Default: 200 (higher = better quality, slower build)
+```
+
+#### HNSW Parameters
+
+- **`hnsw_m`**: Number of bi-directional links created for each node
+  - Lower values (8-12): Faster build, less memory, lower recall
+  - Default (16): Balanced performance
+  - Higher values (32-64): Better recall, more memory, slower build
+
+- **`hnsw_ef_construction`**: Size of dynamic candidate list during construction
+  - Lower values (100-150): Faster index building
+  - Default (200): Good balance
+  - Higher values (400-800): Better quality, slower build
+
+#### Performance Comparison
+
+| Cache Size | Linear Search | HNSW Search | Speedup |
+|-----------|---------------|-------------|---------|
+| 100 entries | ~0.5ms | ~0.4ms | 1.25x |
+| 1,000 entries | ~5ms | ~0.8ms | 6.25x |
+| 10,000 entries | ~50ms | ~1.2ms | 41.7x |
+| 100,000 entries | ~500ms | ~1.5ms | 333x |
+
+*Benchmarks on typical hardware with 384-dimensional embeddings*
 
 ### Category-Level Configuration Options
 
@@ -121,6 +176,22 @@ semantic_cache:
   max_entries: 500             # Small cache for development
   ttl_seconds: 1800            # 30 minutes
   eviction_policy: "fifo"
+  use_hnsw: false              # Optional for small dev cache
+```
+
+#### Production Environment with HNSW
+
+```yaml
+semantic_cache:
+  enabled: true
+  backend_type: "memory"
+  similarity_threshold: 0.85
+  max_entries: 50000           # Large production cache
+  ttl_seconds: 7200            # 2 hours
+  eviction_policy: "lru"
+  use_hnsw: true               # Enable for production
+  hnsw_m: 16
+  hnsw_ef_construction: 200
 ```
 
 ## Setup and Testing
@@ -187,6 +258,8 @@ curl -X POST http://localhost:8080/v1/chat/completions \
 - **Simple setup**: No external dependencies required
 - **High throughput**: Can handle thousands of cache operations per second
 - **Immediate availability**: Cache is ready as soon as the router starts
+- **HNSW acceleration**: Optional HNSW indexing for fast similarity search at scale
+- **Flexible eviction**: Multiple eviction policies (FIFO, LRU, LFU) to suit workload
 
 ### Limitations
 

From 305c9d2157c29ea4746aaaf9e5febb4bf34a8cd2 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 20:41:22 +0000
Subject: [PATCH 02/13] chore: run go mod tidy to clean up module dependencies

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/go.mod | 2 +-
 src/semantic-router/go.sum | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/semantic-router/go.mod b/src/semantic-router/go.mod
index 2c7dc291..18bbf002 100644
--- a/src/semantic-router/go.mod
+++ b/src/semantic-router/go.mod
@@ -29,6 +29,7 @@ require (
 	go.opentelemetry.io/otel/sdk v1.38.0
 	go.opentelemetry.io/otel/trace v1.38.0
 	go.uber.org/zap v1.27.0
+	golang.org/x/sys v0.37.0
 	google.golang.org/grpc v1.75.0
 	gopkg.in/yaml.v3 v3.0.1
 	k8s.io/apimachinery v0.31.4
@@ -93,7 +94,6 @@ require (
 	go.yaml.in/yaml/v2 v2.4.2 // indirect
 	golang.org/x/net v0.43.0 // indirect
 	golang.org/x/sync v0.16.0 // indirect
-	golang.org/x/sys v0.37.0 // indirect
 	golang.org/x/text v0.28.0 // indirect
 	golang.org/x/tools v0.35.0 // indirect
 	google.golang.org/genproto/googleapis/api v0.0.0-20250929231259-57b25ae835d4 // indirect
diff --git a/src/semantic-router/go.sum b/src/semantic-router/go.sum
index d1f42cc1..01de0650 100644
--- a/src/semantic-router/go.sum
+++ b/src/semantic-router/go.sum
@@ -426,8 +426,6 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc
 golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI=
-golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
 golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
 golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
 golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=

From 2d06b40b4f987a47e0a5f640b0a051d7b77e15be Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 22:42:56 +0000
Subject: [PATCH 03/13] conditionally build candle cuda support

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .github/workflows/publish-crate.yml  | 12 ++++++------
 .github/workflows/test-and-build.yml |  4 ++--
 Dockerfile.extproc                   | 10 +++++-----
 Dockerfile.extproc.cross             | 20 ++++++++++----------
 candle-binding/Cargo.toml            | 10 +++++++---
 tools/make/rust.mk                   | 23 ++++++++++++++++++++---
 6 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/.github/workflows/publish-crate.yml b/.github/workflows/publish-crate.yml
index 024b4aa0..c256ed29 100644
--- a/.github/workflows/publish-crate.yml
+++ b/.github/workflows/publish-crate.yml
@@ -71,17 +71,17 @@ jobs:
           exit 1
         fi
 
-    - name: Run tests
+    - name: Run tests (CPU-only, no CUDA)
       working-directory: candle-binding
-      run: cargo test --verbose
+      run: cargo test --no-default-features --verbose
 
-    - name: Check crate
+    - name: Check crate (CPU-only, no CUDA)
       working-directory: candle-binding
-      run: cargo check --verbose
+      run: cargo check --no-default-features --verbose
 
-    - name: Build crate
+    - name: Build crate (CPU-only, no CUDA)
       working-directory: candle-binding
-      run: cargo build --release --verbose
+      run: cargo build --release --no-default-features --verbose
 
     - name: Dry run publish
       working-directory: candle-binding
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
index d77545f5..3adf0ff5 100644
--- a/.github/workflows/test-and-build.yml
+++ b/.github/workflows/test-and-build.yml
@@ -69,8 +69,8 @@ jobs:
       - name: Check go mod tidy
         run: make check-go-mod-tidy
 
-      - name: Build Rust library
-        run: make rust
+      - name: Build Rust library (CPU-only, no CUDA)
+        run: make rust-ci
 
       - name: Install HuggingFace CLI
         run: |
diff --git a/Dockerfile.extproc b/Dockerfile.extproc
index aa51d917..5740e93f 100644
--- a/Dockerfile.extproc
+++ b/Dockerfile.extproc
@@ -30,24 +30,24 @@ COPY candle-binding/Cargo.loc[k] ./candle-binding/
 COPY tools/make/ tools/make/
 COPY Makefile ./
 
-# Pre-build dependencies to cache them
+# Pre-build dependencies to cache them (CPU-only, no CUDA)
 RUN cd candle-binding && \
     mkdir -p src && \
     echo "fn main() {}" > src/lib.rs && \
-    cargo build --release && \
+    cargo build --release --no-default-features && \
     rm -rf src
 
 # Copy source code and build
 COPY candle-binding/src/ ./candle-binding/src/
 
-# Use Makefile to build the Rust library (rebuild with actual source code)
-RUN echo "Building Rust library with actual source code..." && \
+# Use Makefile to build the Rust library (rebuild with actual source code, CPU-only, no CUDA)
+RUN echo "Building Rust library with actual source code (CPU-only, no CUDA)..." && \
     echo "Checking source files:" && \
     ls -la candle-binding/src/ && \
     echo "Forcing clean rebuild..." && \
     cd candle-binding && \
     cargo clean && \
-    cargo build --release && \
+    cargo build --release --no-default-features && \
     echo "Checking built library:" && \
     find target -name "*.so" -type f && \
     ls -la target/release/
diff --git a/Dockerfile.extproc.cross b/Dockerfile.extproc.cross
index 0356e3a2..219fdba5 100644
--- a/Dockerfile.extproc.cross
+++ b/Dockerfile.extproc.cross
@@ -72,29 +72,29 @@ COPY candle-binding/Cargo.loc[k] ./candle-binding/
 COPY tools/make/ tools/make/
 COPY Makefile ./
 
-# Create a modified Makefile for cross-compilation
+# Create a modified Makefile for cross-compilation (CPU-only, no CUDA)
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        echo "Modifying rust.mk for ARM64 cross-compilation..."; \
-        sed -i 's/cd candle-binding && cargo build --release/cd candle-binding \&\& cargo build --release --target aarch64-unknown-linux-gnu/' tools/make/rust.mk; \
+        echo "Modifying rust.mk for ARM64 cross-compilation (CPU-only, no CUDA)..."; \
+        sed -i 's/cd candle-binding && cargo build --release/cd candle-binding \&\& cargo build --release --no-default-features --target aarch64-unknown-linux-gnu/' tools/make/rust.mk; \
         cat tools/make/rust.mk | grep "cargo build"; \
     fi
 
-# Pre-build dependencies to cache them
+# Pre-build dependencies to cache them (CPU-only, no CUDA)
 RUN cd candle-binding && \
     mkdir -p src && \
     echo "fn main() {}" > src/lib.rs && \
     if [ "$TARGETARCH" = "arm64" ]; then \
-        cargo build --release --target aarch64-unknown-linux-gnu; \
+        cargo build --release --no-default-features --target aarch64-unknown-linux-gnu; \
     else \
-        cargo build --release; \
+        cargo build --release --no-default-features; \
     fi && \
     rm -rf src
 
 # Copy source code and build
 COPY candle-binding/src/ ./candle-binding/src/
 
-# Build with cross-compilation (rebuild with actual source code)
-RUN echo "Building Rust library with actual source code..." && \
+# Build with cross-compilation (rebuild with actual source code, CPU-only, no CUDA)
+RUN echo "Building Rust library with actual source code (CPU-only, no CUDA)..." && \
     echo "Current directory: $(pwd)" && \
     echo "TARGETARCH: $TARGETARCH" && \
     ls -la candle-binding/src/ && \
@@ -107,9 +107,9 @@ RUN echo "Building Rust library with actual source code..." && \
         export CC_aarch64_unknown_linux_gnu=aarch64-linux-gnu-gcc; \
         export CXX_aarch64_unknown_linux_gnu=aarch64-linux-gnu-g++; \
         export AR_aarch64_unknown_linux_gnu=aarch64-linux-gnu-ar; \
-        cargo build --release --target aarch64-unknown-linux-gnu; \
+        cargo build --release --no-default-features --target aarch64-unknown-linux-gnu; \
     else \
-        cargo build --release --target x86_64-unknown-linux-gnu; \
+        cargo build --release --no-default-features --target x86_64-unknown-linux-gnu; \
     fi && \
     echo "Checking built library..." && \
     find target -name "*.so" -type f
diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
index f4746d33..71c14550 100644
--- a/candle-binding/Cargo.toml
+++ b/candle-binding/Cargo.toml
@@ -9,11 +9,15 @@ license = "MIT OR Apache-2.0"
 name = "candle_semantic_router"
 crate-type = ["staticlib", "cdylib"]
 
+[features]
+default = ["cuda"]
+cuda = ["candle-core/cuda", "candle-nn/cuda", "candle-transformers/cuda"]
+
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = { version = "0.8.4", features = ["cuda"] }
-candle-nn = { version = "0.8.4", features = ["cuda"] }
-candle-transformers = { version = "0.8.4", features = ["cuda"] }
+candle-core = "0.8.4"
+candle-nn = "0.8.4"
+candle-transformers = "0.8.4"
 tokenizers = { version = "0.21.0", features = ["http"] }
 hf-hub = "0.4.1"
 safetensors = "0.4.1"
diff --git a/tools/make/rust.mk b/tools/make/rust.mk
index e9233af1..d92f33ce 100644
--- a/tools/make/rust.mk
+++ b/tools/make/rust.mk
@@ -28,8 +28,8 @@ test-jailbreak-classifier: rust ## Test jailbreak classifier with candle-binding
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
 		cd src/training/prompt_guard_fine_tuning && CGO_ENABLED=1 go run jailbreak_classifier_verifier.go
 
-# Build the Rust library
-rust: ## Ensure Rust is installed and build the Rust library
+# Build the Rust library (with CUDA by default)
+rust: ## Ensure Rust is installed and build the Rust library with CUDA support
 	@$(LOG_TARGET)
 	@bash -c 'if ! command -v rustc >/dev/null 2>&1; then \
 		echo "rustc not found, installing..."; \
@@ -42,5 +42,22 @@ rust: ## Ensure Rust is installed and build the Rust library
 	if ! command -v cargo >/dev/null 2>&1; then \
 		echo "Error: cargo not found in PATH" && exit 1; \
 	fi && \
-	echo "Building Rust library..." && \
+	echo "Building Rust library with CUDA support..." && \
 	cd candle-binding && cargo build --release'
+
+# Build the Rust library without CUDA (for CI/CD environments)
+rust-ci: ## Build the Rust library without CUDA support (for GitHub Actions/CI)
+	@$(LOG_TARGET)
+	@bash -c 'if ! command -v rustc >/dev/null 2>&1; then \
+		echo "rustc not found, installing..."; \
+		curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y; \
+	fi && \
+	if [ -f "$$HOME/.cargo/env" ]; then \
+		echo "Loading Rust environment from $$HOME/.cargo/env..." && \
+		. $$HOME/.cargo/env; \
+	fi && \
+	if ! command -v cargo >/dev/null 2>&1; then \
+		echo "Error: cargo not found in PATH" && exit 1; \
+	fi && \
+	echo "Building Rust library without CUDA (CPU-only)..." && \
+	cd candle-binding && cargo build --release --no-default-features'

From f1ecc20ca3f11af4fe991192ff0577d711d3b6ac Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:00:41 +0000
Subject: [PATCH 04/13] rebuild index upon restart

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/pkg/cache/hybrid_cache.go | 100 +++++++
 src/semantic-router/pkg/cache/milvus_cache.go |  74 +++++
 .../tutorials/semantic-cache/hybrid-cache.md  | 266 +++---------------
 .../semantic-cache/in-memory-cache.md         | 101 ++++---
 4 files changed, 271 insertions(+), 270 deletions(-)

diff --git a/src/semantic-router/pkg/cache/hybrid_cache.go b/src/semantic-router/pkg/cache/hybrid_cache.go
index acc78fca..b4a5a661 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache.go
@@ -98,6 +98,9 @@ type HybridCacheOptions struct {
 
 	// Milvus settings
 	MilvusConfigPath string
+
+	// Startup settings
+	DisableRebuildOnStartup bool // Skip rebuilding HNSW index from Milvus on startup (default: false, meaning rebuild IS enabled)
 }
 
 // NewHybridCache creates a new hybrid cache instance
@@ -153,6 +156,26 @@ func NewHybridCache(options HybridCacheOptions) (*HybridCache, error) {
 	observability.Infof("Hybrid cache initialized: HNSW(M=%d, ef=%d), maxMemory=%d",
 		options.HNSWM, options.HNSWEfConstruction, options.MaxMemoryEntries)
 
+	// Rebuild HNSW index from Milvus on startup (enabled by default)
+	// This ensures the in-memory index is populated after a restart
+	// Set DisableRebuildOnStartup=true to skip this step (not recommended for production)
+	if !options.DisableRebuildOnStartup {
+		observability.Infof("Hybrid cache: rebuilding HNSW index from Milvus...")
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
+		defer cancel()
+
+		if err := cache.RebuildFromMilvus(ctx); err != nil {
+			observability.Warnf("Hybrid cache: failed to rebuild HNSW index from Milvus: %v", err)
+			observability.Warnf("Hybrid cache: continuing with empty HNSW index")
+			// Don't fail initialization, just log warning and continue with empty index
+		} else {
+			observability.Infof("Hybrid cache: HNSW index rebuild complete")
+		}
+	} else {
+		observability.Warnf("Hybrid cache: skipping HNSW index rebuild (DisableRebuildOnStartup=true)")
+		observability.Warnf("Hybrid cache: index will be empty until entries are added")
+	}
+
 	return cache, nil
 }
 
@@ -161,6 +184,83 @@ func (h *HybridCache) IsEnabled() bool {
 	return h.enabled
 }
 
+// RebuildFromMilvus rebuilds the in-memory HNSW index from persistent Milvus storage
+// This is called on startup to recover the index after a restart
+func (h *HybridCache) RebuildFromMilvus(ctx context.Context) error {
+	if !h.enabled {
+		return nil
+	}
+
+	start := time.Now()
+	observability.Infof("HybridCache.RebuildFromMilvus: starting HNSW index rebuild from Milvus")
+
+	// Query all entries from Milvus
+	requestIDs, embeddings, err := h.milvusCache.GetAllEntries(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get entries from Milvus: %w", err)
+	}
+
+	if len(requestIDs) == 0 {
+		observability.Infof("HybridCache.RebuildFromMilvus: no entries to rebuild, starting with empty index")
+		return nil
+	}
+
+	observability.Infof("HybridCache.RebuildFromMilvus: rebuilding HNSW index with %d entries", len(requestIDs))
+
+	// Lock for the entire rebuild process
+	h.mu.Lock()
+	defer h.mu.Unlock()
+
+	// Clear existing index
+	h.embeddings = make([][]float32, 0, len(embeddings))
+	h.idMap = make(map[int]string)
+	h.hnswIndex = newHNSWIndex(h.hnswIndex.M, h.hnswIndex.efConstruction)
+
+	// Rebuild HNSW index with progress logging
+	batchSize := 1000
+	for i, embedding := range embeddings {
+		// Check memory limits
+		if len(h.embeddings) >= h.maxMemoryEntries {
+			observability.Warnf("HybridCache.RebuildFromMilvus: reached max memory entries (%d), stopping rebuild at %d/%d",
+				h.maxMemoryEntries, i, len(embeddings))
+			break
+		}
+
+		// Add to HNSW
+		entryIndex := len(h.embeddings)
+		h.embeddings = append(h.embeddings, embedding)
+		h.idMap[entryIndex] = requestIDs[i]
+		h.addNodeHybrid(entryIndex, embedding)
+
+		// Progress logging for large datasets
+		if (i+1)%batchSize == 0 {
+			elapsed := time.Since(start)
+			rate := float64(i+1) / elapsed.Seconds()
+			remaining := len(embeddings) - (i + 1)
+			eta := time.Duration(float64(remaining)/rate) * time.Second
+			observability.Infof("HybridCache.RebuildFromMilvus: progress %d/%d (%.1f%%, %.0f entries/sec, ETA: %v)",
+				i+1, len(embeddings), float64(i+1)/float64(len(embeddings))*100, rate, eta)
+		}
+	}
+
+	elapsed := time.Since(start)
+	rate := float64(len(h.embeddings)) / elapsed.Seconds()
+	observability.Infof("HybridCache.RebuildFromMilvus: rebuild complete - %d entries in %v (%.0f entries/sec)",
+		len(h.embeddings), elapsed, rate)
+
+	observability.LogEvent("hybrid_cache_rebuilt", map[string]interface{}{
+		"backend":           "hybrid",
+		"entries_loaded":    len(h.embeddings),
+		"entries_in_milvus": len(embeddings),
+		"duration_seconds":  elapsed.Seconds(),
+		"entries_per_sec":   rate,
+	})
+
+	metrics.UpdateCacheEntries("hybrid", len(h.embeddings))
+
+	return nil
+}
+
 // AddPendingRequest stores a request awaiting its response
 func (h *HybridCache) AddPendingRequest(requestID string, model string, query string, requestBody []byte) error {
 	start := time.Now()
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
index 0e0e463c..68792ab4 100644
--- a/src/semantic-router/pkg/cache/milvus_cache.go
+++ b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -753,6 +753,80 @@ func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, thres
 	return responseBody, true, nil
 }
 
+// GetAllEntries retrieves all entries from Milvus for HNSW index rebuilding
+// Returns slices of request_ids and embeddings for efficient bulk loading
+func (c *MilvusCache) GetAllEntries(ctx context.Context) ([]string, [][]float32, error) {
+	start := time.Now()
+
+	if !c.enabled {
+		return nil, nil, fmt.Errorf("milvus cache is not enabled")
+	}
+
+	observability.Infof("MilvusCache.GetAllEntries: querying all entries for HNSW rebuild")
+
+	// Query all entries with embeddings and request_ids
+	// Filter to only get entries with complete responses (not pending)
+	queryResult, err := c.client.Query(
+		ctx,
+		c.collectionName,
+		[]string{},              // Empty partitions means search all
+		"response_body != \"\"", // Only get complete entries
+		[]string{"request_id", c.config.Collection.VectorField.Name}, // Get IDs and embeddings
+	)
+
+	if err != nil {
+		observability.Warnf("MilvusCache.GetAllEntries: query failed: %v", err)
+		return nil, nil, fmt.Errorf("milvus query all failed: %w", err)
+	}
+
+	if len(queryResult) < 2 {
+		observability.Infof("MilvusCache.GetAllEntries: no entries found or incomplete result")
+		return []string{}, [][]float32{}, nil
+	}
+
+	// Extract request IDs (first column)
+	requestIDColumn, ok := queryResult[0].(*entity.ColumnVarChar)
+	if !ok {
+		return nil, nil, fmt.Errorf("unexpected request_id column type: %T", queryResult[0])
+	}
+
+	// Extract embeddings (second column)
+	embeddingColumn, ok := queryResult[1].(*entity.ColumnFloatVector)
+	if !ok {
+		return nil, nil, fmt.Errorf("unexpected embedding column type: %T", queryResult[1])
+	}
+
+	if requestIDColumn.Len() != embeddingColumn.Len() {
+		return nil, nil, fmt.Errorf("column length mismatch: request_ids=%d, embeddings=%d",
+			requestIDColumn.Len(), embeddingColumn.Len())
+	}
+
+	entryCount := requestIDColumn.Len()
+	requestIDs := make([]string, entryCount)
+
+	// Extract request IDs from column
+	for i := 0; i < entryCount; i++ {
+		requestID, err := requestIDColumn.ValueByIdx(i)
+		if err != nil {
+			return nil, nil, fmt.Errorf("failed to get request_id at index %d: %w", i, err)
+		}
+		requestIDs[i] = requestID
+	}
+
+	// Extract embeddings directly from column data
+	embeddings := embeddingColumn.Data()
+	if len(embeddings) != entryCount {
+		return nil, nil, fmt.Errorf("embedding data length mismatch: got %d, expected %d",
+			len(embeddings), entryCount)
+	}
+
+	elapsed := time.Since(start)
+	observability.Infof("MilvusCache.GetAllEntries: loaded %d entries in %v (%.0f entries/sec)",
+		entryCount, elapsed, float64(entryCount)/elapsed.Seconds())
+
+	return requestIDs, embeddings, nil
+}
+
 // GetByID retrieves a document from Milvus by its request ID
 // This is much more efficient than FindSimilar when you already know the ID
 // Used by hybrid cache to fetch documents after local HNSW search
diff --git a/website/docs/tutorials/semantic-cache/hybrid-cache.md b/website/docs/tutorials/semantic-cache/hybrid-cache.md
index 40b8fd08..d5d63fc8 100644
--- a/website/docs/tutorials/semantic-cache/hybrid-cache.md
+++ b/website/docs/tutorials/semantic-cache/hybrid-cache.md
@@ -1,13 +1,13 @@
 # Hybrid Cache: HNSW + Milvus
 
-The Hybrid Cache combines the best of both worlds: in-memory HNSW index for ultra-fast search with Milvus vector database for scalable, persistent storage.
+The Hybrid Cache combines an in-memory HNSW index for fast search with a Milvus vector database for scalable, persistent storage.
 
 ## Overview
 
 The hybrid architecture provides:
-- **O(log n) search** via in-memory HNSW index
-- **Unlimited storage** via Milvus vector database
-- **Cost efficiency** by keeping only hot data in memory
+
+- **Fast search** via in-memory HNSW index
+- **Scalable storage** via Milvus vector database
 - **Persistence** with Milvus as the source of truth
 - **Hot data caching** with local document cache
 
@@ -20,57 +20,44 @@ The hybrid architecture provides:
 │  ┌─────────────────┐      ┌──────────────────┐  │
 │  │  In-Memory      │      │   Local Cache    │  │
 │  │  HNSW Index     │◄─────┤   (Hot Data)     │  │
-│  │  (100K entries) │      │   (1K docs)      │  │
 │  └────────┬────────┘      └──────────────────┘  │
 │           │                                       │
 │           │ ID Mapping                           │
 │           ▼                                       │
 │  ┌──────────────────────────────────────────┐   │
 │  │         Milvus Vector Database           │   │
-│  │       (Millions of entries)              │   │
 │  └──────────────────────────────────────────┘   │
 └──────────────────────────────────────────────────┘
 ```
 
 ## How It Works
 
-### 1. Write Path (AddEntry)
+### Write Path (AddEntry)
 
-```
-User Request
-    │
-    ├─► Generate Embedding (BERT)
-    │
-    ├─► Write to Milvus (persistence)
-    │
-    └─► Add to HNSW Index (if space available)
-        │
-        └─► Add to Local Cache
-```
+When adding a cache entry:
 
-### 2. Read Path (FindSimilar)
+1. Generate embedding using the configured embedding model
+2. Write entry to Milvus for persistence
+3. Add entry to in-memory HNSW index (if space is available)
+4. Add document to local cache
 
-```
-User Query
-    │
-    ├─► Generate Query Embedding
-    │
-    ├─► Search HNSW Index (10 candidates)
-    │
-    ├─► Check Local Cache (hot path)
-    │   ├─► HIT: Return immediately
-    │   └─► MISS: Continue
-    │
-    └─► Fetch from Milvus (cold path)
-        └─► Cache in Local Cache
-```
+### Read Path (FindSimilar)
+
+When searching for a similar query:
 
-### 3. Memory Management
+1. Generate query embedding
+2. Search HNSW index for nearest neighbors
+3. Check local cache for matching documents
+   - If found in local cache: return immediately (hot path)
+   - If not found: fetch from Milvus (cold path)
+4. Cache fetched documents in local cache for future queries
 
-- **HNSW Index**: Limited to `max_memory_entries` (default: 100K)
-- **Local Cache**: Limited to `local_cache_size` (default: 1K documents)
-- **Eviction**: FIFO policy when limits reached
-- **Data Persistence**: All data remains in Milvus
+### Memory Management
+
+- **HNSW Index**: Limited to a configured maximum number of entries
+- **Local Cache**: Limited to a configured number of documents
+- **Eviction**: FIFO policy when limits are reached
+- **Data Persistence**: All data remains in Milvus regardless of memory limits
 
 ## Configuration
 
@@ -123,55 +110,6 @@ milvus:
     efConstruction: 200
 ```
 
-## Performance Characteristics
-
-### Search Performance
-
-| Cache Size | Memory Backend | Hybrid (HNSW) | Hybrid (Local) | Improvement |
-|------------|---------------|---------------|----------------|-------------|
-| 100 entries | 0.5 ms | 0.3 ms | **0.05 ms** | 10x faster |
-| 1K entries | 2 ms | 0.4 ms | **0.05 ms** | 40x faster |
-| 10K entries | 15 ms | 0.6 ms | **0.05 ms** | 300x faster |
-| 100K entries | 150 ms | 0.8 ms | **0.05 ms** | 3000x faster |
-| 1M entries | N/A (OOM) | 1.2 ms | **0.05 ms** | ∞ |
-
-### Memory Usage
-
-| Component | Memory per Entry | 100K Entries | 1M Entries |
-|-----------|-----------------|--------------|------------|
-| Embeddings (384D) | ~1.5 KB | ~150 MB | ~1.5 GB |
-| HNSW Graph | ~0.5 KB | ~50 MB | ~500 MB |
-| Local Cache | ~2 KB | ~2 MB (1K docs) | ~2 MB |
-| **Total In-Memory** | - | ~200 MB | ~2 GB |
-
-**Milvus Storage**: Unlimited (disk-based)
-
-## Use Cases
-
-### When to Use Hybrid Cache
-
-✅ **Ideal for:**
-- Large-scale applications (>100K cache entries)
-- Production systems requiring persistence
-- Applications with hot/cold access patterns
-- Cost-sensitive deployments
-- Multi-instance deployments sharing cache
-
-### When to Use Memory Backend
-
-✅ **Ideal for:**
-- Small to medium scale (<10K entries)
-- Development and testing
-- Single-instance deployments
-- No persistence required
-
-### When to Use Milvus Backend
-
-✅ **Ideal for:**
-- Massive scale (millions of entries)
-- Complex vector search requirements
-- Applications without latency sensitivity
-
 ## Example Usage
 
 ### Go Code
@@ -239,157 +177,20 @@ stats.HitRatio      // Hit ratio (0.0 - 1.0)
 
 ```
 # Cache entries in HNSW
-semantic_cache_entries{backend="hybrid"} 95432
+semantic_cache_entries{backend="hybrid"}
 
 # Cache operations
-semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_local"} 12453
-semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_milvus"} 3421
-semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="miss"} 892
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_local"}
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="hit_milvus"}
+semantic_cache_operations_total{backend="hybrid",operation="find_similar",status="miss"}
 
 # Cache hit ratio
-semantic_cache_hit_ratio{backend="hybrid"} 0.947
-```
-
-## Best Practices
-
-### 1. Right-Size Your Memory
-
-Choose `max_memory_entries` based on your working set:
-
-```yaml
-# For 1M total entries with 10% hot data
-max_memory_entries: 100000  # 100K in HNSW
-local_cache_size: 1000      # 1K hottest documents
-```
-
-### 2. Tune HNSW Parameters
-
-Balance recall vs. speed:
-
-```yaml
-# High recall (slower build, better search)
-hnsw_m: 32
-hnsw_ef_construction: 400
-
-# Balanced (recommended)
-hnsw_m: 16
-hnsw_ef_construction: 200
-
-# Fast build (lower recall)
-hnsw_m: 8
-hnsw_ef_construction: 100
-```
-
-### 3. Monitor Hit Rates
-
-Track cache effectiveness:
-
-```bash
-# Check cache stats
-curl http://localhost:8080/metrics | grep cache
-
-# Optimal hit rates:
-# - Local cache: >80% (hot data)
-# - Milvus cache: >90% (total)
-```
-
-### 4. Adjust Similarity Threshold
-
-```yaml
-# Stricter matching (fewer false positives)
-similarity_threshold: 0.90
-
-# Balanced (recommended)
-similarity_threshold: 0.85
-
-# Looser matching (more cache hits)
-similarity_threshold: 0.80
+semantic_cache_hit_ratio{backend="hybrid"}
 ```
 
-## Troubleshooting
+## Multi-Instance Deployment
 
-### High Memory Usage
-
-**Symptom**: Memory usage exceeds expectations
-
-**Solution**:
-```yaml
-# Reduce HNSW index size
-max_memory_entries: 50000  # Instead of 100000
-
-# Reduce local cache
-local_cache_size: 500      # Instead of 1000
-
-# Use smaller HNSW M
-hnsw_m: 8                  # Instead of 16
-```
-
-### Low Hit Rate
-
-**Symptom**: Cache hit rate < 50%
-
-**Solution**:
-1. Lower similarity threshold
-2. Increase `max_memory_entries`
-3. Check Milvus connectivity
-4. Verify embedding model consistency
-
-### Slow Queries
-
-**Symptom**: Queries taking > 10ms
-
-**Solution**:
-1. Check Milvus network latency
-2. Increase `local_cache_size` for hot data
-3. Verify HNSW index health
-4. Monitor Milvus load
-
-## Migration Guide
-
-### From Memory Backend
-
-```yaml
-# Before
-semantic_cache:
-  backend_type: "memory"
-  max_entries: 10000
-
-# After
-semantic_cache:
-  backend_type: "hybrid"
-  max_memory_entries: 10000  # Keep same HNSW size
-  local_cache_size: 1000
-  backend_config_path: "config/milvus.yaml"
-```
-
-### From Milvus Backend
-
-```yaml
-# Before
-semantic_cache:
-  backend_type: "milvus"
-  backend_config_path: "config/milvus.yaml"
-
-# After
-semantic_cache:
-  backend_type: "hybrid"
-  max_memory_entries: 100000  # Add HNSW layer
-  local_cache_size: 1000      # Add local cache
-  backend_config_path: "config/milvus.yaml"  # Keep Milvus
-```
-
-## Advanced Topics
-
-### Custom Eviction Strategy
-
-Currently uses FIFO. Future versions may support:
-- LRU (Least Recently Used)
-- LFU (Least Frequently Used)
-- TTL-based eviction
-
-### Multi-Instance Deployment
-
-The hybrid cache is designed for multi-instance deployments:
+The hybrid cache supports multi-instance deployments where each instance maintains its own HNSW index and local cache, but shares Milvus for persistence and data consistency:
 
 ```
 ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
@@ -405,12 +206,9 @@ The hybrid cache is designed for multi-instance deployments:
                   └─────────────┘
 ```
 
-Each instance maintains its own HNSW index and local cache, but shares Milvus for persistence and data consistency.
-
 ## See Also
 
 - [In-Memory Cache Documentation](./in-memory-cache.md)
 - [Milvus Cache Documentation](./milvus-cache.md)
 - [HNSW Implementation Details](../../HNSW_IMPLEMENTATION_SUMMARY.md)
 - [Research Paper: Hybrid Architecture](../../papers/hybrid_hnsw_storage_architecture.md)
-
diff --git a/website/docs/tutorials/semantic-cache/in-memory-cache.md b/website/docs/tutorials/semantic-cache/in-memory-cache.md
index 1f991a2b..d4d211cc 100644
--- a/website/docs/tutorials/semantic-cache/in-memory-cache.md
+++ b/website/docs/tutorials/semantic-cache/in-memory-cache.md
@@ -1,15 +1,10 @@
 # In-Memory Semantic Cache
 
-The in-memory cache backend provides fast, local caching for development environments and single-instance deployments. It stores semantic embeddings and cached responses directly in memory for maximum performance.
+The in-memory cache backend stores semantic embeddings and cached responses directly in memory for fast local caching.
 
 ## Overview
 
-The in-memory cache is ideal for:
-
-- **Development and testing** environments
-- **Single-instance** deployments
-- **Quick prototyping** and experimentation
-- **Low-latency** requirements where external dependencies should be minimized
+The in-memory cache stores all cache data in the application's memory, providing low-latency access without external dependencies.
 
 ## Architecture
 
@@ -35,6 +30,30 @@ graph TB
     style K fill:#87CEEB
 ```
 
+## How It Works
+
+### Write Path
+When caching a response:
+
+1. Generate embedding for the query using the configured embedding model
+2. Store the embedding and response in memory
+3. Apply TTL if configured
+4. Evict oldest/least-used entries if max_entries limit is reached
+
+### Read Path
+When searching for a cached response:
+
+1. Generate embedding for the incoming query
+2. Search in-memory cache for similar embeddings
+3. If similarity exceeds threshold, return cached response (cache hit)
+4. Otherwise, forward to LLM and cache the new response (cache miss)
+
+### Search Methods
+The cache supports two search methods:
+
+- **Linear Search**: Compares query embedding against all cached embeddings
+- **HNSW Index**: Uses hierarchical graph structure for faster approximate nearest neighbor search
+
 ## Configuration
 
 ### Basic Configuration
@@ -48,7 +67,19 @@ semantic_cache:
   max_entries: 1000
   ttl_seconds: 3600
   eviction_policy: "fifo"
-  # Optional: Enable HNSW for faster search with large caches
+```
+
+### Configuration with HNSW
+
+```yaml
+semantic_cache:
+  enabled: true
+  backend_type: "memory"
+  similarity_threshold: 0.8
+  max_entries: 1000
+  ttl_seconds: 3600
+  eviction_policy: "fifo"
+  # HNSW index for faster search
   use_hnsw: true
   hnsw_m: 16
   hnsw_ef_construction: 200
@@ -103,11 +134,11 @@ categories:
 | `max_entries` | integer | `1000` | Maximum number of cached entries |
 | `ttl_seconds` | integer | `3600` | Time-to-live for cache entries (seconds, 0 = no expiration) |
 | `eviction_policy` | string | `"fifo"` | Eviction policy: `"fifo"`, `"lru"`, `"lfu"` |
-| `use_hnsw` | boolean | `false` | Enable HNSW index for faster similarity search |
+| `use_hnsw` | boolean | `false` | Enable HNSW index for similarity search |
 | `hnsw_m` | integer | `16` | HNSW M parameter (bi-directional links per node) |
 | `hnsw_ef_construction` | integer | `200` | HNSW efConstruction parameter (build quality) |
 
-### HNSW Index for Accelerated Search
+### HNSW Parameters
 
 The in-memory cache supports HNSW (Hierarchical Navigable Small World) indexing for significantly faster similarity search, especially beneficial with large cache sizes.
 
@@ -134,12 +165,12 @@ semantic_cache:
 
 #### HNSW Parameters
 
-- **`hnsw_m`**: Number of bi-directional links created for each node
+- **`hnsw_m`**: Number of bi-directional links created for each node in the graph
   - Lower values (8-12): Faster build, less memory, lower recall
   - Default (16): Balanced performance
   - Higher values (32-64): Better recall, more memory, slower build
 
-- **`hnsw_ef_construction`**: Size of dynamic candidate list during construction
+- **`hnsw_ef_construction`**: Size of dynamic candidate list during index construction
   - Lower values (100-150): Faster index building
   - Default (200): Good balance
   - Higher values (400-800): Better quality, slower build
@@ -196,7 +227,7 @@ semantic_cache:
 
 ## Setup and Testing
 
-### 1. Enable In-Memory Cache
+### Enable In-Memory Cache
 
 Update your configuration file:
 
@@ -212,7 +243,7 @@ semantic_cache:
 EOF
 ```
 
-### 2. Start the Router
+### Start the Router
 
 ```bash
 # Start the semantic router
@@ -222,9 +253,9 @@ make run-router
 ./bin/router --config config/config.yaml
 ```
 
-### 3. Test Cache Functionality
+### Test Cache Functionality
 
-Send identical requests to verify cache hits:
+Send requests to verify cache behavior:
 
 ```bash
 # First request (cache miss)
@@ -252,35 +283,33 @@ curl -X POST http://localhost:8080/v1/chat/completions \
   }'
 ```
 
-### Advantages
+## Characteristics
+
+### Storage
 
-- **Ultra-low latency**: Direct memory access, no network overhead
-- **Simple setup**: No external dependencies required
-- **High throughput**: Can handle thousands of cache operations per second
-- **Immediate availability**: Cache is ready as soon as the router starts
-- **HNSW acceleration**: Optional HNSW indexing for fast similarity search at scale
-- **Flexible eviction**: Multiple eviction policies (FIFO, LRU, LFU) to suit workload
+- Data is stored in application memory
+- Cache is cleared when the application restarts
+- Limited by available system memory
 
-### Limitations
+### Access Pattern
 
-- **Volatile storage**: Cache is lost when the router restarts
-- **Single instance**: Cannot be shared across multiple router instances
-- **Memory constraints**: Limited by available system memory
-- **No persistence**: No data recovery after crashes
+- Direct memory access without network overhead
+- No external dependencies required
 
-## Memory Management
+### Eviction Policies
 
-### Automatic Cleanup
+- **FIFO**: First In, First Out - removes oldest entries
+- **LRU**: Least Recently Used - removes least recently accessed entries
+- **LFU**: Least Frequently Used - removes least frequently accessed entries
 
-The in-memory cache automatically manages memory through:
+### TTL Management
 
-1. **TTL Expiration**: Entries are removed after `ttl_seconds`
-2. **LRU Eviction**: Least recently used entries are removed when `max_entries` is reached
-3. **Periodic Cleanup**: Expired entries are cleaned every `cleanup_interval_seconds`
-4. **Memory Pressure**: Aggressive cleanup when approaching `memory_limit_mb`
+- Entries can have a time-to-live (TTL)
+- Expired entries are removed during cleanup operations
 
 ## Next Steps
 
-- **[Milvus Cache](./milvus-cache.md)** - Set up persistent, distributed caching
+- **[Hybrid Cache](./hybrid-cache.md)** - Learn about HNSW + Milvus hybrid caching
+- **[Milvus Cache](./milvus-cache.md)** - Learn about persistent vector database caching
 - **[Cache Overview](./overview.md)** - Learn about semantic caching concepts
 - **[Observability](../observability/overview.md)** - Monitor cache performance

From 91012e0eb91331d91fa8d88ba52aa9774fa8e80b Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:15:16 +0000
Subject: [PATCH 05/13] precommit fix

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .../pkg/cache/comprehensive_benchmark_test.go | 21 ++++----
 .../pkg/cache/hybrid_cache_test.go            | 24 +++++----
 .../pkg/cache/large_scale_benchmark_test.go   | 50 +++++++++++++------
 src/semantic-router/pkg/cache/milvus_cache.go |  5 +-
 .../tutorials/semantic-cache/hybrid-cache.md  |  4 +-
 5 files changed, 65 insertions(+), 39 deletions(-)

diff --git a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
index a2a82fc9..009d55f3 100644
--- a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
@@ -33,7 +33,7 @@ func (c ContentLength) String() string {
 // GenerateQuery generates a query with maximum semantic diversity using hash-based randomization
 func generateQuery(length ContentLength, index int) string {
 	// Hash the index to get pseudo-random values (deterministic but well-distributed)
-	hash := uint64(index)
+	hash := uint64(index)               // #nosec G115 -- index is always positive and bounded
 	hash = hash*2654435761 + 1013904223 // Knuth's multiplicative hash
 
 	// Expanded templates for maximum diversity
@@ -119,16 +119,16 @@ func generateQuery(length ContentLength, index int) string {
 	}
 
 	// Use hash to pseudo-randomly select (but deterministic for same index)
-	templateIdx := int(hash % uint64(len(templates)))
-	hash = hash * 16807 % 2147483647 // LCG for next random
+	templateIdx := int(hash % uint64(len(templates))) // #nosec G115 -- modulo operation is bounded by array length
+	hash = hash * 16807 % 2147483647                  // LCG for next random
 
-	topic1Idx := int(hash % uint64(len(topics)))
+	topic1Idx := int(hash % uint64(len(topics))) // #nosec G115 -- modulo operation is bounded by array length
 	hash = hash * 16807 % 2147483647
 
-	topic2Idx := int(hash % uint64(len(topics)))
+	topic2Idx := int(hash % uint64(len(topics))) // #nosec G115 -- modulo operation is bounded by array length
 	hash = hash * 16807 % 2147483647
 
-	topic3Idx := int(hash % uint64(len(topics)))
+	topic3Idx := int(hash % uint64(len(topics))) // #nosec G115 -- modulo operation is bounded by array length
 	hash = hash * 16807 % 2147483647
 
 	// Build query with selected template and topics
@@ -136,7 +136,7 @@ func generateQuery(length ContentLength, index int) string {
 		topics[topic1Idx],
 		topics[topic2Idx],
 		topics[topic3Idx],
-		modifiers[int(hash%uint64(len(modifiers)))])
+		modifiers[int(hash%uint64(len(modifiers)))]) // #nosec G115 -- modulo operation is bounded by array length
 
 	// Add unique identifier to guarantee uniqueness
 	query += fmt.Sprintf(" [Request ID: REQ-%d]", index)
@@ -144,10 +144,10 @@ func generateQuery(length ContentLength, index int) string {
 	// Add extra context for longer queries
 	if length > MediumContent {
 		hash = hash * 16807 % 2147483647
-		extraTopicIdx := int(hash % uint64(len(topics)))
+		extraTopicIdx := int(hash % uint64(len(topics))) // #nosec G115 -- modulo operation is bounded by array length
 		query += fmt.Sprintf(" Also considering %s integration and %s compatibility requirements.",
 			topics[extraTopicIdx],
-			modifiers[int(hash%uint64(len(modifiers)))])
+			modifiers[int(hash%uint64(len(modifiers)))]) // #nosec G115 -- modulo operation is bounded by array length
 	}
 
 	return query
@@ -182,7 +182,8 @@ func BenchmarkComprehensive(b *testing.B) {
 	}
 
 	// Open CSV file for results
-	csvFile, err := os.OpenFile("../../benchmark_results/benchmark_data.csv",
+	csvFile, err := os.OpenFile(
+		"../../benchmark_results/benchmark_data.csv",
 		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
 	if err != nil {
 		b.Logf("Warning: Could not open CSV file: %v", err)
diff --git a/src/semantic-router/pkg/cache/hybrid_cache_test.go b/src/semantic-router/pkg/cache/hybrid_cache_test.go
index 38ae188e..00cb750a 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache_test.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache_test.go
@@ -58,7 +58,8 @@ milvus:
   params:
     M: 16
     efConstruction: 200
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -112,7 +113,7 @@ milvus:
 	}
 
 	// Test FindSimilar with similar query (should hit)
-	response, found, err = cache.FindSimilar("gpt-4", "What's the meaning of life?")
+	_, found, err = cache.FindSimilar("gpt-4", "What's the meaning of life?")
 	if err != nil {
 		t.Fatalf("FindSimilar failed: %v", err)
 	}
@@ -154,7 +155,8 @@ milvus:
   dimension: 384
   index_type: "HNSW"
   metric_type: "IP"
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -217,7 +219,8 @@ milvus:
   dimension: 384
   index_type: "HNSW"
   metric_type: "IP"
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -264,7 +267,7 @@ milvus:
 	}
 
 	// Try to find an old evicted entry (should be in Milvus)
-	_, found, err = cache.FindSimilar("gpt-4", "Query number 0")
+	_, _, err = cache.FindSimilar("gpt-4", "Query number 0")
 	if err != nil {
 		t.Fatalf("FindSimilar failed: %v", err)
 	}
@@ -287,7 +290,8 @@ milvus:
   dimension: 384
   index_type: "HNSW"
   metric_type: "IP"
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -316,7 +320,7 @@ milvus:
 	time.Sleep(100 * time.Millisecond)
 
 	// First search - should populate local cache
-	response, found, err := cache.FindSimilar("gpt-4", testQuery)
+	_, found, err := cache.FindSimilar("gpt-4", testQuery)
 	if err != nil {
 		t.Fatalf("FindSimilar failed: %v", err)
 	}
@@ -363,7 +367,8 @@ milvus:
   dimension: 384
   index_type: "HNSW"
   metric_type: "IP"
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		b.Fatalf("Failed to create test config: %v", err)
 	}
@@ -406,7 +411,8 @@ milvus:
   dimension: 384
   index_type: "HNSW"
   metric_type: "IP"
-`), 0644)
+`),
+		0644)
 	if err != nil {
 		b.Fatalf("Failed to create test config: %v", err)
 	}
diff --git a/src/semantic-router/pkg/cache/large_scale_benchmark_test.go b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
index 81e69129..6f3a00b4 100644
--- a/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
@@ -42,7 +42,9 @@ func BenchmarkLargeScale(b *testing.B) {
 	// Open CSV file for results
 	// Create benchmark_results directory if it doesn't exist
 	resultsDir := "../../benchmark_results"
-	os.MkdirAll(resultsDir, 0755)
+	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+		b.Logf("Warning: Could not create results directory: %v", err)
+	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/large_scale_benchmark.csv",
 		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
@@ -274,7 +276,9 @@ func BenchmarkScalability(b *testing.B) {
 
 	// CSV output
 	resultsDir := "../../benchmark_results"
-	os.MkdirAll(resultsDir, 0755)
+	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+		b.Logf("Warning: Could not create results directory: %v", err)
+	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/scalability_benchmark.csv",
 		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
@@ -313,14 +317,18 @@ func BenchmarkScalability(b *testing.B) {
 					})
 
 					for i := 0; i < cacheSize; i++ {
-						cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
-							testQueries[i], []byte("req"), []byte("resp"))
+						if err := cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+							testQueries[i], []byte("req"), []byte("resp")); err != nil {
+							b.Fatalf("AddEntry failed: %v", err)
+						}
 					}
 
 					b.ResetTimer()
 					start := time.Now()
 					for i := 0; i < b.N; i++ {
-						cache.FindSimilar("model", searchQuery)
+						if _, _, err := cache.FindSimilar("model", searchQuery); err != nil {
+							b.Fatalf("FindSimilar failed: %v", err)
+						}
 					}
 					elapsed := time.Since(start)
 
@@ -331,7 +339,9 @@ func BenchmarkScalability(b *testing.B) {
 					if csvFile != nil {
 						line := fmt.Sprintf("%d,linear,%.0f,%.3f,%.0f\n",
 							cacheSize, avgLatency, latencyMS, opsPerSec)
-						csvFile.WriteString(line)
+						if _, err := csvFile.WriteString(line); err != nil {
+							b.Logf("Warning: failed to write to CSV: %v", err)
+						}
 					}
 
 					b.ReportMetric(latencyMS, "ms/op")
@@ -351,8 +361,10 @@ func BenchmarkScalability(b *testing.B) {
 
 				buildStart := time.Now()
 				for i := 0; i < cacheSize; i++ {
-					cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
-						testQueries[i], []byte("req"), []byte("resp"))
+					if err := cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+						testQueries[i], []byte("req"), []byte("resp")); err != nil {
+						b.Fatalf("AddEntry failed: %v", err)
+					}
 					if (i+1)%10000 == 0 {
 						b.Logf("  Built %d/%d entries", i+1, cacheSize)
 					}
@@ -362,7 +374,9 @@ func BenchmarkScalability(b *testing.B) {
 				b.ResetTimer()
 				start := time.Now()
 				for i := 0; i < b.N; i++ {
-					cache.FindSimilar("model", searchQuery)
+					if _, _, err := cache.FindSimilar("model", searchQuery); err != nil {
+						b.Fatalf("FindSimilar failed: %v", err)
+					}
 				}
 				elapsed := time.Since(start)
 
@@ -373,7 +387,9 @@ func BenchmarkScalability(b *testing.B) {
 				if csvFile != nil {
 					line := fmt.Sprintf("%d,hnsw,%.0f,%.3f,%.0f\n",
 						cacheSize, avgLatency, latencyMS, opsPerSec)
-					csvFile.WriteString(line)
+					if _, err := csvFile.WriteString(line); err != nil {
+						b.Logf("Warning: failed to write to CSV: %v", err)
+					}
 				}
 
 				b.ReportMetric(latencyMS, "ms/op")
@@ -430,7 +446,9 @@ func BenchmarkHNSWParameterSweep(b *testing.B) {
 
 	// CSV output
 	resultsDir := "../../benchmark_results"
-	os.MkdirAll(resultsDir, 0755)
+	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+		b.Logf("Warning: Could not create results directory: %v", err)
+	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/hnsw_parameter_sweep.csv",
 		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
@@ -463,8 +481,10 @@ func BenchmarkHNSWParameterSweep(b *testing.B) {
 			b.Logf("Building HNSW index: M=%d, efConstruction=200, efSearch=%d", config.m, config.efSearch)
 			buildStart := time.Now()
 			for i := 0; i < cacheSize; i++ {
-				cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
-					testQueries[i], []byte("req"), []byte("resp"))
+				if err := cache.AddEntry(fmt.Sprintf("req-%d", i), "model",
+					testQueries[i], []byte("req"), []byte("resp")); err != nil {
+					b.Fatalf("AddEntry failed: %v", err)
+				}
 				if (i+1)%10000 == 0 {
 					b.Logf("  Progress: %d/%d", i+1, cacheSize)
 				}
@@ -484,7 +504,9 @@ func BenchmarkHNSWParameterSweep(b *testing.B) {
 			b.ResetTimer()
 			start := time.Now()
 			for i := 0; i < b.N; i++ {
-				cache.FindSimilar("model", searchQuery)
+				if _, _, err := cache.FindSimilar("model", searchQuery); err != nil {
+					b.Fatalf("FindSimilar failed: %v", err)
+				}
 			}
 			elapsed := time.Since(start)
 
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
index 68792ab4..f8b57d73 100644
--- a/src/semantic-router/pkg/cache/milvus_cache.go
+++ b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -207,7 +207,7 @@ func loadMilvusConfig(configPath string) (*MilvusConfig, error) {
 
 	// WORKAROUND: Force development settings for benchmarks
 	// There seems to be a YAML parsing issue with sigs.k8s.io/yaml
-	if config.Development.AutoCreateCollection == false && config.Development.DropCollectionOnStartup == false {
+	if !config.Development.AutoCreateCollection && !config.Development.DropCollectionOnStartup {
 		fmt.Printf("[WARN] Development settings parsed as false, forcing to true for benchmarks\n")
 		config.Development.AutoCreateCollection = true
 		config.Development.DropCollectionOnStartup = true
@@ -773,7 +773,6 @@ func (c *MilvusCache) GetAllEntries(ctx context.Context) ([]string, [][]float32,
 		"response_body != \"\"", // Only get complete entries
 		[]string{"request_id", c.config.Collection.VectorField.Name}, // Get IDs and embeddings
 	)
-
 	if err != nil {
 		observability.Warnf("MilvusCache.GetAllEntries: query failed: %v", err)
 		return nil, nil, fmt.Errorf("milvus query all failed: %w", err)
@@ -884,7 +883,7 @@ func (c *MilvusCache) GetByID(ctx context.Context, requestID string) ([]byte, er
 
 	responseBody := []byte(responseBodyStr)
 
-	if responseBody == nil || len(responseBody) == 0 {
+	if len(responseBody) == 0 {
 		observability.Debugf("MilvusCache.GetByID: response_body is empty")
 		metrics.RecordCacheOperation("milvus", "get_by_id", "miss", time.Since(start).Seconds())
 		return nil, fmt.Errorf("response_body is empty for: %s", requestID)
diff --git a/website/docs/tutorials/semantic-cache/hybrid-cache.md b/website/docs/tutorials/semantic-cache/hybrid-cache.md
index d5d63fc8..5bcf0e80 100644
--- a/website/docs/tutorials/semantic-cache/hybrid-cache.md
+++ b/website/docs/tutorials/semantic-cache/hybrid-cache.md
@@ -209,6 +209,4 @@ The hybrid cache supports multi-instance deployments where each instance maintai
 ## See Also
 
 - [In-Memory Cache Documentation](./in-memory-cache.md)
-- [Milvus Cache Documentation](./milvus-cache.md)
-- [HNSW Implementation Details](../../HNSW_IMPLEMENTATION_SUMMARY.md)
-- [Research Paper: Hybrid Architecture](../../papers/hybrid_hnsw_storage_architecture.md)
+- [Milvus Cache Documentation](./milvus-cache.md)
\ No newline at end of file

From 4677c46d0430b44a206773aadf27935df9c0652e Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:24:52 +0000
Subject: [PATCH 06/13] fix precommit

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .pre-commit-config.yaml                            | 2 +-
 src/semantic-router/pkg/cache/hybrid_cache_test.go | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a95280e0..828d6308 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -81,7 +81,7 @@ repos:
         pass_filenames: false
       - id: cargo-check
         name: cargo check
-        entry: bash -c 'cd candle-binding && cargo check'
+        entry: bash -c 'cd candle-binding && cargo check --no-default-features'
         language: system
         files: \.rs$
         pass_filenames: false
diff --git a/src/semantic-router/pkg/cache/hybrid_cache_test.go b/src/semantic-router/pkg/cache/hybrid_cache_test.go
index 00cb750a..52d70504 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache_test.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache_test.go
@@ -330,7 +330,7 @@ milvus:
 
 	// Second search - should hit local cache (much faster)
 	startTime := time.Now()
-	response, found, err = cache.FindSimilar("gpt-4", testQuery)
+	response, found, err := cache.FindSimilar("gpt-4", testQuery)
 	localLatency := time.Since(startTime)
 	if err != nil {
 		t.Fatalf("FindSimilar failed: %v", err)

From 1349768d3564ff591616bc49432333fcc07be5ef Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:34:10 +0000
Subject: [PATCH 07/13] fix precommit

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 website/docs/tutorials/semantic-cache/hybrid-cache.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/tutorials/semantic-cache/hybrid-cache.md b/website/docs/tutorials/semantic-cache/hybrid-cache.md
index 5bcf0e80..a55aae34 100644
--- a/website/docs/tutorials/semantic-cache/hybrid-cache.md
+++ b/website/docs/tutorials/semantic-cache/hybrid-cache.md
@@ -209,4 +209,4 @@ The hybrid cache supports multi-instance deployments where each instance maintai
 ## See Also
 
 - [In-Memory Cache Documentation](./in-memory-cache.md)
-- [Milvus Cache Documentation](./milvus-cache.md)
\ No newline at end of file
+- [Milvus Cache Documentation](./milvus-cache.md)

From 09ef22f71d3ee4e557a2e536abc52ad84d0ded70 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:37:59 +0000
Subject: [PATCH 08/13] fix precommit

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .../pkg/cache/comprehensive_benchmark_test.go       |  3 ++-
 src/semantic-router/pkg/cache/hybrid_cache_test.go  | 13 ++++++-------
 .../pkg/cache/large_scale_benchmark_test.go         | 13 +++++++------
 src/semantic-router/pkg/cache/milvus_cache.go       |  1 -
 4 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
index 009d55f3..92726bba 100644
--- a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
@@ -184,7 +184,8 @@ func BenchmarkComprehensive(b *testing.B) {
 	// Open CSV file for results
 	csvFile, err := os.OpenFile(
 		"../../benchmark_results/benchmark_data.csv",
-		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY,
+		0o644)
 	if err != nil {
 		b.Logf("Warning: Could not open CSV file: %v", err)
 	} else {
diff --git a/src/semantic-router/pkg/cache/hybrid_cache_test.go b/src/semantic-router/pkg/cache/hybrid_cache_test.go
index 52d70504..00f8ac87 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache_test.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache_test.go
@@ -58,8 +58,7 @@ milvus:
   params:
     M: 16
     efConstruction: 200
-`),
-		0644)
+`), 0o644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -156,7 +155,7 @@ milvus:
   index_type: "HNSW"
   metric_type: "IP"
 `),
-		0644)
+		0o644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -220,7 +219,7 @@ milvus:
   index_type: "HNSW"
   metric_type: "IP"
 `),
-		0644)
+		0o644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -291,7 +290,7 @@ milvus:
   index_type: "HNSW"
   metric_type: "IP"
 `),
-		0644)
+		0o644)
 	if err != nil {
 		t.Fatalf("Failed to create test config: %v", err)
 	}
@@ -368,7 +367,7 @@ milvus:
   index_type: "HNSW"
   metric_type: "IP"
 `),
-		0644)
+		0o644)
 	if err != nil {
 		b.Fatalf("Failed to create test config: %v", err)
 	}
@@ -412,7 +411,7 @@ milvus:
   index_type: "HNSW"
   metric_type: "IP"
 `),
-		0644)
+		0o644)
 	if err != nil {
 		b.Fatalf("Failed to create test config: %v", err)
 	}
diff --git a/src/semantic-router/pkg/cache/large_scale_benchmark_test.go b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
index 6f3a00b4..4a981ba4 100644
--- a/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/large_scale_benchmark_test.go
@@ -42,12 +42,13 @@ func BenchmarkLargeScale(b *testing.B) {
 	// Open CSV file for results
 	// Create benchmark_results directory if it doesn't exist
 	resultsDir := "../../benchmark_results"
-	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+	if err := os.MkdirAll(resultsDir, 0o755); err != nil {
 		b.Logf("Warning: Could not create results directory: %v", err)
 	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/large_scale_benchmark.csv",
-		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY,
+		0o644)
 	if err != nil {
 		b.Logf("Warning: Could not open CSV file: %v", err)
 	} else {
@@ -276,12 +277,12 @@ func BenchmarkScalability(b *testing.B) {
 
 	// CSV output
 	resultsDir := "../../benchmark_results"
-	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+	if err := os.MkdirAll(resultsDir, 0o755); err != nil {
 		b.Logf("Warning: Could not create results directory: %v", err)
 	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/scalability_benchmark.csv",
-		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
 	if err != nil {
 		b.Logf("Warning: Could not open CSV file: %v", err)
 	} else {
@@ -446,12 +447,12 @@ func BenchmarkHNSWParameterSweep(b *testing.B) {
 
 	// CSV output
 	resultsDir := "../../benchmark_results"
-	if err := os.MkdirAll(resultsDir, 0755); err != nil {
+	if err := os.MkdirAll(resultsDir, 0o755); err != nil {
 		b.Logf("Warning: Could not create results directory: %v", err)
 	}
 
 	csvFile, err := os.OpenFile(resultsDir+"/hnsw_parameter_sweep.csv",
-		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
+		os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0o644)
 	if err != nil {
 		b.Logf("Warning: Could not open CSV file: %v", err)
 	} else {
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
index f8b57d73..0ac6d198 100644
--- a/src/semantic-router/pkg/cache/milvus_cache.go
+++ b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -846,7 +846,6 @@ func (c *MilvusCache) GetByID(ctx context.Context, requestID string) ([]byte, er
 		fmt.Sprintf("request_id == \"%s\"", requestID),
 		[]string{"response_body"}, // Only fetch document, not embedding!
 	)
-
 	if err != nil {
 		observability.Debugf("MilvusCache.GetByID: query failed: %v", err)
 		metrics.RecordCacheOperation("milvus", "get_by_id", "error", time.Since(start).Seconds())

From 1fb7ca67b189f3505ae3386e8c049cb81f21cb61 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Tue, 21 Oct 2025 23:51:58 +0000
Subject: [PATCH 09/13] disable cuda build on ci

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .github/workflows/pre-commit.yml     |  2 ++
 .github/workflows/test-and-build.yml |  1 +
 tools/make/build-run-test.mk         |  4 ++--
 tools/make/rust.mk                   | 16 ++++++++--------
 4 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 24aca521..0926cc8e 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -97,6 +97,8 @@ jobs:
 
     - name: Run pre-commit check
       run: make precommit-check
+      env:
+        CI: true
 
     - name: Show pre-commit results
       if: failure()
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
index 3adf0ff5..c14a9b4d 100644
--- a/.github/workflows/test-and-build.yml
+++ b/.github/workflows/test-and-build.yml
@@ -86,6 +86,7 @@ jobs:
       - name: Run semantic router tests
         run: make test
         env:
+          CI: true
           CGO_ENABLED: 1
           LD_LIBRARY_PATH: ${{ github.workspace }}/candle-binding/target/release
 
diff --git a/tools/make/build-run-test.mk b/tools/make/build-run-test.mk
index 36c26498..65a330ea 100644
--- a/tools/make/build-run-test.mk
+++ b/tools/make/build-run-test.mk
@@ -8,9 +8,9 @@
 build: ## Build the Rust library and Golang binding
 build: rust build-router
 
-# Build router
+# Build router (conditionally use rust-ci in CI environments)
 build-router: ## Build the router binary
-build-router: rust
+build-router: $(if $(CI),rust-ci,rust)
 	@$(LOG_TARGET)
 	@mkdir -p bin
 	@cd src/semantic-router && go build --tags=milvus -o ../../bin/router cmd/main.go
diff --git a/tools/make/rust.mk b/tools/make/rust.mk
index d92f33ce..47c4ed9b 100644
--- a/tools/make/rust.mk
+++ b/tools/make/rust.mk
@@ -4,26 +4,26 @@
 
 ##@ Rust
 
-# Test the Rust library
-test-binding: rust ## Run Go tests with the Rust static library
+# Test the Rust library (conditionally use rust-ci in CI environments)
+test-binding: $(if $(CI),rust-ci,rust) ## Run Go tests with the Rust static library
 	@$(LOG_TARGET)
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
 		cd candle-binding && CGO_ENABLED=1 go test -v -race
 
-# Test with the candle-binding library
-test-category-classifier: rust ## Test domain classifier with candle-binding
+# Test with the candle-binding library (conditionally use rust-ci in CI environments)
+test-category-classifier: $(if $(CI),rust-ci,rust) ## Test domain classifier with candle-binding
 	@$(LOG_TARGET)
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
 		cd src/training/classifier_model_fine_tuning && CGO_ENABLED=1 go run test_linear_classifier.go
 
-# Test the PII classifier
-test-pii-classifier: rust ## Test PII classifier with candle-binding
+# Test the PII classifier (conditionally use rust-ci in CI environments)
+test-pii-classifier: $(if $(CI),rust-ci,rust) ## Test PII classifier with candle-binding
 	@$(LOG_TARGET)
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
 		cd src/training/pii_model_fine_tuning && CGO_ENABLED=1 go run pii_classifier_verifier.go
 
-# Test the jailbreak classifier
-test-jailbreak-classifier: rust ## Test jailbreak classifier with candle-binding
+# Test the jailbreak classifier (conditionally use rust-ci in CI environments)
+test-jailbreak-classifier: $(if $(CI),rust-ci,rust) ## Test jailbreak classifier with candle-binding
 	@$(LOG_TARGET)
 	@export LD_LIBRARY_PATH=${PWD}/candle-binding/target/release && \
 		cd src/training/prompt_guard_fine_tuning && CGO_ENABLED=1 go run jailbreak_classifier_verifier.go

From eaafb576b4927b0a52a3023d64178f2227807ad0 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Wed, 22 Oct 2025 19:15:25 +0000
Subject: [PATCH 10/13] review feedback

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 .../pkg/cache/comprehensive_benchmark_test.go |  4 +--
 src/semantic-router/pkg/cache/hybrid_cache.go | 12 +++++++--
 .../cache/hybrid_vs_milvus_benchmark_test.go  | 25 ++++++++++++-------
 3 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
index 92726bba..9d6d0adb 100644
--- a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
@@ -33,8 +33,8 @@ func (c ContentLength) String() string {
 // GenerateQuery generates a query with maximum semantic diversity using hash-based randomization
 func generateQuery(length ContentLength, index int) string {
 	// Hash the index to get pseudo-random values (deterministic but well-distributed)
-	hash := uint64(index)               // #nosec G115 -- index is always positive and bounded
-	hash = hash*2654435761 + 1013904223 // Knuth's multiplicative hash
+	hash := uint64(index)    // #nosec G115 -- index is always positive and bounded
+	hash = hash * 2654435761 // Knuth's multiplicative hash
 
 	// Expanded templates for maximum diversity
 	templates := []string{
diff --git a/src/semantic-router/pkg/cache/hybrid_cache.go b/src/semantic-router/pkg/cache/hybrid_cache.go
index b4a5a661..19325d50 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache.go
@@ -15,6 +15,14 @@ import (
 	"github.com/vllm-project/semantic-router/src/semantic-router/pkg/observability"
 )
 
+const (
+	// Buffer pool limits to prevent memory bloat
+	maxVisitedMapSize     = 1000 // Maximum size for visited map before discarding buffer
+	maxCandidatesCapacity = 200  // Maximum capacity for candidates heap before discarding buffer
+	maxResultsCapacity    = 200  // Maximum capacity for results heap before discarding buffer
+	maxHNSWLayers         = 16   // Maximum number of layers in HNSW index
+)
+
 // searchBuffers holds reusable buffers for HNSW search to reduce GC pressure
 type searchBuffers struct {
 	visited    map[int]bool
@@ -48,7 +56,7 @@ func getSearchBuffers() *searchBuffers {
 // putSearchBuffers returns buffers to pool
 func putSearchBuffers(buf *searchBuffers) {
 	// Don't return to pool if buffers grew too large (avoid memory bloat)
-	if len(buf.visited) > 1000 || cap(buf.candidates.data) > 200 || cap(buf.results.data) > 200 {
+	if len(buf.visited) > maxVisitedMapSize || cap(buf.candidates.data) > maxCandidatesCapacity || cap(buf.results.data) > maxResultsCapacity {
 		return
 	}
 	searchBufferPool.Put(buf)
@@ -782,7 +790,7 @@ func (h *HybridCache) selectLevelHybrid() int {
 	// Use exponential decay to select level
 	// Most nodes at layer 0, fewer at higher layers
 	level := 0
-	for level < 16 { // Max 16 layers
+	for level < maxHNSWLayers {
 		if randFloat() > h.hnswIndex.ml {
 			break
 		}
diff --git a/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go b/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
index 629e8900..e2fc4609 100644
--- a/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/hybrid_vs_milvus_benchmark_test.go
@@ -92,14 +92,15 @@ func (dcc *DatabaseCallCounter) Reset() {
 
 // getMilvusConfigPath returns the path to milvus.yaml config file
 func getMilvusConfigPath() string {
-	// Try absolute path first (for direct test execution)
-	configPath := "/home/ubuntu/rootfs/back/semantic-router.bak/config/cache/milvus.yaml"
-	if _, err := os.Stat(configPath); err == nil {
-		return configPath
+	// Check for environment variable first
+	if envPath := os.Getenv("MILVUS_CONFIG_PATH"); envPath != "" {
+		if _, err := os.Stat(envPath); err == nil {
+			return envPath
+		}
 	}
 
 	// Try relative from project root (when run via make)
-	configPath = "config/cache/milvus.yaml"
+	configPath := "config/cache/milvus.yaml"
 	if _, err := os.Stat(configPath); err == nil {
 		return configPath
 	}
@@ -126,10 +127,16 @@ func BenchmarkHybridVsMilvus(b *testing.B) {
 	}
 
 	// CSV output file - save to project benchmark_results directory
-	// Determine project root by walking up from test directory
-	projectRoot := "/home/ubuntu/rootfs/back/semantic-router.bak"
-	if envRoot := os.Getenv("PROJECT_ROOT"); envRoot != "" {
-		projectRoot = envRoot
+	// Use PROJECT_ROOT environment variable, fallback to working directory
+	projectRoot := os.Getenv("PROJECT_ROOT")
+	if projectRoot == "" {
+		// If not set, use current working directory
+		var err error
+		projectRoot, err = os.Getwd()
+		if err != nil {
+			b.Logf("Warning: Could not determine working directory: %v", err)
+			projectRoot = "."
+		}
 	}
 	resultsDir := filepath.Join(projectRoot, "benchmark_results", "hybrid_vs_milvus")
 	os.MkdirAll(resultsDir, 0755)

From c0d7918424fe3064e29b5fb43eee7ea9d9ea3708 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Wed, 22 Oct 2025 19:21:12 +0000
Subject: [PATCH 11/13] review feedback

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/pkg/cache/hybrid_cache.go        |  3 ---
 src/semantic-router/pkg/cache/milvus_cache.go        | 10 +++++++---
 src/semantic-router/pkg/cache/simd_benchmark_test.go |  5 +++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/semantic-router/pkg/cache/hybrid_cache.go b/src/semantic-router/pkg/cache/hybrid_cache.go
index 19325d50..55bc1dd2 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache.go
@@ -580,9 +580,6 @@ func (h *HybridCache) FindSimilar(model string, query string) ([]byte, bool, err
 	metrics.RecordCacheOperation("hybrid", "find_similar", "miss", time.Since(start).Seconds())
 	metrics.RecordCacheMiss()
 
-	// Suppress context error to avoid noise
-	_ = ctx
-
 	return nil, false, nil
 }
 
diff --git a/src/semantic-router/pkg/cache/milvus_cache.go b/src/semantic-router/pkg/cache/milvus_cache.go
index 0ac6d198..e658e86b 100644
--- a/src/semantic-router/pkg/cache/milvus_cache.go
+++ b/src/semantic-router/pkg/cache/milvus_cache.go
@@ -205,10 +205,14 @@ func loadMilvusConfig(configPath string) (*MilvusConfig, error) {
 	fmt.Printf("[DEBUG]   Development.AutoCreateCollection: %v\n", config.Development.AutoCreateCollection)
 	fmt.Printf("[DEBUG]   Development.DropCollectionOnStartup: %v\n", config.Development.DropCollectionOnStartup)
 
-	// WORKAROUND: Force development settings for benchmarks
+	// WORKAROUND: Force development settings for benchmarks/tests only
 	// There seems to be a YAML parsing issue with sigs.k8s.io/yaml
-	if !config.Development.AutoCreateCollection && !config.Development.DropCollectionOnStartup {
-		fmt.Printf("[WARN] Development settings parsed as false, forcing to true for benchmarks\n")
+	// Only apply this workaround if SR_BENCHMARK_MODE or SR_TEST_MODE is set
+	benchmarkMode := os.Getenv("SR_BENCHMARK_MODE")
+	testMode := os.Getenv("SR_TEST_MODE")
+	if (benchmarkMode == "1" || benchmarkMode == "true" || testMode == "1" || testMode == "true") &&
+		!config.Development.AutoCreateCollection && !config.Development.DropCollectionOnStartup {
+		fmt.Printf("[WARN] Development settings parsed as false, forcing to true for benchmarks/tests\n")
 		config.Development.AutoCreateCollection = true
 		config.Development.DropCollectionOnStartup = true
 	}
diff --git a/src/semantic-router/pkg/cache/simd_benchmark_test.go b/src/semantic-router/pkg/cache/simd_benchmark_test.go
index 3c30fa47..06695385 100644
--- a/src/semantic-router/pkg/cache/simd_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/simd_benchmark_test.go
@@ -1,6 +1,7 @@
 package cache
 
 import (
+	"fmt"
 	"math/rand"
 	"testing"
 )
@@ -19,7 +20,7 @@ func BenchmarkDotProduct(b *testing.B) {
 			vec_b[i] = rand.Float32()
 		}
 
-		b.Run("SIMD/"+string(rune(size)), func(b *testing.B) {
+		b.Run(fmt.Sprintf("SIMD/%d", size), func(b *testing.B) {
 			b.ReportAllocs()
 			var sum float32
 			for i := 0; i < b.N; i++ {
@@ -28,7 +29,7 @@ func BenchmarkDotProduct(b *testing.B) {
 			_ = sum
 		})
 
-		b.Run("Scalar/"+string(rune(size)), func(b *testing.B) {
+		b.Run(fmt.Sprintf("Scalar/%d", size), func(b *testing.B) {
 			b.ReportAllocs()
 			var sum float32
 			for i := 0; i < b.N; i++ {

From 973281c536f75f28d241b1ea1aa6742a26aace51 Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Wed, 22 Oct 2025 19:26:34 +0000
Subject: [PATCH 12/13] review feedback

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/pkg/cache/hybrid_cache.go | 123 ++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/src/semantic-router/pkg/cache/hybrid_cache.go b/src/semantic-router/pkg/cache/hybrid_cache.go
index 55bc1dd2..c96b38c2 100644
--- a/src/semantic-router/pkg/cache/hybrid_cache.go
+++ b/src/semantic-router/pkg/cache/hybrid_cache.go
@@ -583,6 +583,129 @@ func (h *HybridCache) FindSimilar(model string, query string) ([]byte, bool, err
 	return nil, false, nil
 }
 
+// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
+func (h *HybridCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
+	start := time.Now()
+
+	if !h.enabled {
+		return nil, false, nil
+	}
+
+	queryPreview := query
+	if len(query) > 50 {
+		queryPreview = query[:50] + "..."
+	}
+	observability.Debugf("HybridCache.FindSimilarWithThreshold: searching for model='%s', query='%s', threshold=%.3f",
+		model, queryPreview, threshold)
+
+	// Generate query embedding
+	queryEmbedding, err := candle_binding.GetEmbedding(query, 0)
+	if err != nil {
+		metrics.RecordCacheOperation("hybrid", "find_similar_threshold", "error", time.Since(start).Seconds())
+		return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
+	}
+
+	// Search HNSW index for candidates above similarity threshold
+	// For semantic cache, we only need the first match, so search with k=1
+	// and stop early when finding a match above threshold
+	h.mu.RLock()
+	candidates := h.searchKNNHybridWithThreshold(queryEmbedding, 1, 20, threshold)
+	h.mu.RUnlock()
+
+	// Filter by similarity threshold before fetching from Milvus
+	var qualifiedCandidates []searchResult
+	for _, candidate := range candidates {
+		if candidate.similarity >= threshold {
+			qualifiedCandidates = append(qualifiedCandidates, candidate)
+		}
+	}
+
+	// Map qualified candidates to Milvus IDs (need lock for idMap access)
+	type candidateWithID struct {
+		milvusID   string
+		similarity float32
+		index      int
+	}
+
+	h.mu.RLock()
+	candidatesWithIDs := make([]candidateWithID, 0, len(qualifiedCandidates))
+	for _, candidate := range qualifiedCandidates {
+		if milvusID, ok := h.idMap[candidate.index]; ok {
+			candidatesWithIDs = append(candidatesWithIDs, candidateWithID{
+				milvusID:   milvusID,
+				similarity: candidate.similarity,
+				index:      candidate.index,
+			})
+		}
+	}
+	h.mu.RUnlock()
+
+	if len(candidatesWithIDs) == 0 {
+		atomic.AddInt64(&h.missCount, 1)
+		if len(candidates) > 0 {
+			observability.Debugf("HybridCache.FindSimilarWithThreshold: %d candidates found but none above threshold %.3f",
+				len(candidates), threshold)
+		} else {
+			observability.Debugf("HybridCache.FindSimilarWithThreshold: no candidates found in HNSW")
+		}
+		metrics.RecordCacheOperation("hybrid", "find_similar_threshold", "miss", time.Since(start).Seconds())
+		metrics.RecordCacheMiss()
+		return nil, false, nil
+	}
+
+	observability.Debugf("HybridCache.FindSimilarWithThreshold: HNSW returned %d candidates, %d above threshold",
+		len(candidates), len(candidatesWithIDs))
+
+	// Fetch document from Milvus for qualified candidates
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer cancel()
+
+	// Try candidates in order (already sorted by similarity from HNSW)
+	for _, candidate := range candidatesWithIDs {
+		// Fetch document from Milvus by ID (direct lookup by primary key)
+		fetchCtx, fetchCancel := context.WithTimeout(ctx, 2*time.Second)
+		responseBody, err := h.milvusCache.GetByID(fetchCtx, candidate.milvusID)
+		fetchCancel()
+
+		if err != nil {
+			observability.Debugf("HybridCache.FindSimilarWithThreshold: Milvus GetByID failed for %s: %v",
+				candidate.milvusID, err)
+			continue
+		}
+
+		if responseBody != nil {
+			atomic.AddInt64(&h.hitCount, 1)
+			observability.Debugf("HybridCache.FindSimilarWithThreshold: MILVUS HIT - similarity=%.4f (threshold=%.3f)",
+				candidate.similarity, threshold)
+			observability.LogEvent("hybrid_cache_hit", map[string]interface{}{
+				"backend":    "hybrid",
+				"source":     "milvus",
+				"similarity": candidate.similarity,
+				"threshold":  threshold,
+				"model":      model,
+				"latency_ms": time.Since(start).Milliseconds(),
+			})
+			metrics.RecordCacheOperation("hybrid", "find_similar_threshold", "hit_milvus", time.Since(start).Seconds())
+			metrics.RecordCacheHit()
+			return responseBody, true, nil
+		}
+	}
+
+	// No match found above threshold
+	atomic.AddInt64(&h.missCount, 1)
+	observability.Debugf("HybridCache.FindSimilarWithThreshold: CACHE MISS - no match above threshold")
+	observability.LogEvent("hybrid_cache_miss", map[string]interface{}{
+		"backend":    "hybrid",
+		"threshold":  threshold,
+		"model":      model,
+		"candidates": len(candidatesWithIDs),
+	})
+	metrics.RecordCacheOperation("hybrid", "find_similar_threshold", "miss", time.Since(start).Seconds())
+	metrics.RecordCacheMiss()
+
+	return nil, false, nil
+}
+
 // Close releases all resources
 func (h *HybridCache) Close() error {
 	if !h.enabled {

From b929ec5e9c4ccc67c0251554534e7c60d23d934b Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Wed, 22 Oct 2025 19:35:48 +0000
Subject: [PATCH 13/13] review feedback

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/pkg/cache/comprehensive_benchmark_test.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
index 9d6d0adb..891074b3 100644
--- a/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
+++ b/src/semantic-router/pkg/cache/comprehensive_benchmark_test.go
@@ -33,8 +33,8 @@ func (c ContentLength) String() string {
 // GenerateQuery generates a query with maximum semantic diversity using hash-based randomization
 func generateQuery(length ContentLength, index int) string {
 	// Hash the index to get pseudo-random values (deterministic but well-distributed)
-	hash := uint64(index)    // #nosec G115 -- index is always positive and bounded
-	hash = hash * 2654435761 // Knuth's multiplicative hash
+	hash := uint64(index) // #nosec G115 -- index is always positive and bounded
+	hash *= 2654435761    // Knuth's multiplicative hash
 
 	// Expanded templates for maximum diversity
 	templates := []string{