vllm-project
diff --git a/candle-binding/Cargo.lock b/candle-binding/Cargo.lock
Lines changed: 108 additions & 131 deletions
diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
Lines changed: 3 additions & 3 deletions
diff --git a/candle-binding/src/model_architectures/embedding/qwen3_embedding.rs b/candle-binding/src/model_architectures/embedding/qwen3_embedding.rs
Lines changed: 5 additions & 9 deletions
diff --git a/config/config.yaml b/config/config.yaml
Lines changed: 5 additions & 0 deletions
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/deploy/openshift/config-openshift.yaml b/deploy/openshift/config-openshift.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/src/semantic-router/pkg/apis/vllm.ai/v1alpha1/filter_types.go b/src/semantic-router/pkg/apis/vllm.ai/v1alpha1/filter_types.go
Lines changed: 6 additions & 0 deletions
diff --git a/src/semantic-router/pkg/cache/cache_factory.go b/src/semantic-router/pkg/cache/cache_factory.go
Lines changed: 4 additions & 2 deletions
diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go
Lines changed: 4 additions & 0 deletions
diff --git a/src/semantic-router/pkg/cache/cache_test.go b/src/semantic-router/pkg/cache/cache_test.go
Lines changed: 23 additions & 0 deletions
@@ -19,9 +19,9 @@ flash-attn = ["candle-flash-attn"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = "0.8.4"
-candle-nn = "0.8.4"
-candle-transformers = "0.8.4"
+candle-core = { version = "0.8.4", features = ["cuda"] }
+candle-nn = { version = "0.8.4", features = ["cuda"] }
+candle-transformers = { version = "0.8.4", features = ["cuda"] }
 # Flash Attention 2 (optional, requires CUDA)
 # Reference: https://github.com/huggingface/candle/tree/main/candle-flash-attn
 candle-flash-attn = { version = "0.8.4", optional = true }
 
@@ -1326,7 +1326,7 @@ impl Qwen3Attention {
         q: &Tensor,
         k: &Tensor,
         v: &Tensor,
-        attention_mask: Option<&Tensor>,
+        _attention_mask: Option<&Tensor>,
     ) -> UnifiedResult<Tensor> {
         // Flash Attention 2 implementation using candle-flash-attn
         //
@@ -1363,8 +1363,8 @@ impl Qwen3Attention {
             &q_flash,
             &k_flash,
             &v_flash,
-            self.scale as f32, // softmax scaling factor
-            false,             // causal: false (Qwen3-Embedding is non-causal)
+            self.scaling as f32, // softmax scaling factor
+            false,               // causal: false (Qwen3-Embedding is non-causal)
         )
         .map_err(|e| UnifiedError::Processing {
             operation: "Flash Attention 2: flash_attn".to_string(),
@@ -1975,15 +1975,11 @@ impl Qwen3EmbeddingModel {
         #[cfg(not(feature = "flash-attn"))]
         {
             if config.max_position_embeddings > 8192 {
-                eprintln!("⚠️  WARNING: Flash Attention 2 not enabled!");
+                eprintln!("ℹ️  Note: Using standard attention");
                 eprintln!(
-                    "   For {}K sequence length, performance may degrade:",
+                    "   Sequence length: {}K tokens",
                     config.max_position_embeddings / 1024
                 );
-                eprintln!("   - Memory usage: +40% (estimated)");
-                eprintln!("   - Inference speed: -50% (estimated)");
-                eprintln!("   Official recommendation: Compile with --features flash-attn");
-                eprintln!("   Reference: https://github.com/qwenlm/qwen3-embedding#usage");
             }
         }
 
 
@@ -19,6 +19,11 @@ semantic_cache:
   # Combines in-memory HNSW for fast search with Milvus for scalable storage
   # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
   # backend_config_path: "config/milvus.yaml" # Path to Milvus config
+  
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  # Default: "bert" (fastest, lowest memory)
+  embedding_model: "bert"
 
 tools:
   enabled: true
 
@@ -9,7 +9,10 @@ semantic_cache:
   similarity_threshold: 0.8
   max_entries: 1000  # Only applies to memory backend
   ttl_seconds: 3600
-  eviction_policy: "fifo"  
+  eviction_policy: "fifo"
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  embedding_model: "bert"  # Default: BERT (fastest, lowest memory for Kubernetes)
 
 tools:
   enabled: true
 
@@ -10,6 +10,9 @@ semantic_cache:
   max_entries: 1000  # Only applies to memory backend
   ttl_seconds: 3600
   eviction_policy: "fifo"
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  embedding_model: "bert"  # Default: BERT (fastest, lowest memory for OpenShift)
 
 tools:
   enabled: true
 
@@ -117,6 +117,12 @@ type SemanticCacheConfig struct {
 	// +kubebuilder:default=memory
 	Backend *string `json:"backend,omitempty"`
 
+	// EmbeddingModel defines which embedding model to use for semantic similarity
+	// +optional
+	// +kubebuilder:validation:Enum=bert;qwen3;gemma
+	// +kubebuilder:default=bert
+	EmbeddingModel *string `json:"embeddingModel,omitempty"`
+
 	// BackendConfig defines backend-specific configuration
 	// +optional
 	BackendConfig map[string]string `json:"backendConfig,omitempty"`
 
@@ -24,8 +24,9 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
 	switch config.BackendType {
 	case InMemoryCacheType, "":
 		// Use in-memory cache as the default backend
-		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t",
-			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW)
+		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, UseHNSW: %t, EmbeddingModel: %s",
+			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.UseHNSW, config.EmbeddingModel)
+
 		options := InMemoryCacheOptions{
 			Enabled:             config.Enabled,
 			SimilarityThreshold: config.SimilarityThreshold,
@@ -35,6 +36,7 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
 			UseHNSW:             config.UseHNSW,
 			HNSWM:               config.HNSWM,
 			HNSWEfConstruction:  config.HNSWEfConstruction,
+			EmbeddingModel:      config.EmbeddingModel,
 		}
 		return NewInMemoryCache(options), nil
 
 
@@ -116,4 +116,8 @@ type CacheConfig struct {
 
 	// Hybrid cache specific settings
 	MaxMemoryEntries int `yaml:"max_memory_entries,omitempty"` // Max entries in HNSW for hybrid cache
+
+	// EmbeddingModel specifies which embedding model to use
+	// Options: "bert" (default), "qwen3", "gemma"
+	EmbeddingModel string `yaml:"embedding_model,omitempty"`
 }
@@ -51,6 +51,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          1000,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -66,6 +67,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          1000,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -81,6 +83,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          500,
 						TTLSeconds:          1800,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -142,6 +145,7 @@ development:
 						SimilarityThreshold: 0.85,
 						TTLSeconds:          7200,
 						BackendConfigPath:   milvusConfigPath,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -169,6 +173,7 @@ development:
 						SimilarityThreshold: 0.8,
 						TTLSeconds:          3600,
 						BackendConfigPath:   milvusConfigPath,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -223,6 +228,7 @@ connection:
 						Enabled:             true,
 						SimilarityThreshold: 0.8,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -240,6 +246,7 @@ connection:
 						SimilarityThreshold: -0.8, // invalid
 						MaxEntries:          10,
 						TTLSeconds:          -1, // invalid
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -259,6 +266,7 @@ connection:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					EvictionPolicy:      "lru",
 				}
 
@@ -285,6 +293,7 @@ connection:
 					SimilarityThreshold: 1.5, // Invalid: > 1.0
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -299,6 +308,7 @@ connection:
 					SimilarityThreshold: -0.1, // Invalid: < 0.0
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -313,6 +323,7 @@ connection:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          -1, // Invalid: negative TTL
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -327,6 +338,7 @@ connection:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          -1, // Invalid: negative max entries
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -341,6 +353,7 @@ connection:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					EvictionPolicy:      "random", // unsupported
 				}
 
@@ -355,6 +368,7 @@ connection:
 					Enabled:             true,
 					SimilarityThreshold: 0.8,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					// BackendConfigPath is missing
 				}
 
@@ -369,6 +383,7 @@ connection:
 					Enabled:             true,
 					SimilarityThreshold: 0.8,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					BackendConfigPath:   "/nonexistent/milvus.yaml",
 				}
 
@@ -397,6 +412,7 @@ connection:
 					SimilarityThreshold: 1.0, // Valid: maximum threshold
 					MaxEntries:          10000,
 					TTLSeconds:          86400,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -455,6 +471,7 @@ connection:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			inMemoryCache = cache.NewInMemoryCache(options)
 		})
@@ -481,6 +498,7 @@ connection:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			disabledCache := cache.NewInMemoryCache(disabledOptions)
 			defer disabledCache.Close()
@@ -548,6 +566,7 @@ connection:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          1,
+				EmbeddingModel:      "bert",
 			})
 
 			err := inMemoryCache.AddPendingRequest("expired-request-id", "test-model", "stale query", []byte("request"))
@@ -571,6 +590,7 @@ connection:
 				SimilarityThreshold: 0.99, // Very high threshold
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			highThresholdCache := cache.NewInMemoryCache(highThresholdOptions)
 			defer highThresholdCache.Close()
@@ -621,6 +641,7 @@ connection:
 				SimilarityThreshold: 0.1,
 				MaxEntries:          10,
 				TTLSeconds:          1,
+				EmbeddingModel:      "bert",
 			})
 			defer ttlCache.Close()
 
@@ -660,6 +681,7 @@ connection:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			disabledCache := cache.NewInMemoryCache(disabledOptions)
 			defer disabledCache.Close()
@@ -703,6 +725,7 @@ connection:
 				SimilarityThreshold: 0.9,
 				MaxEntries:          2000,
 				TTLSeconds:          7200,
+				EmbeddingModel:      "bert",
 				BackendConfigPath:   "config/cache/milvus.yaml",
 			}
Original file line number	Diff line number	Diff line change
`@@ -116,4 +116,8 @@ type CacheConfig struct {`
`116`	`116`
`117`	`117`	`// Hybrid cache specific settings`
`118`	`118`	``MaxMemoryEntries int `yaml:"max_memory_entries,omitempty"` // Max entries in HNSW for hybrid cache``
	`119`	`+`
	`120`	`+ // EmbeddingModel specifies which embedding model to use`
	`121`	`+ // Options: "bert" (default), "qwen3", "gemma"`
	`122`	`` + EmbeddingModel string `yaml:"embedding_model,omitempty"` ``
`119`	`123`	`}`