rootfs
diff --git a/candle-binding/Cargo.lock b/candle-binding/Cargo.lock
Lines changed: 108 additions & 131 deletions
diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
Lines changed: 3 additions & 3 deletions
diff --git a/candle-binding/src/model_architectures/embedding/qwen3_embedding.rs b/candle-binding/src/model_architectures/embedding/qwen3_embedding.rs
Lines changed: 5 additions & 9 deletions
diff --git a/config/config.yaml b/config/config.yaml
Lines changed: 4 additions & 0 deletions
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
Lines changed: 4 additions & 1 deletion
diff --git a/deploy/openshift/config-openshift.yaml b/deploy/openshift/config-openshift.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/src/semantic-router/pkg/apis/vllm.ai/v1alpha1/filter_types.go b/src/semantic-router/pkg/apis/vllm.ai/v1alpha1/filter_types.go
Lines changed: 6 additions & 0 deletions
diff --git a/src/semantic-router/pkg/cache/cache_factory.go b/src/semantic-router/pkg/cache/cache_factory.go
Lines changed: 4 additions & 2 deletions
diff --git a/src/semantic-router/pkg/cache/cache_interface.go b/src/semantic-router/pkg/cache/cache_interface.go
Lines changed: 4 additions & 0 deletions
diff --git a/src/semantic-router/pkg/cache/cache_test.go b/src/semantic-router/pkg/cache/cache_test.go
Lines changed: 23 additions & 0 deletions
@@ -18,9 +18,9 @@ flash-attn = ["candle-flash-attn"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = "0.8.4"
-candle-nn = "0.8.4"
-candle-transformers = "0.8.4"
+candle-core = { version = "0.8.4", features = ["cuda"] }
+candle-nn = { version = "0.8.4", features = ["cuda"] }
+candle-transformers = { version = "0.8.4", features = ["cuda"] }
 # Flash Attention 2 (optional, requires CUDA)
 # Reference: https://github.com/huggingface/candle/tree/main/candle-flash-attn
 candle-flash-attn = { version = "0.8.4", optional = true }
 
@@ -1326,7 +1326,7 @@ impl Qwen3Attention {
         q: &Tensor,
         k: &Tensor,
         v: &Tensor,
-        attention_mask: Option<&Tensor>,
+        _attention_mask: Option<&Tensor>,
     ) -> UnifiedResult<Tensor> {
         // Flash Attention 2 implementation using candle-flash-attn
         //
@@ -1363,8 +1363,8 @@ impl Qwen3Attention {
             &q_flash,
             &k_flash,
             &v_flash,
-            self.scale as f32, // softmax scaling factor
-            false,             // causal: false (Qwen3-Embedding is non-causal)
+            self.scaling as f32, // softmax scaling factor
+            false,               // causal: false (Qwen3-Embedding is non-causal)
         )
         .map_err(|e| UnifiedError::Processing {
             operation: "Flash Attention 2: flash_attn".to_string(),
@@ -1975,15 +1975,11 @@ impl Qwen3EmbeddingModel {
         #[cfg(not(feature = "flash-attn"))]
         {
             if config.max_position_embeddings > 8192 {
-                eprintln!("⚠️  WARNING: Flash Attention 2 not enabled!");
+                eprintln!("ℹ️  Note: Using standard attention");
                 eprintln!(
-                    "   For {}K sequence length, performance may degrade:",
+                    "   Sequence length: {}K tokens",
                     config.max_position_embeddings / 1024
                 );
-                eprintln!("   - Memory usage: +40% (estimated)");
-                eprintln!("   - Inference speed: -50% (estimated)");
-                eprintln!("   Official recommendation: Compile with --features flash-attn");
-                eprintln!("   Reference: https://github.com/qwenlm/qwen3-embedding#usage");
             }
         }
 
 
@@ -10,6 +10,10 @@ semantic_cache:
   max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
   eviction_policy: "fifo"
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  # Default: "bert" (fastest, lowest memory)
+  embedding_model: "bert"
 
 tools:
   enabled: true
 
@@ -9,7 +9,10 @@ semantic_cache:
   similarity_threshold: 0.8
   max_entries: 1000  # Only applies to memory backend
   ttl_seconds: 3600
-  eviction_policy: "fifo"  
+  eviction_policy: "fifo"
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  embedding_model: "bert"  # Default: BERT (fastest, lowest memory for Kubernetes)
 
 tools:
   enabled: true
 
@@ -10,6 +10,9 @@ semantic_cache:
   max_entries: 1000  # Only applies to memory backend
   ttl_seconds: 3600
   eviction_policy: "fifo"
+  # Embedding model for semantic similarity matching
+  # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
+  embedding_model: "bert"  # Default: BERT (fastest, lowest memory for OpenShift)
 
 tools:
   enabled: true
 
@@ -117,6 +117,12 @@ type SemanticCacheConfig struct {
 	// +kubebuilder:default=memory
 	Backend *string `json:"backend,omitempty"`
 
+	// EmbeddingModel defines which embedding model to use for semantic similarity
+	// +optional
+	// +kubebuilder:validation:Enum=bert;qwen3;gemma
+	// +kubebuilder:default=bert
+	EmbeddingModel *string `json:"embeddingModel,omitempty"`
+
 	// BackendConfig defines backend-specific configuration
 	// +optional
 	BackendConfig map[string]string `json:"backendConfig,omitempty"`
 
@@ -24,14 +24,16 @@ func NewCacheBackend(config CacheConfig) (CacheBackend, error) {
 	switch config.BackendType {
 	case InMemoryCacheType, "":
 		// Use in-memory cache as the default backend
-		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f",
-			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold)
+		observability.Debugf("Creating in-memory cache backend - MaxEntries: %d, TTL: %ds, Threshold: %.3f, EmbeddingModel: %s",
+			config.MaxEntries, config.TTLSeconds, config.SimilarityThreshold, config.EmbeddingModel)
+
 		options := InMemoryCacheOptions{
 			Enabled:             config.Enabled,
 			SimilarityThreshold: config.SimilarityThreshold,
 			MaxEntries:          config.MaxEntries,
 			TTLSeconds:          config.TTLSeconds,
 			EvictionPolicy:      config.EvictionPolicy,
+			EmbeddingModel:      config.EmbeddingModel,
 		}
 		return NewInMemoryCache(options), nil
 
 
@@ -96,4 +96,8 @@ type CacheConfig struct {
 
 	// BackendConfigPath points to backend-specific configuration files
 	BackendConfigPath string `yaml:"backend_config_path,omitempty"`
+
+	// EmbeddingModel specifies which embedding model to use
+	// Options: "bert" (default), "qwen3", "gemma"
+	EmbeddingModel string `yaml:"embedding_model,omitempty"`
 }
@@ -50,6 +50,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          1000,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -65,6 +66,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          1000,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -80,6 +82,7 @@ var _ = Describe("Cache Package", func() {
 						SimilarityThreshold: 0.8,
 						MaxEntries:          500,
 						TTLSeconds:          1800,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -141,6 +144,7 @@ development:
 						SimilarityThreshold: 0.85,
 						TTLSeconds:          7200,
 						BackendConfigPath:   milvusConfigPath,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -168,6 +172,7 @@ development:
 						SimilarityThreshold: 0.8,
 						TTLSeconds:          3600,
 						BackendConfigPath:   milvusConfigPath,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -184,6 +189,7 @@ development:
 						Enabled:             true,
 						SimilarityThreshold: 0.8,
 						TTLSeconds:          3600,
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -201,6 +207,7 @@ development:
 						SimilarityThreshold: -0.8, // invalid
 						MaxEntries:          10,
 						TTLSeconds:          -1, // invalid
+						EmbeddingModel:      "bert",
 					}
 
 					backend, err := cache.NewCacheBackend(config)
@@ -220,6 +227,7 @@ development:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					EvictionPolicy:      "lru",
 				}
 
@@ -246,6 +254,7 @@ development:
 					SimilarityThreshold: 1.5, // Invalid: > 1.0
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -260,6 +269,7 @@ development:
 					SimilarityThreshold: -0.1, // Invalid: < 0.0
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -274,6 +284,7 @@ development:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          -1, // Invalid: negative TTL
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -288,6 +299,7 @@ development:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          -1, // Invalid: negative max entries
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -302,6 +314,7 @@ development:
 					SimilarityThreshold: 0.8,
 					MaxEntries:          1000,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					EvictionPolicy:      "random", // unsupported
 				}
 
@@ -316,6 +329,7 @@ development:
 					Enabled:             true,
 					SimilarityThreshold: 0.8,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					// BackendConfigPath is missing
 				}
 
@@ -330,6 +344,7 @@ development:
 					Enabled:             true,
 					SimilarityThreshold: 0.8,
 					TTLSeconds:          3600,
+					EmbeddingModel:      "bert",
 					BackendConfigPath:   "/nonexistent/milvus.yaml",
 				}
 
@@ -358,6 +373,7 @@ development:
 					SimilarityThreshold: 1.0, // Valid: maximum threshold
 					MaxEntries:          10000,
 					TTLSeconds:          86400,
+					EmbeddingModel:      "bert",
 				}
 
 				err := cache.ValidateCacheConfig(config)
@@ -416,6 +432,7 @@ development:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			inMemoryCache = cache.NewInMemoryCache(options)
 		})
@@ -442,6 +459,7 @@ development:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			disabledCache := cache.NewInMemoryCache(disabledOptions)
 			defer disabledCache.Close()
@@ -509,6 +527,7 @@ development:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          1,
+				EmbeddingModel:      "bert",
 			})
 
 			err := inMemoryCache.AddPendingRequest("expired-request-id", "test-model", "stale query", []byte("request"))
@@ -532,6 +551,7 @@ development:
 				SimilarityThreshold: 0.99, // Very high threshold
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			highThresholdCache := cache.NewInMemoryCache(highThresholdOptions)
 			defer highThresholdCache.Close()
@@ -582,6 +602,7 @@ development:
 				SimilarityThreshold: 0.1,
 				MaxEntries:          10,
 				TTLSeconds:          1,
+				EmbeddingModel:      "bert",
 			})
 			defer ttlCache.Close()
 
@@ -621,6 +642,7 @@ development:
 				SimilarityThreshold: 0.8,
 				MaxEntries:          100,
 				TTLSeconds:          300,
+				EmbeddingModel:      "bert",
 			}
 			disabledCache := cache.NewInMemoryCache(disabledOptions)
 			defer disabledCache.Close()
@@ -664,6 +686,7 @@ development:
 				SimilarityThreshold: 0.9,
 				MaxEntries:          2000,
 				TTLSeconds:          7200,
+				EmbeddingModel:      "bert",
 				BackendConfigPath:   "config/cache/milvus.yaml",
 			}
@@ -96,4 +96,8 @@ type CacheConfig struct {
 
 	// BackendConfigPath points to backend-specific configuration files
 	BackendConfigPath string `yaml:"backend_config_path,omitempty"`
+
+	// EmbeddingModel specifies which embedding model to use
+	// Options: "bert" (default), "qwen3", "gemma"
+	EmbeddingModel string `yaml:"embedding_model,omitempty"`
 }