
Commit b74bca4

Merge branch 'main' into fix_docker_compose
2 parents: a5fd028 + 464ed6c

13 files changed: +237 -326 lines


config/config.yaml

Lines changed: 0 additions & 13 deletions
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf

 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,9 +48,6 @@ vllm_endpoints:

 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
       currency: USD
       prompt_per_1m: 0.07
@@ -65,9 +58,6 @@ model_config:
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.067
@@ -77,9 +67,6 @@ model_config:
     pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.1

deploy/kubernetes/config.yaml

Lines changed: 0 additions & 13 deletions
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf

 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,26 +48,17 @@ vllm_endpoints:

 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types

src/semantic-router/go.mod

Lines changed: 1 addition & 0 deletions
@@ -24,6 +24,7 @@ require (
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cncf/xds/go v0.0.0-20241223141626-cff3c89139a3 // indirect
 	github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
+	github.com/fsnotify/fsnotify v1.7.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-task/slim-sprig/v3 v3.0.0 // indirect
 	github.com/google/go-cmp v0.7.0 // indirect

src/semantic-router/go.sum

Lines changed: 2 additions & 0 deletions
@@ -10,6 +10,8 @@ github.com/envoyproxy/go-control-plane/envoy v1.32.4 h1:jb83lalDRZSpPWW2Z7Mck/8k
 github.com/envoyproxy/go-control-plane/envoy v1.32.4/go.mod h1:Gzjc5k8JcJswLjAx1Zm+wSYE20UrLtt7JZMWiWQXQEw=
 github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
 github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
+github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
+github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
 github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
 github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
 github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=

src/semantic-router/pkg/config/config.go

Lines changed: 38 additions & 62 deletions
@@ -3,6 +3,7 @@ package config
 import (
 	"fmt"
 	"os"
+	"path/filepath"
 	"sync"

 	"gopkg.in/yaml.v3"
@@ -53,9 +54,6 @@ type RouterConfig struct {
 	// Model parameters configuration
 	ModelConfig map[string]ModelParams `yaml:"model_config"`

-	// GPU configuration for TTFT calculation
-	GPUConfig GPUConfig `yaml:"gpu_config"`
-
 	// Tools configuration for automatic tool selection
 	Tools ToolsConfig `yaml:"tools"`

@@ -191,7 +189,7 @@ type VLLMEndpoint struct {
 	HealthCheckPath string `yaml:"health_check_path,omitempty"`
 }

-// ModelParams represents configuration for model-specific parameters
+// ModelPricing represents configuration for model-specific parameters
 type ModelPricing struct {
 	// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
 	Currency string `yaml:"currency,omitempty"`
@@ -202,15 +200,6 @@ type ModelPricing struct {
 }

 type ModelParams struct {
-	// Number of parameters in the model
-	ParamCount float64 `yaml:"param_count"`
-
-	// Default batch size for this model
-	BatchSize float64 `yaml:"batch_size"`
-
-	// Default context size for this model
-	ContextSize float64 `yaml:"context_size"`
-
 	// PII policy configuration for this model
 	PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"`

@@ -252,18 +241,6 @@ const (
 	PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes
 )

-// GPUConfig represents configuration for GPU parameters used in TTFT calculation
-type GPUConfig struct {
-	// FLOPs performance in operations per second
-	FLOPS float64 `yaml:"flops"`
-
-	// HBM memory bandwidth in bytes per second
-	HBM float64 `yaml:"hbm"`
-
-	// Description of the GPU configuration (e.g., "A100-80G")
-	Description string `yaml:"description"`
-}
-
 // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache
 func (c *RouterConfig) GetCacheSimilarityThreshold() float32 {
 	if c.SemanticCache.SimilarityThreshold != nil {
@@ -291,30 +268,56 @@ var (
 	config *RouterConfig
 	configOnce sync.Once
 	configErr error
+	configMu sync.RWMutex
 )

-// LoadConfig loads the configuration from the specified YAML file
+// LoadConfig loads the configuration from the specified YAML file once and caches it globally.
 func LoadConfig(configPath string) (*RouterConfig, error) {
 	configOnce.Do(func() {
-		data, err := os.ReadFile(configPath)
+		cfg, err := ParseConfigFile(configPath)
 		if err != nil {
-			configErr = fmt.Errorf("failed to read config file: %w", err)
-			return
-		}
-
-		config = &RouterConfig{}
-		if err := yaml.Unmarshal(data, config); err != nil {
-			configErr = fmt.Errorf("failed to parse config file: %w", err)
+			configErr = err
 			return
 		}
+		configMu.Lock()
+		config = cfg
+		configMu.Unlock()
 	})
-
 	if configErr != nil {
 		return nil, configErr
 	}
+	configMu.RLock()
+	defer configMu.RUnlock()
 	return config, nil
 }

+// ParseConfigFile parses the YAML config file without touching the global cache.
+func ParseConfigFile(configPath string) (*RouterConfig, error) {
+	// Resolve symlinks to handle Kubernetes ConfigMap mounts
+	resolved, _ := filepath.EvalSymlinks(configPath)
+	if resolved == "" {
+		resolved = configPath
+	}
+	data, err := os.ReadFile(resolved)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read config file: %w", err)
+	}
+	cfg := &RouterConfig{}
+	if err := yaml.Unmarshal(data, cfg); err != nil {
+		return nil, fmt.Errorf("failed to parse config file: %w", err)
+	}
+	return cfg, nil
+}
+
+// ReplaceGlobalConfig replaces the globally cached config. It is safe for concurrent readers.
+func ReplaceGlobalConfig(newCfg *RouterConfig) {
+	configMu.Lock()
+	defer configMu.Unlock()
+	config = newCfg
+	// Do not reset configOnce to avoid racing re-parses via LoadConfig; callers should use ParseConfigFile for fresher reads.
+	configErr = nil
+}
+
 // GetConfig returns the current configuration
 func GetConfig() *RouterConfig {
 	return config
@@ -349,33 +352,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
 	return c.DefaultModel
 }

-// GetModelParamCount returns the parameter count for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ParamCount
-	}
-	return defaultValue
-}
-
-// GetModelBatchSize returns the batch size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.BatchSize
-	}
-	return defaultValue
-}
-
-// GetModelContextSize returns the context size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ContextSize
-	}
-	return defaultValue
-}
-
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
 func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
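
Note: the new ParseConfigFile and ReplaceGlobalConfig functions, together with the fsnotify dependency added in go.mod, give callers a way to refresh the router configuration at runtime (for example when a Kubernetes ConfigMap mount is updated). The sketch below is illustrative only and is not part of this commit; the config import path is a placeholder assumption, and the watcher wiring shows one plausible use of the new API.

// Illustrative sketch (not from this commit): hot-reloading the config with
// fsnotify, ParseConfigFile, and ReplaceGlobalConfig.
package configwatch

import (
	"log"
	"path/filepath"

	"github.com/fsnotify/fsnotify"

	config "example.invalid/semantic-router/pkg/config" // hypothetical import path
)

// watchConfig re-parses and swaps in the config whenever the file changes.
func watchConfig(path string) error {
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		return err
	}
	defer watcher.Close()

	// Watch the parent directory: ConfigMap updates replace a symlink rather
	// than writing the file in place, so a watch on the file itself can be lost.
	if err := watcher.Add(filepath.Dir(path)); err != nil {
		return err
	}

	for {
		select {
		case ev, ok := <-watcher.Events:
			if !ok {
				return nil
			}
			if ev.Op&(fsnotify.Write|fsnotify.Create) == 0 {
				continue
			}
			cfg, err := config.ParseConfigFile(path) // fresh parse; bypasses the sync.Once cache
			if err != nil {
				log.Printf("config reload skipped: %v", err)
				continue
			}
			config.ReplaceGlobalConfig(cfg) // swap the cached config under configMu
		case werr, ok := <-watcher.Errors:
			if !ok {
				return nil
			}
			log.Printf("config watcher error: %v", werr)
		}
	}
}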
