
Commit 464ed6c

chore: remove GPU and model params in config. Backend and model aware optimization will be handled in the control plane (#93)
* chore: remove GPU and model params in config. Backend and model aware optimization will be handled in the control plane

  Signed-off-by: Huamin Chen <[email protected]>

* fix pre-commit issues

  Signed-off-by: Huamin Chen <[email protected]>

---------

Signed-off-by: Huamin Chen <[email protected]>
1 parent 9bca691 commit 464ed6c

File tree

9 files changed (+7, −303 lines)


config/config.yaml

Lines changed: 0 additions & 13 deletions
```diff
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,9 +48,6 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
       currency: USD
       prompt_per_1m: 0.07
@@ -65,9 +58,6 @@ model_config:
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.067
@@ -77,9 +67,6 @@ model_config:
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.1
```
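For orientation, here is a minimal sketch of reading the per-model fields that survive this trim. `LoadConfig` and `GetModelPricing` appear in the config.go and config_test.go diffs below; the import path is inferred from router.go's imports, and the program itself is illustrative:

```go
package main

import (
	"fmt"

	"github.com/vllm-project/semantic-router/semantic-router/pkg/config"
)

func main() {
	// param_count, batch_size, context_size, and gpu_config are gone; a model
	// entry now carries only pricing, PII policy, and preferred endpoints.
	cfg, err := config.LoadConfig("config/config.yaml")
	if err != nil {
		panic(err)
	}
	if prompt, completion, currency, ok := cfg.GetModelPricing("phi4"); ok {
		fmt.Printf("phi4 pricing: %v / %v %s per 1M tokens\n", prompt, completion, currency)
	}
	fmt.Println("phi4 allows PII by default:", cfg.ModelConfig["phi4"].PIIPolicy.AllowByDefault)
}
```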

deploy/kubernetes/config.yaml

Lines changed: 0 additions & 13 deletions
```diff
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,26 +48,17 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
```

src/semantic-router/pkg/config/config.go

Lines changed: 1 addition & 52 deletions
```diff
@@ -54,9 +54,6 @@ type RouterConfig struct {
 	// Model parameters configuration
 	ModelConfig map[string]ModelParams `yaml:"model_config"`
 
-	// GPU configuration for TTFT calculation
-	GPUConfig GPUConfig `yaml:"gpu_config"`
-
 	// Tools configuration for automatic tool selection
 	Tools ToolsConfig `yaml:"tools"`
 
@@ -192,7 +189,7 @@ type VLLMEndpoint struct {
 	HealthCheckPath string `yaml:"health_check_path,omitempty"`
 }
 
-// ModelParams represents configuration for model-specific parameters
+// ModelPricing represents configuration for model-specific parameters
 type ModelPricing struct {
 	// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
 	Currency string `yaml:"currency,omitempty"`
@@ -203,15 +200,6 @@ type ModelPricing struct {
 }
 
 type ModelParams struct {
-	// Number of parameters in the model
-	ParamCount float64 `yaml:"param_count"`
-
-	// Default batch size for this model
-	BatchSize float64 `yaml:"batch_size"`
-
-	// Default context size for this model
-	ContextSize float64 `yaml:"context_size"`
-
 	// PII policy configuration for this model
 	PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"`
 
@@ -253,18 +241,6 @@ const (
 	PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes
 )
 
-// GPUConfig represents configuration for GPU parameters used in TTFT calculation
-type GPUConfig struct {
-	// FLOPs performance in operations per second
-	FLOPS float64 `yaml:"flops"`
-
-	// HBM memory bandwidth in bytes per second
-	HBM float64 `yaml:"hbm"`
-
-	// Description of the GPU configuration (e.g., "A100-80G")
-	Description string `yaml:"description"`
-}
-
 // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache
 func (c *RouterConfig) GetCacheSimilarityThreshold() float32 {
 	if c.SemanticCache.SimilarityThreshold != nil {
@@ -376,33 +352,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
 	return c.DefaultModel
 }
 
-// GetModelParamCount returns the parameter count for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ParamCount
-	}
-	return defaultValue
-}
-
-// GetModelBatchSize returns the batch size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.BatchSize
-	}
-	return defaultValue
-}
-
-// GetModelContextSize returns the context size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ContextSize
-	}
-	return defaultValue
-}
-
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
 func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
```
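Per the commit message, backend- and model-aware optimization moves to the control plane. Purely as an illustration of where the deleted knobs could migrate (nothing in this commit defines such types), a control-plane-side profile might look like:

```go
// Hypothetical control-plane types mirroring the fields deleted above;
// the names follow the removed YAML keys, not any API in this repository.
type ModelProfile struct {
	ParamCount  float64 // e.g. 14e9 for phi4
	BatchSize   float64 // scheduler batch size (the removed default was 512)
	ContextSize float64 // maximum context window in tokens
}

type GPUProfile struct {
	FLOPS       float64 // peak fp16 throughput in ops/s, e.g. 312e12 for an A100-80G
	HBM         float64 // memory bandwidth in bytes/s, e.g. 2e12 (2 TB/s)
	Description string  // human-readable label, e.g. "A100-80G"
}
```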

src/semantic-router/pkg/config/config_test.go

Lines changed: 5 additions & 91 deletions
```diff
@@ -105,26 +105,15 @@ vllm_endpoints:
 
 model_config:
   "model-a":
-    param_count: 1000000000
-    batch_size: 32
-    context_size: 8192
     pii_policy:
       allow_by_default: false
       pii_types_allowed: ["NO_PII", "ORGANIZATION"]
     preferred_endpoints: ["endpoint1"]
   "model-b":
-    param_count: 175000000
-    batch_size: 64
-    context_size: 4096
     pii_policy:
       allow_by_default: true
     preferred_endpoints: ["endpoint1", "endpoint2"]
 
-gpu_config:
-  flops: 312000000000000
-  hbm: 2000000000000
-  description: "A100-80G"
-
 tools:
   enabled: true
   top_k: 5
@@ -172,14 +161,9 @@
 
 	// Verify model config
 	Expect(cfg.ModelConfig).To(HaveKey("model-a"))
-	Expect(cfg.ModelConfig["model-a"].ParamCount).To(Equal(float64(1000000000)))
 	Expect(cfg.ModelConfig["model-a"].PIIPolicy.AllowByDefault).To(BeFalse())
 	Expect(cfg.ModelConfig["model-a"].PIIPolicy.PIITypes).To(ContainElements("NO_PII", "ORGANIZATION"))
 
-	// Verify GPU config
-	Expect(cfg.GPUConfig.FLOPS).To(Equal(float64(312000000000000)))
-	Expect(cfg.GPUConfig.Description).To(Equal("A100-80G"))
-
 	// Verify tools config
 	Expect(cfg.Tools.Enabled).To(BeTrue())
 	Expect(cfg.Tools.TopK).To(Equal(5))
@@ -430,7 +414,8 @@ model_config:
     pii_policy:
       allow_by_default: true
   "unconfigured-model":
-    param_count: 1000000
+    pii_policy:
+      allow_by_default: true
 `
 	err := os.WriteFile(configFile, []byte(configContent), 0o644)
 	Expect(err).NotTo(HaveOccurred())
@@ -646,74 +631,6 @@ prompt_guard:
 		})
 	})
 
-	Describe("Model Parameter Functions", func() {
-		BeforeEach(func() {
-			configContent := `
-model_config:
-  "configured-model":
-    param_count: 175000000
-    batch_size: 32
-    context_size: 4096
-`
-			err := os.WriteFile(configFile, []byte(configContent), 0o644)
-			Expect(err).NotTo(HaveOccurred())
-		})
-
-		Describe("GetModelParamCount", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("configured-model", 1000000)
-				Expect(count).To(Equal(float64(175000000)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("unknown-model", 999999)
-				Expect(count).To(Equal(float64(999999)))
-			})
-		})
-
-		Describe("GetModelBatchSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("configured-model", 16)
-				Expect(batchSize).To(Equal(float64(32)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("unknown-model", 64)
-				Expect(batchSize).To(Equal(float64(64)))
-			})
-		})
-
-		Describe("GetModelContextSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("configured-model", 2048)
-				Expect(contextSize).To(Equal(float64(4096)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("unknown-model", 8192)
-				Expect(contextSize).To(Equal(float64(8192)))
-			})
-		})
-	})
-
 	Describe("GetCategoryDescriptions", func() {
 		Context("with categories having descriptions", func() {
 			BeforeEach(func() {
@@ -805,18 +722,15 @@ semantic_cache:
 		configContent := `
 model_config:
   "large-model":
-    param_count: 1.7976931348623157e+308
-gpu_config:
-  flops: 1e20
-  hbm: 1e15
+    pii_policy:
+      allow_by_default: true
 `
 		err := os.WriteFile(configFile, []byte(configContent), 0o644)
 		Expect(err).NotTo(HaveOccurred())
 
 		cfg, err := config.LoadConfig(configFile)
 		Expect(err).NotTo(HaveOccurred())
-		Expect(cfg.ModelConfig["large-model"].ParamCount).To(Equal(1.7976931348623157e+308))
-		Expect(cfg.GPUConfig.FLOPS).To(Equal(1e20))
+		Expect(cfg.ModelConfig["large-model"].PIIPolicy.AllowByDefault).To(BeTrue())
 	})
 
 	It("should handle special string values", func() {
```

src/semantic-router/pkg/extproc/router.go

Lines changed: 1 addition & 3 deletions
```diff
@@ -14,7 +14,6 @@ import (
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/tools"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/classification"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii"
-	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/ttft"
 )
 
 var (
@@ -132,8 +131,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
 
 	// Create utility components
 	piiChecker := pii.NewPolicyChecker(cfg, cfg.ModelConfig)
-	ttftCalculator := ttft.NewCalculator(cfg.GPUConfig)
-	modelTTFT := ttftCalculator.InitializeModelTTFT(cfg)
+	modelTTFT := make(map[string]float64) // Empty TTFT map since load balancing is disabled
 	classifier := classification.NewClassifier(cfg, categoryMapping, piiMapping, jailbreakMapping, modelTTFT)
 
 	// Create global classification service for API access
```
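With the calculator gone, NewOpenAIRouter hands the classifier an always-empty map. The practical effect follows from ordinary Go map semantics (the routing interpretation in the comment is an inference from the inline note in the diff):

```go
// Any lookup in the empty map yields the zero value, so every model
// reports a TTFT of 0 and TTFT no longer differentiates candidates.
modelTTFT := make(map[string]float64)
ttftSeconds := modelTTFT["phi4"] // 0 for every model name
_ = ttftSeconds
```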

src/semantic-router/pkg/utils/ttft/calculator.go

Lines changed: 0 additions & 64 deletions
This file was deleted.
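The deleted file's body is not shown in this diff. As a rough illustration only, a roofline-style estimate built from the removed gpu_config fields (flops, hbm) and model fields (param_count, context_size) might have looked like the sketch below; the formula and names are assumptions, not the deleted code:

```go
package ttft

import "math"

// estimateTTFT is an illustrative roofline bound: prefill is limited either
// by compute (≈ 2·paramCount·promptTokens FLOPs) or by streaming the fp16
// weights from HBM (≈ 2 bytes per parameter).
func estimateTTFT(paramCount, promptTokens, flops, hbmBytesPerSec float64) float64 {
	computeSeconds := 2 * paramCount * promptTokens / flops
	weightReadSeconds := 2 * paramCount / hbmBytesPerSec
	return math.Max(computeSeconds, weightReadSeconds)
}
```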

0 commit comments