From 348c74079bd90ee0ac9ec6a75f3a723798ea442d Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Mon, 8 Sep 2025 13:26:16 -0400
Subject: [PATCH 1/2] chore: remove GPU and model params from config. Backend-
 and model-aware optimization will be handled in the control plane

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 config/config.yaml                            | 13 ---
 deploy/kubernetes/config.yaml                 | 13 ---
 src/semantic-router/pkg/config/config.go      | 52 +---------
 src/semantic-router/pkg/config/config_test.go | 95 +------------------
 src/semantic-router/pkg/extproc/router.go     |  4 +-
 .../pkg/utils/ttft/calculator.go              | 64 -------------
 .../pkg/utils/ttft/calculator_test.go         | 56 -----------
 src/training/model_eval/result_to_config.py   |  5 -
 website/docs/getting-started/configuration.md |  6 --
 9 files changed, 7 insertions(+), 301 deletions(-)
 delete mode 100644 src/semantic-router/pkg/utils/ttft/calculator.go
 delete mode 100644 src/semantic-router/pkg/utils/ttft/calculator_test.go

diff --git a/config/config.yaml b/config/config.yaml
index d72624e8..3b74778a 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000  # 312e12 fp16
-  hbm: 2000000000000      # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,9 +48,6 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000  # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0  # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
       currency: USD
       prompt_per_1m: 0.07
@@ -65,9 +58,6 @@ model_config:
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000  # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.067
@@ -77,9 +67,6 @@ model_config:
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.1
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
index 9813f5c2..9470b1ce 100644
--- a/deploy/kubernetes/config.yaml
+++ b/deploy/kubernetes/config.yaml
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000  # 312e12 fp16
-  hbm: 2000000000000      # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,26 +48,17 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000  # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0  # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pii_policy:
       allow_by_default: false  # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000  # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false  # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false  # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
index 85ee09f1..1965ec6b 100644
--- a/src/semantic-router/pkg/config/config.go
+++ b/src/semantic-router/pkg/config/config.go
@@ -53,9 +53,6 @@ type RouterConfig struct {
 	// Model parameters configuration
 	ModelConfig map[string]ModelParams `yaml:"model_config"`
 
-	// GPU configuration for TTFT calculation
-	GPUConfig GPUConfig `yaml:"gpu_config"`
-
 	// Tools configuration for automatic tool selection
 	Tools ToolsConfig `yaml:"tools"`
 
@@ -191,7 +188,7 @@ type VLLMEndpoint struct {
 	HealthCheckPath string `yaml:"health_check_path,omitempty"`
 }
 
-// ModelParams represents configuration for model-specific parameters
+// ModelPricing represents per-model pricing configuration (currency and per-1M-token rates)
 type ModelPricing struct {
 	// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
 	Currency string `yaml:"currency,omitempty"`
@@ -202,15 +199,6 @@ type ModelPricing struct {
 }
 
 type ModelParams struct {
-	// Number of parameters in the model
-	ParamCount float64 `yaml:"param_count"`
-
-	// Default batch size for this model
-	BatchSize float64 `yaml:"batch_size"`
-
-	// Default context size for this model
-	ContextSize float64 `yaml:"context_size"`
-
 	// PII policy configuration for this model
 	PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"`
 
@@ -252,18 +240,6 @@ const (
 	PIITypeZipCode         = "ZIP_CODE"          // ZIP/Postal codes
 )
 
-// GPUConfig represents configuration for GPU parameters used in TTFT calculation
-type GPUConfig struct {
-	// FLOPs performance in operations per second
-	FLOPS float64 `yaml:"flops"`
-
-	// HBM memory bandwidth in bytes per second
-	HBM float64 `yaml:"hbm"`
-
-	// Description of the GPU configuration (e.g., "A100-80G")
-	Description string `yaml:"description"`
-}
-
 // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache
 func (c *RouterConfig) GetCacheSimilarityThreshold() float32 {
 	if c.SemanticCache.SimilarityThreshold != nil {
@@ -349,32 +325,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
 	return c.DefaultModel
 }
 
-// GetModelParamCount returns the parameter count for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ParamCount
-	}
-	return defaultValue
-}
-
-// GetModelBatchSize returns the batch size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.BatchSize
-	}
-	return defaultValue
-}
-
-// GetModelContextSize returns the context size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ContextSize
-	}
-	return defaultValue
-}
 
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
index 36338c66..5ab69746 100644
--- a/src/semantic-router/pkg/config/config_test.go
+++ b/src/semantic-router/pkg/config/config_test.go
@@ -105,26 +105,15 @@ vllm_endpoints:
 
 model_config:
   "model-a":
-    param_count: 1000000000
-    batch_size: 32
-    context_size: 8192
     pii_policy:
       allow_by_default: false
       pii_types_allowed: ["NO_PII", "ORGANIZATION"]
     preferred_endpoints: ["endpoint1"]
   "model-b":
-    param_count: 175000000
-    batch_size: 64
-    context_size: 4096
     pii_policy:
       allow_by_default: true
     preferred_endpoints: ["endpoint1", "endpoint2"]
 
-gpu_config:
-  flops: 312000000000000
-  hbm: 2000000000000
-  description: "A100-80G"
-
 tools:
   enabled: true
   top_k: 5
@@ -172,14 +161,9 @@ tools:
 
 				// Verify model config
 				Expect(cfg.ModelConfig).To(HaveKey("model-a"))
-				Expect(cfg.ModelConfig["model-a"].ParamCount).To(Equal(float64(1000000000)))
 				Expect(cfg.ModelConfig["model-a"].PIIPolicy.AllowByDefault).To(BeFalse())
 				Expect(cfg.ModelConfig["model-a"].PIIPolicy.PIITypes).To(ContainElements("NO_PII", "ORGANIZATION"))
 
-				// Verify GPU config
-				Expect(cfg.GPUConfig.FLOPS).To(Equal(float64(312000000000000)))
-				Expect(cfg.GPUConfig.Description).To(Equal("A100-80G"))
-
 				// Verify tools config
 				Expect(cfg.Tools.Enabled).To(BeTrue())
 				Expect(cfg.Tools.TopK).To(Equal(5))
@@ -430,7 +414,8 @@ model_config:
     pii_policy:
       allow_by_default: true
   "unconfigured-model":
-    param_count: 1000000
+    pii_policy:
+      allow_by_default: true
 `
 			err := os.WriteFile(configFile, []byte(configContent), 0o644)
 			Expect(err).NotTo(HaveOccurred())
@@ -646,73 +631,6 @@ prompt_guard:
 		})
 	})
 
-	Describe("Model Parameter Functions", func() {
-		BeforeEach(func() {
-			configContent := `
-model_config:
-  "configured-model":
-    param_count: 175000000
-    batch_size: 32
-    context_size: 4096
-`
-			err := os.WriteFile(configFile, []byte(configContent), 0o644)
-			Expect(err).NotTo(HaveOccurred())
-		})
-
-		Describe("GetModelParamCount", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("configured-model", 1000000)
-				Expect(count).To(Equal(float64(175000000)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("unknown-model", 999999)
-				Expect(count).To(Equal(float64(999999)))
-			})
-		})
-
-		Describe("GetModelBatchSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("configured-model", 16)
-				Expect(batchSize).To(Equal(float64(32)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("unknown-model", 64)
-				Expect(batchSize).To(Equal(float64(64)))
-			})
-		})
-
-		Describe("GetModelContextSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("configured-model", 2048)
-				Expect(contextSize).To(Equal(float64(4096)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("unknown-model", 8192)
-				Expect(contextSize).To(Equal(float64(8192)))
-			})
-		})
-	})
 
 	Describe("GetCategoryDescriptions", func() {
 		Context("with categories having descriptions", func() {
@@ -805,18 +723,15 @@ semantic_cache:
 			configContent := `
 model_config:
   "large-model":
-    param_count: 1.7976931348623157e+308
-gpu_config:
-  flops: 1e20
-  hbm: 1e15
+    pii_policy:
+      allow_by_default: true
 `
 			err := os.WriteFile(configFile, []byte(configContent), 0o644)
 			Expect(err).NotTo(HaveOccurred())
 
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
-			Expect(cfg.ModelConfig["large-model"].ParamCount).To(Equal(1.7976931348623157e+308))
-			Expect(cfg.GPUConfig.FLOPS).To(Equal(1e20))
+			Expect(cfg.ModelConfig["large-model"].PIIPolicy.AllowByDefault).To(BeTrue())
 		})
 
 		It("should handle special string values", func() {
diff --git a/src/semantic-router/pkg/extproc/router.go b/src/semantic-router/pkg/extproc/router.go
index 773fe1c8..9a327081 100644
--- a/src/semantic-router/pkg/extproc/router.go
+++ b/src/semantic-router/pkg/extproc/router.go
@@ -14,7 +14,6 @@ import (
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/tools"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/classification"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii"
-	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/ttft"
 )
 
 var (
@@ -129,8 +128,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
 
 	// Create utility components
 	piiChecker := pii.NewPolicyChecker(cfg, cfg.ModelConfig)
-	ttftCalculator := ttft.NewCalculator(cfg.GPUConfig)
-	modelTTFT := ttftCalculator.InitializeModelTTFT(cfg)
+	modelTTFT := make(map[string]float64) // Empty TTFT map since load balancing is disabled
 	classifier := classification.NewClassifier(cfg, categoryMapping, piiMapping, jailbreakMapping, modelTTFT)
 
 	// Create global classification service for API access
diff --git a/src/semantic-router/pkg/utils/ttft/calculator.go b/src/semantic-router/pkg/utils/ttft/calculator.go
deleted file mode 100644
index a67bf537..00000000
--- a/src/semantic-router/pkg/utils/ttft/calculator.go
+++ /dev/null
@@ -1,64 +0,0 @@
-package ttft
-
-import (
-	"github.com/vllm-project/semantic-router/semantic-router/pkg/config"
-)
-
-// Calculator handles TTFT (Time To First Token) calculations
-type Calculator struct {
-	GPUConfig config.GPUConfig
-}
-
-// NewCalculator creates a new TTFT calculator
-func NewCalculator(gpuConfig config.GPUConfig) *Calculator {
-	return &Calculator{
-		GPUConfig: gpuConfig,
-	}
-}
-
-// ComputeBaseTTFT computes base TTFT for a model using the formula based on
-// https://www.jinghong-chen.net/estimate-vram-usage-in-llm-inference/
-// TTFT = (2*N*b*s)/(FLOPs) + (2*N)/(HBM)
-// Parameters are loaded from config: model-specific (N, b, s) and GPU-specific (FLOPs, HBM)
-func (c *Calculator) ComputeBaseTTFT(modelName string, cfg *config.RouterConfig) float64 {
-	// Get model-specific parameters from config
-	defaultParamCount := 7e9    // Default to 7B if unknown
-	defaultBatchSize := 512.0   // Default batch size
-	defaultContextSize := 256.0 // Default context size
-
-	// Get model parameters
-	N := cfg.GetModelParamCount(modelName, defaultParamCount)
-	b := cfg.GetModelBatchSize(modelName, defaultBatchSize)
-	s := cfg.GetModelContextSize(modelName, defaultContextSize)
-
-	// Get GPU parameters from config
-	FLOPs := c.GPUConfig.FLOPS
-	HBM := c.GPUConfig.HBM
-
-	prefillCompute := 2 * N * b * s
-	prefillMemory := 2 * N
-
-	TTFT := (prefillCompute/FLOPs + prefillMemory/HBM) * 1000 // ms
-	return TTFT
-}
-
-// InitializeModelTTFT initializes TTFT map for all models in config
-func (c *Calculator) InitializeModelTTFT(cfg *config.RouterConfig) map[string]float64 {
-	modelTTFT := make(map[string]float64)
-
-	for _, cat := range cfg.Categories {
-		for _, modelScore := range cat.ModelScores {
-			if _, ok := modelTTFT[modelScore.Model]; !ok {
-				modelTTFT[modelScore.Model] = c.ComputeBaseTTFT(modelScore.Model, cfg)
-			}
-		}
-	}
-
-	if cfg.DefaultModel != "" {
-		if _, ok := modelTTFT[cfg.DefaultModel]; !ok {
-			modelTTFT[cfg.DefaultModel] = c.ComputeBaseTTFT(cfg.DefaultModel, cfg)
-		}
-	}
-
-	return modelTTFT
-}
diff --git a/src/semantic-router/pkg/utils/ttft/calculator_test.go b/src/semantic-router/pkg/utils/ttft/calculator_test.go
deleted file mode 100644
index bf4b3fa1..00000000
--- a/src/semantic-router/pkg/utils/ttft/calculator_test.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package ttft
-
-import (
-	"testing"
-
-	"github.com/vllm-project/semantic-router/semantic-router/pkg/config"
-)
-
-func TestComputeBaseTTFT(t *testing.T) {
-
-	gpuConfig := config.GPUConfig{
-		FLOPS: 1e12, // 1 TFLOP
-		HBM:   1e11, // 100 GB/s
-	}
-	calculator := NewCalculator(gpuConfig)
-
-	routerCfg := &config.RouterConfig{}
-	// Mock config methods if needed, or set up fields so that
-	// GetModelParamCount, GetModelBatchSize, GetModelContextSize return defaults
-
-	ttft := calculator.ComputeBaseTTFT("test-model", routerCfg)
-	if ttft <= 0 {
-		t.Errorf("Expected TTFT > 0, got %f", ttft)
-	}
-}
-
-func TestInitializeModelTTFT(t *testing.T) {
-	gpuConfig := config.GPUConfig{
-		FLOPS: 1e12,
-		HBM:   1e11,
-	}
-	calculator := NewCalculator(gpuConfig)
-
-	// Minimal mock config with two categories and models
-	routerCfg := &config.RouterConfig{
-		Categories: []config.Category{
-			{
-				ModelScores: []config.ModelScore{
-					{Model: "model-a", Score: 0.9},
-					{Model: "model-b", Score: 0.8},
-				},
-			},
-		},
-		DefaultModel: "model-default",
-	}
-
-	modelTTFT := calculator.InitializeModelTTFT(routerCfg)
-	if len(modelTTFT) != 3 {
-		t.Errorf("Expected 3 models in TTFT map, got %d", len(modelTTFT))
-	}
-	for model, ttft := range modelTTFT {
-		if ttft <= 0 {
-			t.Errorf("Model %s has non-positive TTFT: %f", model, ttft)
-		}
-	}
-}
diff --git a/src/training/model_eval/result_to_config.py b/src/training/model_eval/result_to_config.py
index 6d7feb79..1267bada 100644
--- a/src/training/model_eval/result_to_config.py
+++ b/src/training/model_eval/result_to_config.py
@@ -102,11 +102,6 @@ def generate_config_yaml(category_accuracies, similarity_threshold):
             "use_cpu": True,
             "jailbreak_mapping_path": "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json",
         },
-        "gpu_config": {
-            "flops": 312000000000000,  # 312e12 fp16
-            "hbm": 2000000000000,  # 2e12 (2 TB/s)
-            "description": "A100-80G",
-        },
         "classifier": {
             "category_model": {
                 "model_id": "models/category_classifier_modernbert-base_model",
diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md
index 51f42175..5f4cdadb 100644
--- a/website/docs/getting-started/configuration.md
+++ b/website/docs/getting-started/configuration.md
@@ -50,9 +50,6 @@ vllm_endpoints:
 # Model configuration
 model_config:
   "your-model":
-    param_count: 7000000000    # Model parameters
-    batch_size: 512.0
-    context_size: 4096.0
     pii_policy:
       allow_by_default: true
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"]
@@ -115,9 +112,6 @@ Configure model-specific settings:
 ```yaml
 model_config:
   "llama2-7b":
-    param_count: 7000000000     # Model size in parameters
-    batch_size: 512.0           # Batch size
-    context_size: 4096.0        # Context window
     pii_policy:
       allow_by_default: true    # Allow PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"]

From a931ee1f4d43de9f56c386f8f2de1883ad13a66f Mon Sep 17 00:00:00 2001
From: Huamin Chen <hchen@redhat.com>
Date: Mon, 8 Sep 2025 13:30:00 -0400
Subject: [PATCH 2/2] fix pre-commit issues

Signed-off-by: Huamin Chen <hchen@redhat.com>
---
 src/semantic-router/pkg/config/config.go      | 1 -
 src/semantic-router/pkg/config/config_test.go | 1 -
 2 files changed, 2 deletions(-)

diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
index 1965ec6b..e25c4e7d 100644
--- a/src/semantic-router/pkg/config/config.go
+++ b/src/semantic-router/pkg/config/config.go
@@ -325,7 +325,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
 	return c.DefaultModel
 }
 
-
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
 func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
index 5ab69746..106f9a6f 100644
--- a/src/semantic-router/pkg/config/config_test.go
+++ b/src/semantic-router/pkg/config/config_test.go
@@ -631,7 +631,6 @@ prompt_guard:
 		})
 	})
 
-
 	Describe("GetCategoryDescriptions", func() {
 		Context("with categories having descriptions", func() {
 			BeforeEach(func() {