From 348c74079bd90ee0ac9ec6a75f3a723798ea442d Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Mon, 8 Sep 2025 13:26:16 -0400 Subject: [PATCH 1/2] chore: remove GPU and model params in config. Backend and model aware optimization will be handled in the control plane Signed-off-by: Huamin Chen --- config/config.yaml | 13 --- deploy/kubernetes/config.yaml | 13 --- src/semantic-router/pkg/config/config.go | 52 +--------- src/semantic-router/pkg/config/config_test.go | 95 +------------------ src/semantic-router/pkg/extproc/router.go | 4 +- .../pkg/utils/ttft/calculator.go | 64 ------------- .../pkg/utils/ttft/calculator_test.go | 56 ----------- src/training/model_eval/result_to_config.py | 5 - website/docs/getting-started/configuration.md | 6 -- 9 files changed, 7 insertions(+), 301 deletions(-) delete mode 100644 src/semantic-router/pkg/utils/ttft/calculator.go delete mode 100644 src/semantic-router/pkg/utils/ttft/calculator_test.go diff --git a/config/config.yaml b/config/config.yaml index d72624e8..3b74778a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -20,10 +20,6 @@ prompt_guard: threshold: 0.7 use_cpu: true jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" -gpu_config: - flops: 312000000000000 # 312e12 fp16 - hbm: 2000000000000 # 2e12 (2 TB/s) - description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models vllm_endpoints: @@ -52,9 +48,6 @@ vllm_endpoints: model_config: phi4: - param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 - batch_size: 512.0 # vLLM default batch size - context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 pricing: currency: USD prompt_per_1m: 0.07 @@ -65,9 +58,6 @@ model_config: # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) preferred_endpoints: ["endpoint1", "endpoint3"] gemma3:27b: - param_count: 27000000000 # 27B parameters (base version) - batch_size: 512.0 - context_size: 16384.0 pricing: currency: USD prompt_per_1m: 0.067 @@ -77,9 +67,6 @@ model_config: pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types preferred_endpoints: ["endpoint1"] "mistral-small3.1": - param_count: 22000000000 - batch_size: 512.0 - context_size: 16384.0 pricing: currency: USD prompt_per_1m: 0.1 diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml index 9813f5c2..9470b1ce 100644 --- a/deploy/kubernetes/config.yaml +++ b/deploy/kubernetes/config.yaml @@ -20,10 +20,6 @@ prompt_guard: threshold: 0.7 use_cpu: true jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" -gpu_config: - flops: 312000000000000 # 312e12 fp16 - hbm: 2000000000000 # 2e12 (2 TB/s) - description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models vllm_endpoints: @@ -52,26 +48,17 @@ vllm_endpoints: model_config: phi4: - param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4 - batch_size: 512.0 # vLLM default batch size - context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4 pii_policy: allow_by_default: false # Deny all PII by default pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model) preferred_endpoints: ["endpoint1", "endpoint3"] gemma3:27b: - param_count: 27000000000 # 27B parameters (base version) - batch_size: 512.0 - context_size: 16384.0 pii_policy: allow_by_default: false # Deny all PII by default pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types preferred_endpoints: ["endpoint1"] "mistral-small3.1": - param_count: 22000000000 - batch_size: 512.0 - context_size: 16384.0 pii_policy: allow_by_default: false # Deny all PII by default pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 85ee09f1..1965ec6b 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -53,9 +53,6 @@ type RouterConfig struct { // Model parameters configuration ModelConfig map[string]ModelParams `yaml:"model_config"` - // GPU configuration for TTFT calculation - GPUConfig GPUConfig `yaml:"gpu_config"` - // Tools configuration for automatic tool selection Tools ToolsConfig `yaml:"tools"` @@ -191,7 +188,7 @@ type VLLMEndpoint struct { HealthCheckPath string `yaml:"health_check_path,omitempty"` } -// ModelParams represents configuration for model-specific parameters +// ModelPricing represents configuration for model-specific parameters type ModelPricing struct { // ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted. Currency string `yaml:"currency,omitempty"` @@ -202,15 +199,6 @@ type ModelPricing struct { } type ModelParams struct { - // Number of parameters in the model - ParamCount float64 `yaml:"param_count"` - - // Default batch size for this model - BatchSize float64 `yaml:"batch_size"` - - // Default context size for this model - ContextSize float64 `yaml:"context_size"` - // PII policy configuration for this model PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"` @@ -252,18 +240,6 @@ const ( PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes ) -// GPUConfig represents configuration for GPU parameters used in TTFT calculation -type GPUConfig struct { - // FLOPs performance in operations per second - FLOPS float64 `yaml:"flops"` - - // HBM memory bandwidth in bytes per second - HBM float64 `yaml:"hbm"` - - // Description of the GPU configuration (e.g., "A100-80G") - Description string `yaml:"description"` -} - // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache func (c *RouterConfig) GetCacheSimilarityThreshold() float32 { if c.SemanticCache.SimilarityThreshold != nil { @@ -349,32 +325,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string { return c.DefaultModel } -// GetModelParamCount returns the parameter count for a given model -// If the model is not found in the config, returns the default value -func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 { - if modelConfig, ok := c.ModelConfig[modelName]; ok { - return modelConfig.ParamCount - } - return defaultValue -} - -// GetModelBatchSize returns the batch size for a given model -// If the model is not found in the config, returns the default value -func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 { - if modelConfig, ok := c.ModelConfig[modelName]; ok { - return modelConfig.BatchSize - } - return defaultValue -} - -// GetModelContextSize returns the context size for a given model -// If the model is not found in the config, returns the default value -func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 { - if modelConfig, ok := c.ModelConfig[modelName]; ok { - return modelConfig.ContextSize - } - return defaultValue -} // GetModelPricing returns pricing per 1M tokens and its currency for the given model. // The currency indicates the unit of the returned rates (e.g., "USD"). diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 36338c66..5ab69746 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -105,26 +105,15 @@ vllm_endpoints: model_config: "model-a": - param_count: 1000000000 - batch_size: 32 - context_size: 8192 pii_policy: allow_by_default: false pii_types_allowed: ["NO_PII", "ORGANIZATION"] preferred_endpoints: ["endpoint1"] "model-b": - param_count: 175000000 - batch_size: 64 - context_size: 4096 pii_policy: allow_by_default: true preferred_endpoints: ["endpoint1", "endpoint2"] -gpu_config: - flops: 312000000000000 - hbm: 2000000000000 - description: "A100-80G" - tools: enabled: true top_k: 5 @@ -172,14 +161,9 @@ tools: // Verify model config Expect(cfg.ModelConfig).To(HaveKey("model-a")) - Expect(cfg.ModelConfig["model-a"].ParamCount).To(Equal(float64(1000000000))) Expect(cfg.ModelConfig["model-a"].PIIPolicy.AllowByDefault).To(BeFalse()) Expect(cfg.ModelConfig["model-a"].PIIPolicy.PIITypes).To(ContainElements("NO_PII", "ORGANIZATION")) - // Verify GPU config - Expect(cfg.GPUConfig.FLOPS).To(Equal(float64(312000000000000))) - Expect(cfg.GPUConfig.Description).To(Equal("A100-80G")) - // Verify tools config Expect(cfg.Tools.Enabled).To(BeTrue()) Expect(cfg.Tools.TopK).To(Equal(5)) @@ -430,7 +414,8 @@ model_config: pii_policy: allow_by_default: true "unconfigured-model": - param_count: 1000000 + pii_policy: + allow_by_default: true ` err := os.WriteFile(configFile, []byte(configContent), 0o644) Expect(err).NotTo(HaveOccurred()) @@ -646,73 +631,6 @@ prompt_guard: }) }) - Describe("Model Parameter Functions", func() { - BeforeEach(func() { - configContent := ` -model_config: - "configured-model": - param_count: 175000000 - batch_size: 32 - context_size: 4096 -` - err := os.WriteFile(configFile, []byte(configContent), 0o644) - Expect(err).NotTo(HaveOccurred()) - }) - - Describe("GetModelParamCount", func() { - It("should return configured value for existing model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - count := cfg.GetModelParamCount("configured-model", 1000000) - Expect(count).To(Equal(float64(175000000))) - }) - - It("should return default value for non-existent model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - count := cfg.GetModelParamCount("unknown-model", 999999) - Expect(count).To(Equal(float64(999999))) - }) - }) - - Describe("GetModelBatchSize", func() { - It("should return configured value for existing model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - batchSize := cfg.GetModelBatchSize("configured-model", 16) - Expect(batchSize).To(Equal(float64(32))) - }) - - It("should return default value for non-existent model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - batchSize := cfg.GetModelBatchSize("unknown-model", 64) - Expect(batchSize).To(Equal(float64(64))) - }) - }) - - Describe("GetModelContextSize", func() { - It("should return configured value for existing model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - contextSize := cfg.GetModelContextSize("configured-model", 2048) - Expect(contextSize).To(Equal(float64(4096))) - }) - - It("should return default value for non-existent model", func() { - cfg, err := config.LoadConfig(configFile) - Expect(err).NotTo(HaveOccurred()) - - contextSize := cfg.GetModelContextSize("unknown-model", 8192) - Expect(contextSize).To(Equal(float64(8192))) - }) - }) - }) Describe("GetCategoryDescriptions", func() { Context("with categories having descriptions", func() { @@ -805,18 +723,15 @@ semantic_cache: configContent := ` model_config: "large-model": - param_count: 1.7976931348623157e+308 -gpu_config: - flops: 1e20 - hbm: 1e15 + pii_policy: + allow_by_default: true ` err := os.WriteFile(configFile, []byte(configContent), 0o644) Expect(err).NotTo(HaveOccurred()) cfg, err := config.LoadConfig(configFile) Expect(err).NotTo(HaveOccurred()) - Expect(cfg.ModelConfig["large-model"].ParamCount).To(Equal(1.7976931348623157e+308)) - Expect(cfg.GPUConfig.FLOPS).To(Equal(1e20)) + Expect(cfg.ModelConfig["large-model"].PIIPolicy.AllowByDefault).To(BeTrue()) }) It("should handle special string values", func() { diff --git a/src/semantic-router/pkg/extproc/router.go b/src/semantic-router/pkg/extproc/router.go index 773fe1c8..9a327081 100644 --- a/src/semantic-router/pkg/extproc/router.go +++ b/src/semantic-router/pkg/extproc/router.go @@ -14,7 +14,6 @@ import ( "github.com/vllm-project/semantic-router/semantic-router/pkg/tools" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/classification" "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii" - "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/ttft" ) var ( @@ -129,8 +128,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) { // Create utility components piiChecker := pii.NewPolicyChecker(cfg, cfg.ModelConfig) - ttftCalculator := ttft.NewCalculator(cfg.GPUConfig) - modelTTFT := ttftCalculator.InitializeModelTTFT(cfg) + modelTTFT := make(map[string]float64) // Empty TTFT map since load balancing is disabled classifier := classification.NewClassifier(cfg, categoryMapping, piiMapping, jailbreakMapping, modelTTFT) // Create global classification service for API access diff --git a/src/semantic-router/pkg/utils/ttft/calculator.go b/src/semantic-router/pkg/utils/ttft/calculator.go deleted file mode 100644 index a67bf537..00000000 --- a/src/semantic-router/pkg/utils/ttft/calculator.go +++ /dev/null @@ -1,64 +0,0 @@ -package ttft - -import ( - "github.com/vllm-project/semantic-router/semantic-router/pkg/config" -) - -// Calculator handles TTFT (Time To First Token) calculations -type Calculator struct { - GPUConfig config.GPUConfig -} - -// NewCalculator creates a new TTFT calculator -func NewCalculator(gpuConfig config.GPUConfig) *Calculator { - return &Calculator{ - GPUConfig: gpuConfig, - } -} - -// ComputeBaseTTFT computes base TTFT for a model using the formula based on -// https://www.jinghong-chen.net/estimate-vram-usage-in-llm-inference/ -// TTFT = (2*N*b*s)/(FLOPs) + (2*N)/(HBM) -// Parameters are loaded from config: model-specific (N, b, s) and GPU-specific (FLOPs, HBM) -func (c *Calculator) ComputeBaseTTFT(modelName string, cfg *config.RouterConfig) float64 { - // Get model-specific parameters from config - defaultParamCount := 7e9 // Default to 7B if unknown - defaultBatchSize := 512.0 // Default batch size - defaultContextSize := 256.0 // Default context size - - // Get model parameters - N := cfg.GetModelParamCount(modelName, defaultParamCount) - b := cfg.GetModelBatchSize(modelName, defaultBatchSize) - s := cfg.GetModelContextSize(modelName, defaultContextSize) - - // Get GPU parameters from config - FLOPs := c.GPUConfig.FLOPS - HBM := c.GPUConfig.HBM - - prefillCompute := 2 * N * b * s - prefillMemory := 2 * N - - TTFT := (prefillCompute/FLOPs + prefillMemory/HBM) * 1000 // ms - return TTFT -} - -// InitializeModelTTFT initializes TTFT map for all models in config -func (c *Calculator) InitializeModelTTFT(cfg *config.RouterConfig) map[string]float64 { - modelTTFT := make(map[string]float64) - - for _, cat := range cfg.Categories { - for _, modelScore := range cat.ModelScores { - if _, ok := modelTTFT[modelScore.Model]; !ok { - modelTTFT[modelScore.Model] = c.ComputeBaseTTFT(modelScore.Model, cfg) - } - } - } - - if cfg.DefaultModel != "" { - if _, ok := modelTTFT[cfg.DefaultModel]; !ok { - modelTTFT[cfg.DefaultModel] = c.ComputeBaseTTFT(cfg.DefaultModel, cfg) - } - } - - return modelTTFT -} diff --git a/src/semantic-router/pkg/utils/ttft/calculator_test.go b/src/semantic-router/pkg/utils/ttft/calculator_test.go deleted file mode 100644 index bf4b3fa1..00000000 --- a/src/semantic-router/pkg/utils/ttft/calculator_test.go +++ /dev/null @@ -1,56 +0,0 @@ -package ttft - -import ( - "testing" - - "github.com/vllm-project/semantic-router/semantic-router/pkg/config" -) - -func TestComputeBaseTTFT(t *testing.T) { - - gpuConfig := config.GPUConfig{ - FLOPS: 1e12, // 1 TFLOP - HBM: 1e11, // 100 GB/s - } - calculator := NewCalculator(gpuConfig) - - routerCfg := &config.RouterConfig{} - // Mock config methods if needed, or set up fields so that - // GetModelParamCount, GetModelBatchSize, GetModelContextSize return defaults - - ttft := calculator.ComputeBaseTTFT("test-model", routerCfg) - if ttft <= 0 { - t.Errorf("Expected TTFT > 0, got %f", ttft) - } -} - -func TestInitializeModelTTFT(t *testing.T) { - gpuConfig := config.GPUConfig{ - FLOPS: 1e12, - HBM: 1e11, - } - calculator := NewCalculator(gpuConfig) - - // Minimal mock config with two categories and models - routerCfg := &config.RouterConfig{ - Categories: []config.Category{ - { - ModelScores: []config.ModelScore{ - {Model: "model-a", Score: 0.9}, - {Model: "model-b", Score: 0.8}, - }, - }, - }, - DefaultModel: "model-default", - } - - modelTTFT := calculator.InitializeModelTTFT(routerCfg) - if len(modelTTFT) != 3 { - t.Errorf("Expected 3 models in TTFT map, got %d", len(modelTTFT)) - } - for model, ttft := range modelTTFT { - if ttft <= 0 { - t.Errorf("Model %s has non-positive TTFT: %f", model, ttft) - } - } -} diff --git a/src/training/model_eval/result_to_config.py b/src/training/model_eval/result_to_config.py index 6d7feb79..1267bada 100644 --- a/src/training/model_eval/result_to_config.py +++ b/src/training/model_eval/result_to_config.py @@ -102,11 +102,6 @@ def generate_config_yaml(category_accuracies, similarity_threshold): "use_cpu": True, "jailbreak_mapping_path": "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json", }, - "gpu_config": { - "flops": 312000000000000, # 312e12 fp16 - "hbm": 2000000000000, # 2e12 (2 TB/s) - "description": "A100-80G", - }, "classifier": { "category_model": { "model_id": "models/category_classifier_modernbert-base_model", diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md index 51f42175..5f4cdadb 100644 --- a/website/docs/getting-started/configuration.md +++ b/website/docs/getting-started/configuration.md @@ -50,9 +50,6 @@ vllm_endpoints: # Model configuration model_config: "your-model": - param_count: 7000000000 # Model parameters - batch_size: 512.0 - context_size: 4096.0 pii_policy: allow_by_default: true pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"] @@ -115,9 +112,6 @@ Configure model-specific settings: ```yaml model_config: "llama2-7b": - param_count: 7000000000 # Model size in parameters - batch_size: 512.0 # Batch size - context_size: 4096.0 # Context window pii_policy: allow_by_default: true # Allow PII by default pii_types_allowed: ["EMAIL_ADDRESS", "PERSON"] From a931ee1f4d43de9f56c386f8f2de1883ad13a66f Mon Sep 17 00:00:00 2001 From: Huamin Chen Date: Mon, 8 Sep 2025 13:30:00 -0400 Subject: [PATCH 2/2] fix pre-commit issues Signed-off-by: Huamin Chen --- src/semantic-router/pkg/config/config.go | 1 - src/semantic-router/pkg/config/config_test.go | 1 - 2 files changed, 2 deletions(-) diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 1965ec6b..e25c4e7d 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -325,7 +325,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string { return c.DefaultModel } - // GetModelPricing returns pricing per 1M tokens and its currency for the given model. // The currency indicates the unit of the returned rates (e.g., "USD"). func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) { diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 5ab69746..106f9a6f 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -631,7 +631,6 @@ prompt_guard: }) }) - Describe("GetCategoryDescriptions", func() { Context("with categories having descriptions", func() { BeforeEach(func() {