
Commit 464ed6c

chore: remove GPU and model params in config. Backend and model aware optimization will be handled in the control plane (#93)
* chore: remove GPU and model params in config. Backend and model aware optimization will be handled in the control plane

  Signed-off-by: Huamin Chen <[email protected]>

* fix pre-commit issues

  Signed-off-by: Huamin Chen <[email protected]>

---------

Signed-off-by: Huamin Chen <[email protected]>
1 parent 9bca691 commit 464ed6c

File tree

9 files changed (+7, −303 lines)


config/config.yaml

Lines changed: 0 additions & 13 deletions
```diff
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,9 +48,6 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
       currency: USD
       prompt_per_1m: 0.07
@@ -65,9 +58,6 @@ model_config:
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.067
@@ -77,9 +67,6 @@ model_config:
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.1
```
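For orientation, here is a minimal sketch of reading the per-model fields that survive this trim. `LoadConfig` and `GetModelPricing` appear in the config.go and config_test.go diffs below; the import path is inferred from router.go's imports, and the program itself is illustrative:

```go
package main

import (
	"fmt"

	"github.com/vllm-project/semantic-router/semantic-router/pkg/config"
)

func main() {
	// param_count, batch_size, context_size, and gpu_config are gone; a model
	// entry now carries only pricing, PII policy, and preferred endpoints.
	cfg, err := config.LoadConfig("config/config.yaml")
	if err != nil {
		panic(err)
	}
	if prompt, completion, currency, ok := cfg.GetModelPricing("phi4"); ok {
		fmt.Printf("phi4 pricing: %v / %v %s per 1M tokens\n", prompt, completion, currency)
	}
	fmt.Println("phi4 allows PII by default:", cfg.ModelConfig["phi4"].PIIPolicy.AllowByDefault)
}
```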

deploy/kubernetes/config.yaml

Lines changed: 0 additions & 13 deletions
```diff
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,26 +48,17 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
```

src/semantic-router/pkg/config/config.go

Lines changed: 1 addition & 52 deletions
```diff
@@ -54,9 +54,6 @@ type RouterConfig struct {
 	// Model parameters configuration
 	ModelConfig map[string]ModelParams `yaml:"model_config"`
 
-	// GPU configuration for TTFT calculation
-	GPUConfig GPUConfig `yaml:"gpu_config"`
-
 	// Tools configuration for automatic tool selection
 	Tools ToolsConfig `yaml:"tools"`
 
@@ -192,7 +189,7 @@ type VLLMEndpoint struct {
 	HealthCheckPath string `yaml:"health_check_path,omitempty"`
 }
 
-// ModelParams represents configuration for model-specific parameters
+// ModelPricing represents configuration for model-specific parameters
 type ModelPricing struct {
 	// ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
 	Currency string `yaml:"currency,omitempty"`
@@ -203,15 +200,6 @@ type ModelPricing struct {
 }
 
 type ModelParams struct {
-	// Number of parameters in the model
-	ParamCount float64 `yaml:"param_count"`
-
-	// Default batch size for this model
-	BatchSize float64 `yaml:"batch_size"`
-
-	// Default context size for this model
-	ContextSize float64 `yaml:"context_size"`
-
 	// PII policy configuration for this model
 	PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"`
 
@@ -253,18 +241,6 @@ const (
 	PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes
 )
 
-// GPUConfig represents configuration for GPU parameters used in TTFT calculation
-type GPUConfig struct {
-	// FLOPs performance in operations per second
-	FLOPS float64 `yaml:"flops"`
-
-	// HBM memory bandwidth in bytes per second
-	HBM float64 `yaml:"hbm"`
-
-	// Description of the GPU configuration (e.g., "A100-80G")
-	Description string `yaml:"description"`
-}
-
 // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache
 func (c *RouterConfig) GetCacheSimilarityThreshold() float32 {
 	if c.SemanticCache.SimilarityThreshold != nil {
@@ -376,33 +352,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
 	return c.DefaultModel
 }
 
-// GetModelParamCount returns the parameter count for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ParamCount
-	}
-	return defaultValue
-}
-
-// GetModelBatchSize returns the batch size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.BatchSize
-	}
-	return defaultValue
-}
-
-// GetModelContextSize returns the context size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 {
-	if modelConfig, ok := c.ModelConfig[modelName]; ok {
-		return modelConfig.ContextSize
-	}
-	return defaultValue
-}
-
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
 func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
```
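Per the commit message, backend- and model-aware optimization moves to the control plane. Purely as an illustration of where the deleted knobs could migrate (nothing in this commit defines such types), a control-plane-side profile might look like:

```go
// Hypothetical control-plane types mirroring the fields deleted above;
// the names follow the removed YAML keys, not any API in this repository.
type ModelProfile struct {
	ParamCount  float64 // e.g. 14e9 for phi4
	BatchSize   float64 // scheduler batch size (the removed default was 512)
	ContextSize float64 // maximum context window in tokens
}

type GPUProfile struct {
	FLOPS       float64 // peak fp16 throughput in ops/s, e.g. 312e12 for an A100-80G
	HBM         float64 // memory bandwidth in bytes/s, e.g. 2e12 (2 TB/s)
	Description string  // human-readable label, e.g. "A100-80G"
}
```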

src/semantic-router/pkg/config/config_test.go

Lines changed: 5 additions & 91 deletions
```diff
@@ -105,26 +105,15 @@ vllm_endpoints:
 
 model_config:
   "model-a":
-    param_count: 1000000000
-    batch_size: 32
-    context_size: 8192
     pii_policy:
       allow_by_default: false
       pii_types_allowed: ["NO_PII", "ORGANIZATION"]
     preferred_endpoints: ["endpoint1"]
   "model-b":
-    param_count: 175000000
-    batch_size: 64
-    context_size: 4096
     pii_policy:
       allow_by_default: true
     preferred_endpoints: ["endpoint1", "endpoint2"]
 
-gpu_config:
-  flops: 312000000000000
-  hbm: 2000000000000
-  description: "A100-80G"
-
 tools:
   enabled: true
   top_k: 5
@@ -172,14 +161,9 @@
 
 	// Verify model config
 	Expect(cfg.ModelConfig).To(HaveKey("model-a"))
-	Expect(cfg.ModelConfig["model-a"].ParamCount).To(Equal(float64(1000000000)))
 	Expect(cfg.ModelConfig["model-a"].PIIPolicy.AllowByDefault).To(BeFalse())
 	Expect(cfg.ModelConfig["model-a"].PIIPolicy.PIITypes).To(ContainElements("NO_PII", "ORGANIZATION"))
 
-	// Verify GPU config
-	Expect(cfg.GPUConfig.FLOPS).To(Equal(float64(312000000000000)))
-	Expect(cfg.GPUConfig.Description).To(Equal("A100-80G"))
-
 	// Verify tools config
 	Expect(cfg.Tools.Enabled).To(BeTrue())
 	Expect(cfg.Tools.TopK).To(Equal(5))
@@ -430,7 +414,8 @@ model_config:
     pii_policy:
       allow_by_default: true
   "unconfigured-model":
-    param_count: 1000000
+    pii_policy:
+      allow_by_default: true
 `
 	err := os.WriteFile(configFile, []byte(configContent), 0o644)
 	Expect(err).NotTo(HaveOccurred())
@@ -646,74 +631,6 @@ prompt_guard:
 		})
 	})
 
-	Describe("Model Parameter Functions", func() {
-		BeforeEach(func() {
-			configContent := `
-model_config:
-  "configured-model":
-    param_count: 175000000
-    batch_size: 32
-    context_size: 4096
-`
-			err := os.WriteFile(configFile, []byte(configContent), 0o644)
-			Expect(err).NotTo(HaveOccurred())
-		})
-
-		Describe("GetModelParamCount", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("configured-model", 1000000)
-				Expect(count).To(Equal(float64(175000000)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				count := cfg.GetModelParamCount("unknown-model", 999999)
-				Expect(count).To(Equal(float64(999999)))
-			})
-		})
-
-		Describe("GetModelBatchSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("configured-model", 16)
-				Expect(batchSize).To(Equal(float64(32)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				batchSize := cfg.GetModelBatchSize("unknown-model", 64)
-				Expect(batchSize).To(Equal(float64(64)))
-			})
-		})
-
-		Describe("GetModelContextSize", func() {
-			It("should return configured value for existing model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("configured-model", 2048)
-				Expect(contextSize).To(Equal(float64(4096)))
-			})
-
-			It("should return default value for non-existent model", func() {
-				cfg, err := config.LoadConfig(configFile)
-				Expect(err).NotTo(HaveOccurred())
-
-				contextSize := cfg.GetModelContextSize("unknown-model", 8192)
-				Expect(contextSize).To(Equal(float64(8192)))
-			})
-		})
-	})
-
 	Describe("GetCategoryDescriptions", func() {
 		Context("with categories having descriptions", func() {
 			BeforeEach(func() {
@@ -805,18 +722,15 @@ semantic_cache:
 		configContent := `
 model_config:
   "large-model":
-    param_count: 1.7976931348623157e+308
-gpu_config:
-  flops: 1e20
-  hbm: 1e15
+    pii_policy:
+      allow_by_default: true
 `
 		err := os.WriteFile(configFile, []byte(configContent), 0o644)
 		Expect(err).NotTo(HaveOccurred())
 
 		cfg, err := config.LoadConfig(configFile)
 		Expect(err).NotTo(HaveOccurred())
-		Expect(cfg.ModelConfig["large-model"].ParamCount).To(Equal(1.7976931348623157e+308))
-		Expect(cfg.GPUConfig.FLOPS).To(Equal(1e20))
+		Expect(cfg.ModelConfig["large-model"].PIIPolicy.AllowByDefault).To(BeTrue())
 	})
 
 	It("should handle special string values", func() {
```

src/semantic-router/pkg/extproc/router.go

Lines changed: 1 addition & 3 deletions
```diff
@@ -14,7 +14,6 @@ import (
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/tools"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/classification"
 	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii"
-	"github.com/vllm-project/semantic-router/semantic-router/pkg/utils/ttft"
 )
 
 var (
@@ -132,8 +131,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
 
 	// Create utility components
 	piiChecker := pii.NewPolicyChecker(cfg, cfg.ModelConfig)
-	ttftCalculator := ttft.NewCalculator(cfg.GPUConfig)
-	modelTTFT := ttftCalculator.InitializeModelTTFT(cfg)
+	modelTTFT := make(map[string]float64) // Empty TTFT map since load balancing is disabled
 	classifier := classification.NewClassifier(cfg, categoryMapping, piiMapping, jailbreakMapping, modelTTFT)
 
 	// Create global classification service for API access
```
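With the calculator gone, NewOpenAIRouter hands the classifier an always-empty map. The practical effect follows from ordinary Go map semantics (the routing interpretation in the comment is an inference from the inline note in the diff):

```go
// Any lookup in the empty map yields the zero value, so every model
// reports a TTFT of 0 and TTFT no longer differentiates candidates.
modelTTFT := make(map[string]float64)
ttftSeconds := modelTTFT["phi4"] // 0 for every model name
_ = ttftSeconds
```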

src/semantic-router/pkg/utils/ttft/calculator.go

Lines changed: 0 additions & 64 deletions
This file was deleted.
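The deleted file's body is not shown in this diff. As a rough illustration only, a roofline-style estimate built from the removed gpu_config fields (flops, hbm) and model fields (param_count, context_size) might have looked like the sketch below; the formula and names are assumptions, not the deleted code:

```go
package ttft

import "math"

// estimateTTFT is an illustrative roofline bound: prefill is limited either
// by compute (≈ 2·paramCount·promptTokens FLOPs) or by streaming the fp16
// weights from HBM (≈ 2 bytes per parameter).
func estimateTTFT(paramCount, promptTokens, flops, hbmBytesPerSec float64) float64 {
	computeSeconds := 2 * paramCount * promptTokens / flops
	weightReadSeconds := 2 * paramCount / hbmBytesPerSec
	return math.Max(computeSeconds, weightReadSeconds)
}
```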

0 commit comments