config/config.yaml (13 changes: 0 additions & 13 deletions)
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,9 +48,6 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pricing:
       currency: USD
       prompt_per_1m: 0.07
@@ -65,9 +58,6 @@ model_config:
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.067
@@ -77,9 +67,6 @@ model_config:
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pricing:
       currency: USD
       prompt_per_1m: 0.1
deploy/kubernetes/config.yaml (13 changes: 0 additions & 13 deletions)
@@ -20,10 +20,6 @@ prompt_guard:
   threshold: 0.7
   use_cpu: true
   jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"
-gpu_config:
-  flops: 312000000000000 # 312e12 fp16
-  hbm: 2000000000000 # 2e12 (2 TB/s)
-  description: "A100-80G" # https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/a100/pdf/nvidia-a100-datasheet-us-nvidia-1758950-r4-web.pdf
 
 # vLLM Endpoints Configuration - supports multiple endpoints, each can serve multiple models
 vllm_endpoints:
@@ -52,26 +48,17 @@ vllm_endpoints:
 
 model_config:
   phi4:
-    param_count: 14000000000 # 14B parameters https://huggingface.co/microsoft/phi-4
-    batch_size: 512.0 # vLLM default batch size
-    context_size: 16384.0 # based on https://huggingface.co/microsoft/phi-4
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     # Specify which endpoints can serve this model (optional - if not specified, uses all endpoints that list this model)
     preferred_endpoints: ["endpoint1", "endpoint3"]
   gemma3:27b:
-    param_count: 27000000000 # 27B parameters (base version)
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
     preferred_endpoints: ["endpoint1"]
   "mistral-small3.1":
-    param_count: 22000000000
-    batch_size: 512.0
-    context_size: 16384.0
     pii_policy:
       allow_by_default: false # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"] # Only allow these specific PII types
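After this change, the pii_policy blocks kept above are the only per-model knobs besides pricing: a detected PII type is denied unless allow_by_default is true or the type appears in pii_types_allowed. Below is a minimal standalone sketch of that check, assuming the AllowByDefault/PIITypes field names visible in config.go and its tests; it is an illustration, not the repo's actual pii.PolicyChecker.

package main

import "fmt"

// PIIPolicy mirrors the model_config pii_policy block: deny-by-default
// with an explicit allow-list of PII types.
type PIIPolicy struct {
    AllowByDefault bool
    PIITypes       []string
}

// Allows reports whether a detected PII type may pass through.
func (p PIIPolicy) Allows(piiType string) bool {
    if p.AllowByDefault {
        return true
    }
    for _, t := range p.PIITypes {
        if t == piiType {
            return true
        }
    }
    return false
}

func main() {
    policy := PIIPolicy{
        AllowByDefault: false,
        PIITypes:       []string{"EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"},
    }
    fmt.Println(policy.Allows("EMAIL_ADDRESS")) // true: explicitly allowed
    fmt.Println(policy.Allows("CREDIT_CARD"))   // false: denied by default
}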
src/semantic-router/pkg/config/config.go (53 changes: 1 addition & 52 deletions)
@@ -54,9 +54,6 @@ type RouterConfig struct {
     // Model parameters configuration
     ModelConfig map[string]ModelParams `yaml:"model_config"`
 
-    // GPU configuration for TTFT calculation
-    GPUConfig GPUConfig `yaml:"gpu_config"`
-
     // Tools configuration for automatic tool selection
     Tools ToolsConfig `yaml:"tools"`
 
@@ -192,7 +189,7 @@ type VLLMEndpoint struct {
     HealthCheckPath string `yaml:"health_check_path,omitempty"`
 }
 
-// ModelParams represents configuration for model-specific parameters
+// ModelPricing represents configuration for model-specific pricing
 type ModelPricing struct {
     // ISO currency code for the pricing (e.g., "USD"). Defaults to "USD" when omitted.
     Currency string `yaml:"currency,omitempty"`
@@ -203,15 +200,6 @@ type ModelPricing struct {
 }
 
 type ModelParams struct {
-    // Number of parameters in the model
-    ParamCount float64 `yaml:"param_count"`
-
-    // Default batch size for this model
-    BatchSize float64 `yaml:"batch_size"`
-
-    // Default context size for this model
-    ContextSize float64 `yaml:"context_size"`
-
     // PII policy configuration for this model
     PIIPolicy PIIPolicy `yaml:"pii_policy,omitempty"`
 
@@ -253,18 +241,6 @@ const (
     PIITypeZipCode = "ZIP_CODE" // ZIP/Postal codes
 )
 
-// GPUConfig represents configuration for GPU parameters used in TTFT calculation
-type GPUConfig struct {
-    // FLOPs performance in operations per second
-    FLOPS float64 `yaml:"flops"`
-
-    // HBM memory bandwidth in bytes per second
-    HBM float64 `yaml:"hbm"`
-
-    // Description of the GPU configuration (e.g., "A100-80G")
-    Description string `yaml:"description"`
-}
-
 // GetCacheSimilarityThreshold returns the effective threshold for the semantic cache
 func (c *RouterConfig) GetCacheSimilarityThreshold() float32 {
     if c.SemanticCache.SimilarityThreshold != nil {
@@ -376,33 +352,6 @@ func (c *RouterConfig) GetModelForCategoryIndex(index int) string {
     return c.DefaultModel
 }
 
-// GetModelParamCount returns the parameter count for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelParamCount(modelName string, defaultValue float64) float64 {
-    if modelConfig, ok := c.ModelConfig[modelName]; ok {
-        return modelConfig.ParamCount
-    }
-    return defaultValue
-}
-
-// GetModelBatchSize returns the batch size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelBatchSize(modelName string, defaultValue float64) float64 {
-    if modelConfig, ok := c.ModelConfig[modelName]; ok {
-        return modelConfig.BatchSize
-    }
-    return defaultValue
-}
-
-// GetModelContextSize returns the context size for a given model
-// If the model is not found in the config, returns the default value
-func (c *RouterConfig) GetModelContextSize(modelName string, defaultValue float64) float64 {
-    if modelConfig, ok := c.ModelConfig[modelName]; ok {
-        return modelConfig.ContextSize
-    }
-    return defaultValue
-}
-
 // GetModelPricing returns pricing per 1M tokens and its currency for the given model.
 // The currency indicates the unit of the returned rates (e.g., "USD").
 func (c *RouterConfig) GetModelPricing(modelName string) (promptPer1M float64, completionPer1M float64, currency string, ok bool) {
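GetModelPricing is the per-model accessor that survives this cleanup. Here is a short usage sketch; the config path and token counts are hypothetical, and LoadConfig is the loader exercised in the tests below:

package main

import (
    "fmt"

    "github.com/vllm-project/semantic-router/semantic-router/pkg/config"
)

func main() {
    cfg, err := config.LoadConfig("config/config.yaml") // hypothetical path
    if err != nil {
        panic(err)
    }
    // Hypothetical token counts for one request.
    promptTokens, completionTokens := 1200, 350
    if promptPer1M, completionPer1M, currency, ok := cfg.GetModelPricing("phi4"); ok {
        // Rates are per 1M tokens, so scale the counts down by 1e6.
        cost := float64(promptTokens)/1e6*promptPer1M + float64(completionTokens)/1e6*completionPer1M
        fmt.Printf("estimated cost: %.6f %s\n", cost, currency)
    }
}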
src/semantic-router/pkg/config/config_test.go (96 changes: 5 additions & 91 deletions)
@@ -105,26 +105,15 @@ vllm_endpoints:
 
 model_config:
   "model-a":
-    param_count: 1000000000
-    batch_size: 32
-    context_size: 8192
     pii_policy:
       allow_by_default: false
       pii_types_allowed: ["NO_PII", "ORGANIZATION"]
     preferred_endpoints: ["endpoint1"]
   "model-b":
-    param_count: 175000000
-    batch_size: 64
-    context_size: 4096
     pii_policy:
       allow_by_default: true
     preferred_endpoints: ["endpoint1", "endpoint2"]
 
-gpu_config:
-  flops: 312000000000000
-  hbm: 2000000000000
-  description: "A100-80G"
-
 tools:
   enabled: true
   top_k: 5
@@ -172,14 +161,9 @@ tools:
 
             // Verify model config
             Expect(cfg.ModelConfig).To(HaveKey("model-a"))
-            Expect(cfg.ModelConfig["model-a"].ParamCount).To(Equal(float64(1000000000)))
             Expect(cfg.ModelConfig["model-a"].PIIPolicy.AllowByDefault).To(BeFalse())
             Expect(cfg.ModelConfig["model-a"].PIIPolicy.PIITypes).To(ContainElements("NO_PII", "ORGANIZATION"))
 
-            // Verify GPU config
-            Expect(cfg.GPUConfig.FLOPS).To(Equal(float64(312000000000000)))
-            Expect(cfg.GPUConfig.Description).To(Equal("A100-80G"))
-
             // Verify tools config
             Expect(cfg.Tools.Enabled).To(BeTrue())
             Expect(cfg.Tools.TopK).To(Equal(5))
@@ -430,7 +414,8 @@ model_config:
     pii_policy:
       allow_by_default: true
   "unconfigured-model":
-    param_count: 1000000
+    pii_policy:
+      allow_by_default: true
 `
             err := os.WriteFile(configFile, []byte(configContent), 0o644)
             Expect(err).NotTo(HaveOccurred())
@@ -646,74 +631,6 @@ prompt_guard:
         })
     })
 
-    Describe("Model Parameter Functions", func() {
-        BeforeEach(func() {
-            configContent := `
-model_config:
-  "configured-model":
-    param_count: 175000000
-    batch_size: 32
-    context_size: 4096
-`
-            err := os.WriteFile(configFile, []byte(configContent), 0o644)
-            Expect(err).NotTo(HaveOccurred())
-        })
-
-        Describe("GetModelParamCount", func() {
-            It("should return configured value for existing model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                count := cfg.GetModelParamCount("configured-model", 1000000)
-                Expect(count).To(Equal(float64(175000000)))
-            })
-
-            It("should return default value for non-existent model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                count := cfg.GetModelParamCount("unknown-model", 999999)
-                Expect(count).To(Equal(float64(999999)))
-            })
-        })
-
-        Describe("GetModelBatchSize", func() {
-            It("should return configured value for existing model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                batchSize := cfg.GetModelBatchSize("configured-model", 16)
-                Expect(batchSize).To(Equal(float64(32)))
-            })
-
-            It("should return default value for non-existent model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                batchSize := cfg.GetModelBatchSize("unknown-model", 64)
-                Expect(batchSize).To(Equal(float64(64)))
-            })
-        })
-
-        Describe("GetModelContextSize", func() {
-            It("should return configured value for existing model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                contextSize := cfg.GetModelContextSize("configured-model", 2048)
-                Expect(contextSize).To(Equal(float64(4096)))
-            })
-
-            It("should return default value for non-existent model", func() {
-                cfg, err := config.LoadConfig(configFile)
-                Expect(err).NotTo(HaveOccurred())
-
-                contextSize := cfg.GetModelContextSize("unknown-model", 8192)
-                Expect(contextSize).To(Equal(float64(8192)))
-            })
-        })
-    })
-
 Describe("GetCategoryDescriptions", func() {
     Context("with categories having descriptions", func() {
         BeforeEach(func() {
@@ -805,18 +722,15 @@ semantic_cache:
             configContent := `
 model_config:
   "large-model":
-    param_count: 1.7976931348623157e+308
-gpu_config:
-  flops: 1e20
-  hbm: 1e15
+    pii_policy:
+      allow_by_default: true
 `
             err := os.WriteFile(configFile, []byte(configContent), 0o644)
             Expect(err).NotTo(HaveOccurred())
 
             cfg, err := config.LoadConfig(configFile)
             Expect(err).NotTo(HaveOccurred())
-            Expect(cfg.ModelConfig["large-model"].ParamCount).To(Equal(1.7976931348623157e+308))
-            Expect(cfg.GPUConfig.FLOPS).To(Equal(1e20))
+            Expect(cfg.ModelConfig["large-model"].PIIPolicy.AllowByDefault).To(BeTrue())
         })
 
         It("should handle special string values", func() {
src/semantic-router/pkg/extproc/router.go (4 changes: 1 addition & 3 deletions)
@@ -14,7 +14,6 @@ import (
     "github.com/vllm-project/semantic-router/semantic-router/pkg/tools"
     "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/classification"
     "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/pii"
-    "github.com/vllm-project/semantic-router/semantic-router/pkg/utils/ttft"
 )
 
 var (
@@ -132,8 +131,7 @@ func NewOpenAIRouter(configPath string) (*OpenAIRouter, error) {
 
     // Create utility components
     piiChecker := pii.NewPolicyChecker(cfg, cfg.ModelConfig)
-    ttftCalculator := ttft.NewCalculator(cfg.GPUConfig)
-    modelTTFT := ttftCalculator.InitializeModelTTFT(cfg)
+    modelTTFT := make(map[string]float64) // Empty TTFT map since load balancing is disabled
     classifier := classification.NewClassifier(cfg, categoryMapping, piiMapping, jailbreakMapping, modelTTFT)
 
     // Create global classification service for API access
src/semantic-router/pkg/utils/ttft/calculator.go (64 changes: 0 additions & 64 deletions)

This file was deleted.
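The deleted source is not reproduced in this view. Judging from its inputs, the removed gpu_config (flops, hbm) and the per-model param_count/batch_size/context_size fields, the calculator presumably produced a roofline-style TTFT estimate. The sketch below is an assumed reconstruction of that idea, not the original code:

package ttft

// EstimateTTFT is a rough roofline-style time-to-first-token estimate in
// seconds. The formula is an assumption reconstructed from the removed
// config fields; the deleted calculator.go may have differed.
func EstimateTTFT(paramCount, batchSize, contextSize, flops, hbm float64) float64 {
    // Prefill compute: ~2 FLOPs per parameter per token, over the batch's
    // context tokens.
    computeSec := 2 * paramCount * batchSize * contextSize / flops
    // Weight streaming: read fp16 weights (2 bytes per parameter) from HBM.
    memorySec := 2 * paramCount / hbm
    // Roofline: the slower of compute and memory bounds the latency.
    if computeSec > memorySec {
        return computeSec
    }
    return memorySec
}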
