diff --git a/config/config-mcp-classifier-example.yaml b/config/config-mcp-classifier-example.yaml
index 8fc25e37..1aaca432 100644
--- a/config/config-mcp-classifier-example.yaml
+++ b/config/config-mcp-classifier-example.yaml
@@ -75,8 +75,6 @@ vllm_endpoints:
   - name: endpoint1
     address: 127.0.0.1
     port: 8000
-    models:
-      - openai/gpt-oss-20b
     weight: 1
     health_check_path: /health
diff --git a/config/config.development.yaml b/config/config.development.yaml
index 3bec3828..fa7afdef 100644
--- a/config/config.development.yaml
+++ b/config/config.development.yaml
@@ -29,8 +29,6 @@ vllm_endpoints:
   - name: "local-endpoint"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
 model_config:
diff --git a/config/config.e2e.yaml b/config/config.e2e.yaml
index 1e2e5689..d90da786 100644
--- a/config/config.e2e.yaml
+++ b/config/config.e2e.yaml
@@ -42,15 +42,11 @@ vllm_endpoints:
   - name: "qwen-endpoint"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "Model-A"
     weight: 1
     health_check_path: "/health"
   - name: "tinyllama-endpoint"
     address: "127.0.0.1"
     port: 8001
-    models:
-      - "Model-B"
     weight: 1
     health_check_path: "/health"
diff --git a/config/config.production.yaml b/config/config.production.yaml
index 07258956..edd049a3 100644
--- a/config/config.production.yaml
+++ b/config/config.production.yaml
@@ -34,8 +34,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/config/config.recipe-accuracy.yaml b/config/config.recipe-accuracy.yaml
index 82769836..18f2751d 100644
--- a/config/config.recipe-accuracy.yaml
+++ b/config/config.recipe-accuracy.yaml
@@ -44,8 +44,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/config/config.recipe-latency.yaml b/config/config.recipe-latency.yaml
index 15008b04..00b3ae00 100644
--- a/config/config.recipe-latency.yaml
+++ b/config/config.recipe-latency.yaml
@@ -39,8 +39,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/config/config.recipe-token-efficiency.yaml b/config/config.recipe-token-efficiency.yaml
index be3d8abc..b76aeec4 100644
--- a/config/config.recipe-token-efficiency.yaml
+++ b/config/config.recipe-token-efficiency.yaml
@@ -44,8 +44,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/config/config.testing.yaml b/config/config.testing.yaml
index 461010eb..9dc59e5c 100644
--- a/config/config.testing.yaml
+++ b/config/config.testing.yaml
@@ -30,8 +30,6 @@ vllm_endpoints:
   - name: "mock"
     address: "172.28.0.10"
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
     health_check_path: "/health"
diff --git a/config/config.yaml b/config/config.yaml
index 9b814cdc..579b9e35 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -34,8 +34,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1" # IPv4 address - REQUIRED format
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/config/examples/system_prompt_example.yaml b/config/examples/system_prompt_example.yaml
index d0cbfd3f..ff83cd91 100644
--- a/config/examples/system_prompt_example.yaml
+++ b/config/examples/system_prompt_example.yaml
@@ -100,9 +100,9 @@ default_reasoning_effort: medium
 # vLLM endpoints configuration
 vllm_endpoints:
   - name: "mock"
-    address: "http://127.0.0.1:8000"
-    models:
-      - "openai/gpt-oss-20b"
+    address: "127.0.0.1"
+    port: 8000
+    weight: 1
 
 # Usage Notes:
 # 1. System prompts are automatically injected based on query classification
diff --git a/dashboard/frontend/src/pages/ConfigPage.tsx b/dashboard/frontend/src/pages/ConfigPage.tsx
index 4e969525..4d5867be 100644
--- a/dashboard/frontend/src/pages/ConfigPage.tsx
+++ b/dashboard/frontend/src/pages/ConfigPage.tsx
@@ -7,7 +7,6 @@ interface VLLMEndpoint {
   name: string
   address: string
   port: number
-  models: string[]
  weight: number
  health_check_path: string
 }
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
index cdb4eb0a..5bc40cbb 100644
--- a/deploy/kubernetes/config.yaml
+++ b/deploy/kubernetes/config.yaml
@@ -34,8 +34,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1" # IPv4 address - REQUIRED format
     port: 8000
-    models:
-      - "openai/gpt-oss-20b"
     weight: 1
 
 model_config:
diff --git a/deploy/kubernetes/istio/config.yaml b/deploy/kubernetes/istio/config.yaml
index 8ce78ab3..e424ed9d 100644
--- a/deploy/kubernetes/istio/config.yaml
+++ b/deploy/kubernetes/istio/config.yaml
@@ -34,14 +34,10 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "10.104.192.205" # IPv4 address - REQUIRED format
     port: 80
-    models:
-      - "llama3-8b"
     weight: 1
   - name: "endpoint2"
     address: "10.99.27.202" # IPv4 address - REQUIRED format
     port: 80
-    models:
-      - "phi4-mini"
     weight: 1
 
 model_config:
diff --git a/deploy/openshift/config-openshift.yaml b/deploy/openshift/config-openshift.yaml
index 857a996a..9cd98925 100644
--- a/deploy/openshift/config-openshift.yaml
+++ b/deploy/openshift/config-openshift.yaml
@@ -32,14 +32,10 @@ vllm_endpoints:
   - name: "model-a-endpoint"
     address: "127.0.0.1" # localhost in same pod
     port: 8000
-    models:
-      - "Model-A"
     weight: 1
   - name: "model-b-endpoint"
     address: "127.0.0.1" # localhost in same pod
     port: 8001
-    models:
-      - "Model-B"
     weight: 1
 
 model_config:
diff --git a/src/semantic-router/pkg/api/server_test.go b/src/semantic-router/pkg/api/server_test.go
index aaf4e005..ecdae7b4 100644
--- a/src/semantic-router/pkg/api/server_test.go
+++ b/src/semantic-router/pkg/api/server_test.go
@@ -309,10 +309,17 @@ func TestOpenAIModelsEndpoint(t *testing.T) {
 				Name:    "primary",
 				Address: "127.0.0.1", // Changed from localhost to IP address
 				Port:    8000,
-				Models:  []string{"gpt-4o-mini", "llama-3.1-8b-instruct"},
 				Weight:  1,
 			},
 		},
+		ModelConfig: map[string]config.ModelParams{
+			"gpt-4o-mini": {
+				PreferredEndpoints: []string{"primary"},
+			},
+			"llama-3.1-8b-instruct": {
+				PreferredEndpoints: []string{"primary"},
+			},
+		},
 	}
 
 	apiServer := &ClassificationAPIServer{
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
index 49b32c46..e550dde4 100644
--- a/src/semantic-router/pkg/config/config.go
+++ b/src/semantic-router/pkg/config/config.go
@@ -253,9 +253,6 @@ type VLLMEndpoint struct {
 	// Port of the vLLM endpoint
 	Port int `yaml:"port"`
 
-	// List of models served by this endpoint
-	Models []string `yaml:"models"`
-
 	// Load balancing weight for this endpoint
 	Weight int `yaml:"weight,omitempty"`
 }
@@ -604,32 +601,21 @@ func (c *RouterConfig) IsPromptGuardEnabled() bool {
 }
 
 // GetEndpointsForModel returns all endpoints that can serve the specified model
-// If the model has preferred endpoints configured, returns only those endpoints that are available
-// Otherwise, returns all endpoints that list the model in their Models array
+// Returns endpoints based on the model's preferred_endpoints configuration in model_config
 func (c *RouterConfig) GetEndpointsForModel(modelName string) []VLLMEndpoint {
-	var availableEndpoints []VLLMEndpoint
-
-	// First, find all endpoints that can serve this model
-	for _, endpoint := range c.VLLMEndpoints {
-		if slices.Contains(endpoint.Models, modelName) {
-			availableEndpoints = append(availableEndpoints, endpoint)
-		}
-	}
+	var endpoints []VLLMEndpoint
 
 	// Check if model has preferred endpoints configured
 	if modelConfig, ok := c.ModelConfig[modelName]; ok && len(modelConfig.PreferredEndpoints) > 0 {
-		var preferredEndpoints []VLLMEndpoint
-		for _, endpoint := range availableEndpoints {
-			if slices.Contains(modelConfig.PreferredEndpoints, endpoint.Name) {
-				preferredEndpoints = append(preferredEndpoints, endpoint)
+		// Return only the preferred endpoints
+		for _, endpointName := range modelConfig.PreferredEndpoints {
+			if endpoint, found := c.GetEndpointByName(endpointName); found {
+				endpoints = append(endpoints, *endpoint)
 			}
 		}
-		if len(preferredEndpoints) > 0 {
-			return preferredEndpoints
-		}
 	}
 
-	return availableEndpoints
+	return endpoints
 }
 
 // GetEndpointByName returns the endpoint with the specified name
@@ -642,18 +628,12 @@ func (c *RouterConfig) GetEndpointByName(name string) (*VLLMEndpoint, bool) {
 	return nil, false
 }
 
-// GetAllModels returns a list of all models available across all endpoints
+// GetAllModels returns a list of all models configured in model_config
 func (c *RouterConfig) GetAllModels() []string {
-	modelSet := make(map[string]bool)
 	var models []string
-	for _, endpoint := range c.VLLMEndpoints {
-		for _, model := range endpoint.Models {
-			if !modelSet[model] {
-				modelSet[model] = true
-				models = append(models, model)
-			}
-		}
+	for modelName := range c.ModelConfig {
+		models = append(models, modelName)
 	}
 
 	return models
diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go
index b4028476..c7ccdc89 100644
--- a/src/semantic-router/pkg/config/config_test.go
+++ b/src/semantic-router/pkg/config/config_test.go
@@ -92,15 +92,10 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "model-a"
-      - "model-b"
     weight: 1
   - name: "endpoint2"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "model-b"
     weight: 2
 
 model_config:
@@ -177,7 +172,6 @@ tools:
 			Expect(cfg.VLLMEndpoints[0].Name).To(Equal("endpoint1"))
 			Expect(cfg.VLLMEndpoints[0].Address).To(Equal("127.0.0.1"))
 			Expect(cfg.VLLMEndpoints[0].Port).To(Equal(8000))
-			Expect(cfg.VLLMEndpoints[0].Models).To(ContainElements("model-a", "model-b"))
 			Expect(cfg.VLLMEndpoints[0].Weight).To(Equal(1))
 
 			Expect(cfg.VLLMEndpoints[1].Name).To(Equal("endpoint2"))
@@ -788,22 +782,14 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "model-a"
-      - "model-b"
     weight: 1
   - name: "endpoint2"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "model-b"
-      - "model-c"
     weight: 2
   - name: "endpoint3"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "model-a"
     weight: 1
 
 model_config:
@@ -841,13 +827,12 @@ default_model: "model-b"
 			Expect(endpointNames).To(ContainElements("endpoint1", "endpoint3"))
 		})
 
-		It("should return all available endpoints when no preferences configured", func() {
+		It("should return empty slice when no preferred endpoints configured", func() {
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
 
 			endpoints := cfg.GetEndpointsForModel("model-c")
-			Expect(endpoints).To(HaveLen(1))
-			Expect(endpoints[0].Name).To(Equal("endpoint2"))
+			Expect(endpoints).To(BeEmpty())
 		})
 
 		It("should return empty slice for non-existent model", func() {
@@ -858,11 +843,11 @@ default_model: "model-b"
 			Expect(endpoints).To(BeEmpty())
 		})
 
-		It("should fallback to all available endpoints if preferred endpoints don't exist", func() {
+		It("should return only preferred endpoints", func() {
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
 
-			// model-b has preferred endpoint2, which serves it
+			// model-b has preferred endpoint2
 			endpoints := cfg.GetEndpointsForModel("model-b")
 			Expect(endpoints).To(HaveLen(1))
 			Expect(endpoints[0].Name).To(Equal("endpoint2"))
@@ -879,7 +864,6 @@ default_model: "model-b"
 			Expect(endpoint.Name).To(Equal("endpoint1"))
 			Expect(endpoint.Address).To(Equal("127.0.0.1"))
 			Expect(endpoint.Port).To(Equal(8000))
-			Expect(endpoint.Models).To(ContainElements("model-a", "model-b"))
 		})
 
 		It("should return false when endpoint doesn't exist", func() {
@@ -893,7 +877,7 @@ default_model: "model-b"
 	})
 
 	Describe("GetAllModels", func() {
-		It("should return all unique models across endpoints", func() {
+		It("should return all models from model_config", func() {
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
 
@@ -908,7 +892,7 @@ default_model: "model-b"
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
 
-			// model-a is available on endpoint1 (weight 1) and endpoint3 (weight 1)
+			// model-a has preferred endpoints: endpoint1 (weight 1) and endpoint3 (weight 1)
 			// Since they have the same weight, it should return the first one found
 			endpointName, found := cfg.SelectBestEndpointForModel("model-a")
 			Expect(found).To(BeTrue())
@@ -924,13 +908,13 @@ default_model: "model-b"
 			Expect(endpointName).To(BeEmpty())
 		})
 
-		It("should select single endpoint when only one available", func() {
+		It("should return false when model has no preferred endpoints", func() {
 			cfg, err := config.LoadConfig(configFile)
 			Expect(err).NotTo(HaveOccurred())
 
 			endpointName, found := cfg.SelectBestEndpointForModel("model-c")
-			Expect(found).To(BeTrue())
-			Expect(endpointName).To(Equal("endpoint2"))
+			Expect(found).To(BeFalse())
+			Expect(endpointName).To(BeEmpty())
 		})
 	})
 
@@ -944,16 +928,18 @@ default_model: "model-b"
 		})
 
 		It("should fail validation when a category model has no endpoints", func() {
-			// Add a model to categories that doesn't exist in any endpoint
+			// Add a model to categories that doesn't have preferred_endpoints configured
 			configContent := `
 vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "existing-model"
     weight: 1
 
+model_config:
+  "existing-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -981,10 +967,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "existing-model"
     weight: 1
 
+model_config:
+  "existing-model":
+    preferred_endpoints: ["endpoint1"]
+
 default_model: "missing-default-model"
 `
 			err := os.WriteFile(configFile, []byte(configContent), 0o644)
@@ -1007,10 +995,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1034,10 +1024,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "::1"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1063,10 +1055,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "example.com"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1092,10 +1086,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "http://127.0.0.1"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1120,10 +1116,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1/api"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1147,10 +1145,12 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1:8080"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["endpoint1"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1175,10 +1175,12 @@ vllm_endpoints:
   - name: "test-endpoint"
     address: "https://example.com"
     port: 8000
-    models:
-      - "test-model"
     weight: 1
 
+model_config:
+  "test-model":
+    preferred_endpoints: ["test-endpoint"]
+
 categories:
   - name: "test"
     model_scores:
@@ -1212,16 +1214,18 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models:
-      - "test-model1"
     weight: 1
   - name: "endpoint2"
     address: "example.com"
     port: 8001
-    models:
-      - "test-model2"
     weight: 1
 
+model_config:
+  "test-model1":
+    preferred_endpoints: ["endpoint1"]
+  "test-model2":
+    preferred_endpoints: ["endpoint2"]
+
 categories:
   - name: "test"
     model_scores:
diff --git a/src/semantic-router/pkg/config/validation_test.go b/src/semantic-router/pkg/config/validation_test.go
index 1189c054..a3950cbb 100644
--- a/src/semantic-router/pkg/config/validation_test.go
+++ b/src/semantic-router/pkg/config/validation_test.go
@@ -186,13 +186,11 @@ var _ = Describe("IP Address Validation", func() {
 					Name:    "endpoint1",
 					Address: "127.0.0.1",
 					Port:    8000,
-					Models:  []string{"model1"},
 				},
 				{
 					Name:    "endpoint2",
 					Address: "::1",
 					Port:    8001,
-					Models:  []string{"model2"},
 				},
 			}
@@ -208,7 +206,6 @@ var _ = Describe("IP Address Validation", func() {
 					Name:    "invalid-endpoint",
 					Address: "example.com",
 					Port:    8000,
-					Models:  []string{"model1"},
 				},
 			}
@@ -228,7 +225,6 @@ var _ = Describe("IP Address Validation", func() {
 					Name:    "test-endpoint",
 					Address: "http://127.0.0.1",
 					Port:    8000,
-					Models:  []string{"model1"},
 				},
 			}
diff --git a/src/semantic-router/pkg/extproc/endpoint_selection_test.go b/src/semantic-router/pkg/extproc/endpoint_selection_test.go
index 480056c5..7339edf9 100644
--- a/src/semantic-router/pkg/extproc/endpoint_selection_test.go
+++ b/src/semantic-router/pkg/extproc/endpoint_selection_test.go
@@ -299,7 +299,6 @@ var _ = Describe("Endpoint Selection", func() {
 			Expect(endpoint1.Name).To(Equal("test-endpoint1"))
 			Expect(endpoint1.Address).To(Equal("127.0.0.1"))
 			Expect(endpoint1.Port).To(Equal(8000))
-			Expect(endpoint1.Models).To(ContainElements("model-a", "model-b"))
 			Expect(endpoint1.Weight).To(Equal(1))
 
 			// Verify second endpoint
@@ -307,7 +306,6 @@ var _ = Describe("Endpoint Selection", func() {
 			Expect(endpoint2.Name).To(Equal("test-endpoint2"))
 			Expect(endpoint2.Address).To(Equal("127.0.0.1"))
 			Expect(endpoint2.Port).To(Equal(8001))
-			Expect(endpoint2.Models).To(ContainElement("model-b"))
 			Expect(endpoint2.Weight).To(Equal(2))
 		})
diff --git a/src/semantic-router/pkg/extproc/models_endpoint_test.go b/src/semantic-router/pkg/extproc/models_endpoint_test.go
index 9fbd5d17..20192548 100644
--- a/src/semantic-router/pkg/extproc/models_endpoint_test.go
+++ b/src/semantic-router/pkg/extproc/models_endpoint_test.go
@@ -18,10 +18,17 @@ func TestHandleModelsRequest(t *testing.T) {
 				Name:    "primary",
 				Address: "127.0.0.1",
 				Port:    8000,
-				Models:  []string{"gpt-4o-mini", "llama-3.1-8b-instruct"},
 				Weight:  1,
 			},
 		},
+		ModelConfig: map[string]config.ModelParams{
+			"gpt-4o-mini": {
+				PreferredEndpoints: []string{"primary"},
+			},
+			"llama-3.1-8b-instruct": {
+				PreferredEndpoints: []string{"primary"},
+			},
+		},
 	}
 
 	router := &OpenAIRouter{
@@ -142,10 +149,14 @@ func TestHandleRequestHeadersWithModelsEndpoint(t *testing.T) {
 				Name:    "primary",
 				Address: "127.0.0.1",
 				Port:    8000,
-				Models:  []string{"gpt-4o-mini"},
 				Weight:  1,
 			},
 		},
+		ModelConfig: map[string]config.ModelParams{
+			"gpt-4o-mini": {
+				PreferredEndpoints: []string{"primary"},
+			},
+		},
 	}
 
 	router := &OpenAIRouter{
diff --git a/src/semantic-router/pkg/extproc/test_utils_test.go b/src/semantic-router/pkg/extproc/test_utils_test.go
index 3492a402..4cbc8999 100644
--- a/src/semantic-router/pkg/extproc/test_utils_test.go
+++ b/src/semantic-router/pkg/extproc/test_utils_test.go
@@ -199,14 +199,12 @@ func CreateTestConfig() *config.RouterConfig {
 				Name:    "test-endpoint1",
 				Address: "127.0.0.1",
 				Port:    8000,
-				Models:  []string{"model-a", "model-b"},
 				Weight:  1,
 			},
 			{
 				Name:    "test-endpoint2",
 				Address: "127.0.0.1",
 				Port:    8001,
-				Models:  []string{"model-b"},
 				Weight:  2,
 			},
 		},
diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md
index 8c2c26f5..340ad847 100644
--- a/website/docs/installation/configuration.md
+++ b/website/docs/installation/configuration.md
@@ -159,10 +159,13 @@ Configure your LLM servers:
 vllm_endpoints:
   - name: "my_endpoint"
     address: "127.0.0.1"      # Your server IP - MUST be IP address format
-    port: 8000                # Your server port
-    models:
-      - "llama2-7b"           # Model name - must match vLLM --served-model-name
-    weight: 1                 # Load balancing weight
+    port: 8000              # Your server port
+    weight: 1               # Load balancing weight
+
+# Model configuration - maps models to endpoints
+model_config:
+  "llama2-7b":              # Model name - must match vLLM --served-model-name
+    preferred_endpoints: ["my_endpoint"]
 ```
 
 #### Address Format Requirements
@@ -204,11 +207,12 @@ The model names in the `models` array must **exactly match** the `--served-model-name` parameter:
 # vLLM server command:
 vllm serve meta-llama/Llama-2-7b-hf --served-model-name llama2-7b
 
-# config.yaml must use the same name:
-vllm_endpoints:
-  - models: ["llama2-7b"]  # ✅ Matches --served-model-name
-
+# config.yaml must reference the model in model_config:
 model_config:
+  "llama2-7b":  # ✅ Matches --served-model-name
+    preferred_endpoints: ["your-endpoint"]
+
+vllm_endpoints:
   "llama2-7b":  # ✅ Matches --served-model-name
     # ... configuration
 ```
@@ -683,12 +687,10 @@ vllm_endpoints:
   - name: "math_endpoint"
     address: "192.168.1.10"  # Math server IP
     port: 8000
-    models: ["math-model"]
     weight: 1
   - name: "general_endpoint"
     address: "192.168.1.20"  # General server IP
     port: 8000
-    models: ["general-model"]
     weight: 1
 
 categories:
@@ -711,12 +713,10 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "192.168.1.30"  # Primary server IP
     port: 8000
-    models: ["my-model"]
     weight: 2  # Higher weight = more traffic
   - name: "endpoint2"
     address: "192.168.1.31"  # Secondary server IP
     port: 8000
-    models: ["my-model"]
     weight: 1
 ```
diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md
index a96c683b..e53e0284 100644
--- a/website/docs/installation/installation.md
+++ b/website/docs/installation/installation.md
@@ -109,13 +109,11 @@ Edit `config/config.yaml` to point to your LLM endpoints:
 vllm_endpoints:
   - name: "your-endpoint"
     address: "127.0.0.1"     # MUST be IP address (IPv4 or IPv6)
-    port: 11434              # Replace with your port
-    models:
-      - "your-model-name"    # Replace with your model
+    port: 11434            # Replace with your port
     weight: 1
 
 model_config:
-  "your-model-name":
+  "your-model-name":  # Replace with your model name
     pii_policy:
       allow_by_default: false  # Deny all PII by default
       pii_types_allowed: ["EMAIL_ADDRESS", "PERSON", "GPE", "PHONE_NUMBER"]  # Only allow these specific PII types
@@ -146,11 +144,12 @@ The model name in your configuration **must exactly match** the `--served-model-name` parameter:
 # When starting vLLM server:
 vllm serve microsoft/phi-4 --port 11434 --served-model-name your-model-name
 
-# The config.yaml must use the same name:
-vllm_endpoints:
-  - models: ["your-model-name"]  # ✅ Must match --served-model-name
-
+# The config.yaml must reference the model in model_config:
 model_config:
+  "your-model-name":  # ✅ Must match --served-model-name
+    preferred_endpoints: ["your-endpoint"]
+
+vllm_endpoints:
   "your-model-name":  # ✅ Must match --served-model-name
     # ... configuration
 ```
diff --git a/website/docs/training/model-performance-eval.md b/website/docs/training/model-performance-eval.md
index ce67a205..529c5481 100644
--- a/website/docs/training/model-performance-eval.md
+++ b/website/docs/training/model-performance-eval.md
@@ -69,11 +69,9 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 11434
-    models: ["phi4"]  # ✅ Matches --served_model_name phi4
   - name: "endpoint2"
     address: "127.0.0.1"
     port: 11435
-    models: ["qwen3-0.6B"]  # ✅ Matches --served_model_name qwen3-0.6B
 
 model_config:
   "phi4":  # ✅ Matches --served_model_name phi4
diff --git a/website/docs/tutorials/content-safety/pii-detection.md b/website/docs/tutorials/content-safety/pii-detection.md
index f5acca7b..cd58ec60 100644
--- a/website/docs/tutorials/content-safety/pii-detection.md
+++ b/website/docs/tutorials/content-safety/pii-detection.md
@@ -61,11 +61,9 @@ vllm_endpoints:
   - name: secure-model
     address: "127.0.0.1"
     port: 8080
-    models: ["secure-llm"]
   - name: general-model
     address: "127.0.0.1"
     port: 8081
-    models: ["general-llm"]
 
 # Model-specific configurations
 model_config:
diff --git a/website/docs/tutorials/intelligent-route/reasoning.md b/website/docs/tutorials/intelligent-route/reasoning.md
index f9c7426b..9aa6183d 100644
--- a/website/docs/tutorials/intelligent-route/reasoning.md
+++ b/website/docs/tutorials/intelligent-route/reasoning.md
@@ -34,7 +34,6 @@ vllm_endpoints:
   - name: "endpoint1"
     address: "127.0.0.1"
     port: 8000
-    models: ["deepseek-v31", "qwen3-30b", "openai/gpt-oss-20b"]  # Must match --served-model-name
     weight: 1
 
 # Reasoning family configurations (how to express reasoning for a family)
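Note: after this change an endpoint entry no longer lists the models it serves; the model-to-endpoint mapping lives entirely in `model_config` via `preferred_endpoints`, which `GetAllModels` and `GetEndpointsForModel` now read. The old fallback is gone: a model without `preferred_endpoints` resolves to no endpoints, as the updated config tests assert. A minimal sketch of the resulting configuration shape (the endpoint fields mirror the patched configs; the model name is a placeholder and must match the vLLM `--served-model-name`):

vllm_endpoints:
  - name: "endpoint1"
    address: "127.0.0.1"
    port: 8000
    weight: 1

model_config:
  "my-model":                 # placeholder - use your --served-model-name
    preferred_endpoints: ["endpoint1"]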