Skip to content

Commit 9a1c4e3

Browse files
committed
lint
Signed-off-by: bitliu <[email protected]>
1 parent 332073b commit 9a1c4e3

File tree

11 files changed

+89
-23
lines changed

11 files changed

+89
-23
lines changed

config/envoy.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ static_resources:
3131
upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
3232
request_id: "%REQ(X-REQUEST-ID)%"
3333
selected_model: "%REQ(X-SELECTED-MODEL)%"
34-
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
3534
route_config:
3635
name: local_route
3736
virtual_hosts:
@@ -106,7 +105,7 @@ static_resources:
106105
lb_policy: CLUSTER_PROVIDED
107106
original_dst_lb_config:
108107
use_http_header: true
109-
http_header_name: "x-gateway-destination-endpoint"
108+
http_header_name: "x-vsr-destination-endpoint"
110109
typed_extension_protocol_options:
111110
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
112111
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions

deploy/docker-compose/addons/envoy.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ static_resources:
3131
upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
3232
request_id: "%REQ(X-REQUEST-ID)%"
3333
selected_model: "%REQ(X-SELECTED-MODEL)%"
34-
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
3534
route_config:
3635
name: local_route
3736
virtual_hosts:
@@ -106,7 +105,7 @@ static_resources:
106105
lb_policy: CLUSTER_PROVIDED
107106
original_dst_lb_config:
108107
use_http_header: true
109-
http_header_name: "x-gateway-destination-endpoint"
108+
http_header_name: "x-vsr-destination-endpoint"
110109
typed_extension_protocol_options:
111110
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
112111
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions

deploy/openshift/envoy-openshift.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# OpenShift-specific Envoy configuration
22
# This config uses ORIGINAL_DST cluster with header-based destination selection
3-
# The semantic router sets the x-gateway-destination-endpoint header which Envoy uses
3+
# The semantic router sets the x-vsr-destination-endpoint header which Envoy uses
44
# to dynamically route to the correct vLLM endpoint (port 8000 or 8001)
55
static_resources:
66
listeners:
@@ -35,7 +35,7 @@ static_resources:
3535
upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
3636
request_id: "%REQ(X-REQUEST-ID)%"
3737
selected_model: "%REQ(X-SELECTED-MODEL)%"
38-
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
38+
selected_endpoint: "%REQ(x-vsr-destination-endpoint)%"
3939
route_config:
4040
name: local_route
4141
virtual_hosts:
@@ -48,7 +48,7 @@ static_resources:
4848
route:
4949
cluster: semantic_router_cluster
5050
timeout: 300s
51-
# Dynamic route - destination determined by x-gateway-destination-endpoint header
51+
# Dynamic route - destination determined by x-vsr-destination-endpoint header
5252
- match:
5353
prefix: "/"
5454
route:
@@ -130,15 +130,15 @@ static_resources:
130130
http_protocol_options: {}
131131

132132
# Dynamic vLLM cluster using ORIGINAL_DST with header-based destination
133-
# The semantic router sets x-gateway-destination-endpoint header to specify the target
133+
# The semantic router sets x-vsr-destination-endpoint header to specify the target
134134
- name: vllm_dynamic_cluster
135135
connect_timeout: 300s
136136
per_connection_buffer_limit_bytes: 52428800
137137
type: ORIGINAL_DST
138138
lb_policy: CLUSTER_PROVIDED
139139
original_dst_lb_config:
140140
use_http_header: true
141-
http_header_name: "x-gateway-destination-endpoint"
141+
http_header_name: "x-vsr-destination-endpoint"
142142
typed_extension_protocol_options:
143143
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
144144
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions

src/semantic-router/pkg/extproc/extproc_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1901,7 +1901,7 @@ var _ = Describe("Endpoint Selection", func() {
19011901
var modelHeaderFound bool
19021902

19031903
for _, header := range headerMutation.SetHeaders {
1904-
if header.Header.Key == "x-gateway-destination-endpoint" {
1904+
if header.Header.Key == "x-vsr-destination-endpoint" {
19051905
endpointHeaderFound = true
19061906
// Should be one of the configured endpoint addresses
19071907
// Check both Value and RawValue since implementation uses RawValue
@@ -1975,7 +1975,7 @@ var _ = Describe("Endpoint Selection", func() {
19751975
var selectedEndpoint string
19761976

19771977
for _, header := range headerMutation.SetHeaders {
1978-
if header.Header.Key == "x-gateway-destination-endpoint" {
1978+
if header.Header.Key == "x-vsr-destination-endpoint" {
19791979
endpointHeaderFound = true
19801980
// Check both Value and RawValue since implementation uses RawValue
19811981
selectedEndpoint = header.Header.Value
@@ -2038,7 +2038,7 @@ var _ = Describe("Endpoint Selection", func() {
20382038
var selectedEndpoint string
20392039

20402040
for _, header := range headerMutation.SetHeaders {
2041-
if header.Header.Key == "x-gateway-destination-endpoint" {
2041+
if header.Header.Key == "x-vsr-destination-endpoint" {
20422042
endpointHeaderFound = true
20432043
// Check both Value and RawValue since implementation uses RawValue
20442044
selectedEndpoint = header.Header.Value

src/semantic-router/pkg/headers/headers_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ func TestHeaderConstants(t *testing.T) {
1212
}{
1313
// Request headers
1414
{"RequestID", RequestID, "x-request-id"},
15-
{"GatewayDestinationEndpoint", GatewayDestinationEndpoint, "x-gateway-destination-endpoint"},
15+
{"GatewayDestinationEndpoint", GatewayDestinationEndpoint, "x-vsr-destination-endpoint"},
1616
{"SelectedModel", SelectedModel, "x-selected-model"},
1717
// VSR headers
1818
{"VSRSelectedCategory", VSRSelectedCategory, "x-vsr-selected-category"},

website/docs/api/router.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ The router adds metadata headers to both requests and responses:
148148

149149
| Header | Description | Example |
150150
|--------|-------------|---------|
151-
| `x-gateway-destination-endpoint` | Backend endpoint selected | `endpoint1` |
151+
| `x-vsr-destination-endpoint` | Backend endpoint selected | `endpoint1` |
152152
| `x-selected-model` | Model category determined | `mathematics` |
153153
| `x-routing-confidence` | Classification confidence | `0.956` |
154154
| `x-request-id` | Unique request identifier | `req-abc123` |

website/docs/installation/ai-gateway/ai-gateway.md

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,74 @@ The deployment consists of:
1010
- **Envoy Gateway**: Core gateway functionality and traffic management
1111
- **Envoy AI Gateway**: AI Gateway built on Envoy Gateway for LLM providers
1212

13+
## Benefits of Integration
14+
15+
Integrating vLLM Semantic Router with Envoy AI Gateway provides enterprise-grade capabilities for production LLM deployments:
16+
17+
### 1. **Hybrid Model Selection**
18+
19+
Seamlessly route requests between cloud LLM providers (OpenAI, Anthropic, etc.) and self-hosted models.
20+
21+
### 2. **Token Rate Limiting**
22+
23+
Protect your infrastructure and control costs with fine-grained rate limiting:
24+
25+
- **Input token limits**: Control request size to prevent abuse
26+
- **Output token limits**: Manage response generation costs
27+
- **Total token limits**: Set overall usage quotas per user/tenant
28+
- **Time-based windows**: Configure limits per second, minute, or hour
29+
30+
### 3. **Model/Provider Failover**
31+
32+
Ensure high availability with automatic failover mechanisms:
33+
34+
- Detect unhealthy backends and route traffic to healthy instances
35+
- Support for active-passive and active-active failover strategies
36+
- Graceful degradation when primary models are unavailable
37+
38+
### 4. **Traffic Splitting & Canary Testing**
39+
40+
Deploy new models safely with progressive rollout capabilities:
41+
42+
- **A/B Testing**: Split traffic between model versions to compare performance
43+
- **Canary Deployments**: Gradually shift traffic to new models (e.g., 5% → 25% → 50% → 100%)
44+
- **Shadow Traffic**: Send duplicate requests to new models without affecting production
45+
- **Weight-based routing**: Fine-tune traffic distribution across model variants
46+
47+
### 5. **LLM Observability & Monitoring**
48+
49+
Gain deep insights into your LLM infrastructure:
50+
51+
- **Request/Response Metrics**: Track latency, throughput, token usage, and error rates
52+
- **Model Performance**: Monitor accuracy, quality scores, and user satisfaction
53+
- **Cost Analytics**: Analyze spending patterns across models and providers
54+
- **Distributed Tracing**: End-to-end visibility with OpenTelemetry integration
55+
- **Custom Dashboards**: Visualize metrics in Prometheus, Grafana, or your preferred monitoring stack
56+
57+
## Supported LLM Providers
58+
59+
| Provider Name | API Schema Config on [AIServiceBackend](https://aigateway.envoyproxy.io/docs/api/#aiservicebackendspec) | Upstream Authentication Config on [BackendSecurityPolicy](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyspec) | Status | Note |
60+
| ------------------------------------------------------------ | :----------------------------------------------------------: | :----------------------------------------------------------: | :----: | ------------------------------------------------------------ |
61+
| [OpenAI](https://platform.openai.com/docs/api-reference) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
62+
| [AWS Bedrock](https://docs.aws.amazon.com/bedrock/latest/APIReference/) | `{"name":"AWSBedrock"}` | [AWS Bedrock Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyawscredentials) || |
63+
| [Azure OpenAI](https://learn.microsoft.com/en-us/azure/ai-services/openai/reference) | `{"name":"AzureOpenAI","version":"2025-01-01-preview"}` or `{"name":"OpenAI", "version": "openai/v1"}` | [Azure Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyazurecredentials) or [Azure API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyazureapikey) || |
64+
| [Google Gemini on AI Studio](https://ai.google.dev/gemini-api/docs/openai) | `{"name":"OpenAI","version":"v1beta/openai"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || Only the OpenAI compatible endpoint |
65+
| [Google Vertex AI](https://cloud.google.com/vertex-ai/docs/reference/rest) | `{"name":"GCPVertexAI"}` | [GCP Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicygcpcredentials) || |
66+
| [Anthropic on GCP Vertex AI](https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude) | `{"name":"GCPAnthropic", "version":"vertex-2023-10-16"}` | [GCP Credentials](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicygcpcredentials) || Support both Native Anthropic messages endpoint and OpenAI compatible endpoint |
67+
| [Groq](https://console.groq.com/docs/openai) | `{"name":"OpenAI","version":"openai/v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
68+
| [Grok](https://docs.x.ai/docs/api-reference?utm_source=chatgpt.com#chat-completions) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
69+
| [Together AI](https://docs.together.ai/docs/openai-api-compatibility) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
70+
| [Cohere](https://docs.cohere.com/v2/docs/compatibility-api) | `{"name":"Cohere","version":"v2"}` or `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || Supports native Cohere v2 (e.g., /cohere/v2/rerank) and OpenAI-compatible endpoints. |
71+
| [Mistral](https://docs.mistral.ai/api/#tag/chat/operation/chat_completion_v1_chat_completions_post) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
72+
| [DeepInfra](https://deepinfra.com/docs/inference) | `{"name":"OpenAI","version":"v1/openai"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || Only the OpenAI compatible endpoint |
73+
| [DeepSeek](https://api-docs.deepseek.com/) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
74+
| [Hunyuan](https://cloud.tencent.com/document/product/1729/111007) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
75+
| [Tencent LLM Knowledge Engine](https://www.tencentcloud.com/document/product/1255/70381?lang=en) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
76+
| [Tetrate Agent Router Service (TARS)](https://router.tetrate.ai/) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
77+
| [SambaNova](https://docs.sambanova.ai/sambastudio/latest/open-ai-api.html) | `{"name":"OpenAI","version":"v1"}` | [API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyapikey) || |
78+
| [Anthropic](https://docs.claude.com/en/home) | `{"name":"Anthropic"}` | [Anthropic API Key](https://aigateway.envoyproxy.io/docs/api/#backendsecuritypolicyanthropicapikey) || Support only Native Anthropic messages endpoint |
79+
| Self-hosted-models | `{"name":"OpenAI","version":"v1"}` | N/A || Depending on the API schema spoken by self-hosted servers. For example, [vLLM](https://docs.vllm.ai/en/v0.8.3/serving/openai_compatible_server.html) speaks the OpenAI format. Also, API Key auth can be configured as well. |
80+
1381
## Prerequisites
1482

1583
Before starting, ensure you have the following tools installed:

website/docs/overview/architecture/envoy-extproc.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ func (r *OpenAIRouter) handleRequestBody(
227227
headerMutations := []*core.HeaderValueOption{
228228
{
229229
Header: &core.HeaderValue{
230-
Key: "x-gateway-destination-endpoint",
230+
Key: "x-vsr-destination-endpoint",
231231
Value: selectedEndpoint,
232232
},
233233
Append: &wrapperspb.BoolValue{Value: false},
@@ -347,7 +347,7 @@ static_resources:
347347
response_code: "%RESPONSE_CODE%"
348348
duration: "%DURATION%"
349349
selected_model: "%REQ(X-SELECTED-MODEL)%"
350-
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
350+
selected_endpoint: "%REQ(x-vsr-destination-endpoint)%"
351351
routing_confidence: "%REQ(X-ROUTING-CONFIDENCE)%"
352352
353353
# Route configuration with dynamic routing
@@ -361,7 +361,7 @@ static_resources:
361361
- match:
362362
prefix: "/"
363363
headers:
364-
- name: "x-gateway-destination-endpoint"
364+
- name: "x-vsr-destination-endpoint"
365365
string_match:
366366
exact: "endpoint1"
367367
route:
@@ -370,7 +370,7 @@ static_resources:
370370
- match:
371371
prefix: "/"
372372
headers:
373-
- name: "x-gateway-destination-endpoint"
373+
- name: "x-vsr-destination-endpoint"
374374
string_match:
375375
exact: "endpoint2"
376376
route:
@@ -379,7 +379,7 @@ static_resources:
379379
- match:
380380
prefix: "/"
381381
headers:
382-
- name: "x-gateway-destination-endpoint"
382+
- name: "x-vsr-destination-endpoint"
383383
string_match:
384384
exact: "endpoint3"
385385
route:

website/docs/overview/architecture/system-architecture.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ graph TB
238238
239239
ToolsSelection --> RoutingDecision[Make Routing Decision<br/>Select Optimal Model]
240240
241-
RoutingDecision --> SetHeaders[Set Routing Headers<br/>x-gateway-destination-endpoint<br/>x-selected-model]
241+
RoutingDecision --> SetHeaders[Set Routing Headers<br/>x-vsr-destination-endpoint<br/>x-selected-model]
242242
243243
SetHeaders --> EnvoyRoute[Envoy Routes to<br/>Selected Backend]
244244

website/docs/tutorials/intelligent-route/reasoning.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ Verify routing via response headers
145145
The router does not inject routing metadata into the JSON body. Instead, inspect the response headers added by the router:
146146

147147
- X-Selected-Model
148-
- X-GATEWAY-DESTINATION-ENDPOINT
148+
- x-vsr-destination-endpoint
149149

150150
Example:
151151

@@ -161,7 +161,7 @@ curl -i http://localhost:8801/v1/chat/completions \
161161
}'
162162
# In the response headers, look for:
163163
# X-Selected-Model: <your-selected-model>
164-
# X-GATEWAY-DESTINATION-ENDPOINT: <address:port>
164+
# x-vsr-destination-endpoint: <address:port>
165165
```
166166

167167
4) Run a comprehensive evaluation

0 commit comments

Comments (0)