feat: support dynamic router based on picked endpoint

Xunzhuo · Xunzhuo · commit 0bbab023805e · 2025-08-28T18:50:00.000+08:00
Signed-off-by: bitliu <bitliu@tencent.com>
diff --git a/config/envoy.yaml b/config/envoy.yaml
@@ -38,39 +38,11 @@ static_resources:
             - name: local_service
               domains: ["*"]
               routes:
-              # Dynamic routing based on selected endpoint header
+              # Single route using original destination cluster
               - match:
                   prefix: "/"
-                  headers:
-                  - name: "x-semantic-destination-endpoint"
-                    string_match:
-                      exact: "endpoint1"
                 route:
-                  cluster: vllm_endpoint1
-                  timeout: 300s
-              - match:
-                  prefix: "/"
-                  headers:
-                  - name: "x-semantic-destination-endpoint"
-                    string_match:
-                      exact: "endpoint2"
-                route:
-                  cluster: vllm_endpoint2
-                  timeout: 300s
-              - match:
-                  prefix: "/"
-                  headers:
-                  - name: "x-semantic-destination-endpoint"
-                    string_match:
-                      exact: "endpoint3"
-                route:
-                  cluster: vllm_endpoint3
-                  timeout: 300s
-              # Fallback route - will be routed by the external processor
-              - match:
-                  prefix: "/"
-                route:
-                  cluster: vllm_endpoint1  # Default fallback
+                  cluster: vllm_dynamic_cluster
                   timeout: 300s
           http_filters:
           - name: envoy.filters.http.ext_proc
@@ -85,6 +57,8 @@ static_resources:
                 response_header_mode: "SEND"
                 request_body_mode: "BUFFERED"
                 response_body_mode: "BUFFERED"
+                request_trailer_mode: "SKIP"
+                response_trailer_mode: "SKIP"
               failure_mode_allow: true
               message_timeout: 300s
           - name: envoy.filters.http.router
@@ -95,6 +69,7 @@ static_resources:
   clusters:
   - name: extproc_service
     connect_timeout: 300s
+    per_connection_buffer_limit_bytes: 52428800
     type: STATIC
     lb_policy: ROUND_ROBIN
     typed_extension_protocol_options:
@@ -114,64 +89,17 @@ static_resources:
               socket_address:
                 address: 127.0.0.1
                 port_value: 50051
-  
-  # Multiple vLLM backend clusters
-  - name: vllm_endpoint1
-    connect_timeout: 300s
-    type: STRICT_DNS
-    lb_policy: ROUND_ROBIN
-    typed_extension_protocol_options:
-      envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
-        "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
-        explicit_http_config:
-          http_protocol_options: {}
-    load_assignment:
-      cluster_name: vllm_endpoint1
-      endpoints:
-      - lb_endpoints:
-        - endpoint:
-            address:
-              socket_address:
-                address: 192.168.12.90
-                port_value: 11434
-          load_balancing_weight: 1
 
-  - name: vllm_endpoint2
+  # Dynamic vLLM cluster using original destination
+  - name: vllm_dynamic_cluster
     connect_timeout: 300s
-    type: STRICT_DNS
-    lb_policy: ROUND_ROBIN
+    type: ORIGINAL_DST
+    lb_policy: CLUSTER_PROVIDED
+    original_dst_lb_config:
+      use_http_header: true
+      http_header_name: "x-semantic-destination-endpoint"
     typed_extension_protocol_options:
       envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
         "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
         explicit_http_config:
           http_protocol_options: {}
-    load_assignment:
-      cluster_name: vllm_endpoint2
-      endpoints:
-      - lb_endpoints:
-        - endpoint:
-            address:
-              socket_address:
-                address: 192.168.12.90
-                port_value: 11434
-          load_balancing_weight: 1
-
-  - name: vllm_endpoint3
-    connect_timeout: 300s
-    type: STRICT_DNS
-    lb_policy: ROUND_ROBIN
-    typed_extension_protocol_options:
-      envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
-        "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
-        explicit_http_config:
-          http_protocol_options: {}
-    load_assignment:
-      cluster_name: vllm_endpoint3
-      endpoints:
-      - lb_endpoints:
-        - endpoint:
-            address:
-              socket_address:
-                address: 192.168.12.90
-                port_value: 11434
-          load_balancing_weight: 2  # Higher weight for more powerful endpoint
diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go
@@ -458,6 +458,30 @@ func (c *RouterConfig) SelectBestEndpointForModel(modelName string) (string, boo
 	return bestEndpoint.Name, true
 }
 
+// SelectBestEndpointAddressForModel returns the highest-weight endpoint for modelName as "address:port" (first wins on ties).
+// The bool is false, with an empty string, when no endpoints are found. NOTE(review): "%s:%d" does not bracket IPv6 literals.
+func (c *RouterConfig) SelectBestEndpointAddressForModel(modelName string) (string, bool) {
+	endpoints := c.GetEndpointsForModel(modelName)
+	if len(endpoints) == 0 {
+		return "", false
+	}
+
+	// Single endpoint: no weight comparison is needed, so format and return it directly
+	if len(endpoints) == 1 {
+		return fmt.Sprintf("%s:%d", endpoints[0].Address, endpoints[0].Port), true
+	}
+
+	// Scan the remaining endpoints, replacing the candidate only on a strictly greater Weight
+	bestEndpoint := endpoints[0]
+	for _, endpoint := range endpoints[1:] {
+		if endpoint.Weight > bestEndpoint.Weight {
+			bestEndpoint = endpoint
+		}
+	}
+
+	return fmt.Sprintf("%s:%d", bestEndpoint.Address, bestEndpoint.Port), true
+}
+
 // ValidateEndpoints validates that all configured models have at least one endpoint
 func (c *RouterConfig) ValidateEndpoints() error {
 	// Get all models from categories
diff --git a/src/semantic-router/pkg/extproc/endpoint_selection_test.go b/src/semantic-router/pkg/extproc/endpoint_selection_test.go
@@ -77,8 +77,8 @@ var _ = Describe("Endpoint Selection", func() {
 					for _, header := range headerMutation.SetHeaders {
 						if header.Header.Key == "x-semantic-destination-endpoint" {
 							endpointHeaderFound = true
-							// Should be one of the configured endpoints
-							Expect(header.Header.Value).To(BeElementOf("test-endpoint1", "test-endpoint2"))
+							// Should be one of the configured endpoint addresses
+							Expect(header.Header.Value).To(BeElementOf("127.0.0.1:8000", "127.0.0.1:8001"))
 						}
 						if header.Header.Key == "x-selected-model" {
 							modelHeaderFound = true
@@ -148,7 +148,7 @@ var _ = Describe("Endpoint Selection", func() {
 
 					if endpointHeaderFound {
 						// model-a should be routed to test-endpoint1 based on preferred endpoints
-						Expect(selectedEndpoint).To(Equal("test-endpoint1"))
+						Expect(selectedEndpoint).To(Equal("127.0.0.1:8000"))
 					}
 				}
 			})
@@ -207,7 +207,7 @@ var _ = Describe("Endpoint Selection", func() {
 
 					if endpointHeaderFound {
 						// model-b should be routed to test-endpoint2 (higher weight) or test-endpoint1
-						Expect(selectedEndpoint).To(BeElementOf("test-endpoint1", "test-endpoint2"))
+						Expect(selectedEndpoint).To(BeElementOf("127.0.0.1:8000", "127.0.0.1:8001"))
 					}
 				}
 			})
@@ -256,6 +256,11 @@ var _ = Describe("Endpoint Selection", func() {
 			bestEndpoint, found := cfg.SelectBestEndpointForModel("model-b")
 			Expect(found).To(BeTrue())
 			Expect(bestEndpoint).To(BeElementOf("test-endpoint1", "test-endpoint2"))
+
+			// Test best endpoint address selection
+			bestEndpointAddress, found := cfg.SelectBestEndpointAddressForModel("model-b")
+			Expect(found).To(BeTrue())
+			Expect(bestEndpointAddress).To(BeElementOf("127.0.0.1:8000", "127.0.0.1:8001"))
 		})
 	})
 
diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go
@@ -344,10 +344,10 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 				actualModel = matchedModel
 
 				// Select the best endpoint for this model
-				endpoint, endpointFound := r.Config.SelectBestEndpointForModel(matchedModel)
+				endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(matchedModel)
 				if endpointFound {
-					selectedEndpoint = endpoint
-					log.Printf("Selected endpoint: %s for model: %s", selectedEndpoint, matchedModel)
+					selectedEndpoint = endpointAddress
+					log.Printf("Selected endpoint address: %s for model: %s", selectedEndpoint, matchedModel)
 				} else {
 					log.Printf("Warning: No endpoint found for model %s, using fallback", matchedModel)
 				}
@@ -386,7 +386,6 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 					setHeaders = append(setHeaders, &core.HeaderValueOption{
 						Header: &core.HeaderValue{
 							Key:      "x-semantic-destination-endpoint",
-							Value:    selectedEndpoint,
 							RawValue: []byte(selectedEndpoint),
 						},
 					})
@@ -416,10 +415,9 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 					Response: &ext_proc.ProcessingResponse_RequestBody{
 						RequestBody: &ext_proc.BodyResponse{
 							Response: &ext_proc.CommonResponse{
-								ClearRouteCache: true,
-								Status:          ext_proc.CommonResponse_CONTINUE_AND_REPLACE,
-								HeaderMutation:  headerMutation,
-								BodyMutation:    bodyMutation,
+								Status:         ext_proc.CommonResponse_CONTINUE,
+								HeaderMutation: headerMutation,
+								BodyMutation:   bodyMutation,
 							},
 						},
 					},
@@ -444,13 +442,36 @@ func (r *OpenAIRouter) handleModelRouting(openAIRequest *openai.ChatCompletionNe
 		}
 
 		// Select the best endpoint for the specified model
-		endpoint, endpointFound := r.Config.SelectBestEndpointForModel(originalModel)
+		endpointAddress, endpointFound := r.Config.SelectBestEndpointAddressForModel(originalModel)
 		if endpointFound {
-			selectedEndpoint = endpoint
-			log.Printf("Selected endpoint: %s for model: %s", selectedEndpoint, originalModel)
+			selectedEndpoint = endpointAddress
+			log.Printf("Selected endpoint address: %s for model: %s", selectedEndpoint, originalModel)
 		} else {
+			// TODO(Xunzhuo): pick a random endpoint from the list of all available endpoints
 			log.Printf("Warning: No endpoint found for model %s, using fallback", originalModel)
 		}
+		setHeaders := []*core.HeaderValueOption{}
+		if selectedEndpoint != "" {
+			setHeaders = append(setHeaders, &core.HeaderValueOption{
+				Header: &core.HeaderValue{
+					Key:      "x-semantic-destination-endpoint",
+					RawValue: []byte(selectedEndpoint),
+				},
+			})
+		}
+		// Set the response with only the endpoint header mutation; no body mutation is applied here
+		response = &ext_proc.ProcessingResponse{
+			Response: &ext_proc.ProcessingResponse_RequestBody{
+				RequestBody: &ext_proc.BodyResponse{
+					Response: &ext_proc.CommonResponse{
+						Status: ext_proc.CommonResponse_CONTINUE,
+						HeaderMutation: &ext_proc.HeaderMutation{
+							SetHeaders: setHeaders,
+						},
+					},
+				},
+			},
+		}
 	}
 
 	// Save the actual model that will be used for token tracking