[feat] add distribute-dp api server least_request route (vllm-project#1866)

paranoidRick · web-flow · commit b1c38177e04e · 2026-01-25T20:09:51.000-08:00
Signed-off-by: yangyouchuan <1184540833@qq.com>
diff --git a/pkg/cache/cache_trace.go b/pkg/cache/cache_trace.go
@@ -18,6 +18,7 @@ package cache
 import (
 	"context"
 	"fmt"
+	"strconv"
 	"sync/atomic"
 	"time"
 
@@ -49,6 +50,8 @@ func (c *Store) addPodStats(ctx *types.RoutingContext, requestID string) {
 		return
 	}
 	pod := ctx.TargetPod()
+	port := ctx.TargetPort()
+
 	metaPod, ok := c.metaPods.Load(utils.GeneratePodKey(pod.Namespace, pod.Name))
 	if !ok {
 		klog.Warningf("can't find routing pod: %s, requestID: %s", pod.Name, requestID)
@@ -57,7 +60,11 @@ func (c *Store) addPodStats(ctx *types.RoutingContext, requestID string) {
 
 	// Update running requests
 	requests := atomic.AddInt32(&metaPod.runningRequests, 1)
-	if err := c.updatePodRecord(metaPod, "", metrics.RealtimeNumRequestsRunning, metrics.PodMetricScope, &metrics.SimpleMetricValue{Value: float64(requests)}); err != nil {
+	metricName := metrics.RealtimeNumRequestsRunning
+	if port > 0 {
+		metricName = metricName + "/" + strconv.Itoa(port)
+	}
+	if err := c.updatePodRecord(metaPod, "", metricName, metrics.PodMetricScope, &metrics.SimpleMetricValue{Value: float64(requests)}); err != nil {
 		klog.Warningf("can't update realtime metric: %s, pod: %s, requestID: %s, err: %v", metrics.RealtimeNumRequestsRunning, metaPod.Name, requestID, err)
 	}
 
@@ -86,6 +93,7 @@ func (c *Store) donePodStats(ctx *types.RoutingContext, requestID string) {
 		return
 	}
 	pod := ctx.TargetPod()
+	port := ctx.TargetPort()
 
 	// Now that pendingLoadProvider must be set.
 	metaPod, ok := c.metaPods.Load(utils.GeneratePodKey(pod.Namespace, pod.Name))
@@ -96,7 +104,11 @@ func (c *Store) donePodStats(ctx *types.RoutingContext, requestID string) {
 
 	// Update running requests
 	requests := atomic.AddInt32(&metaPod.runningRequests, -1)
-	if err := c.updatePodRecord(metaPod, ctx.Model, metrics.RealtimeNumRequestsRunning, metrics.PodMetricScope, &metrics.SimpleMetricValue{Value: float64(requests)}); err != nil {
+	metricName := metrics.RealtimeNumRequestsRunning
+	if port > 0 {
+		metricName = metricName + "/" + strconv.Itoa(port)
+	}
+	if err := c.updatePodRecord(metaPod, ctx.Model, metricName, metrics.PodMetricScope, &metrics.SimpleMetricValue{Value: float64(requests)}); err != nil {
 		klog.Warningf("can't update realtime metric: %s, pod: %s, requestID: %s", metrics.RealtimeNumRequestsRunning, pod.Name, requestID)
 	}
 
diff --git a/pkg/plugins/gateway/algorithms/least_load_test.go b/pkg/plugins/gateway/algorithms/least_load_test.go
@@ -98,6 +98,10 @@ func (m *mockPodList) ListByIndex(index string) []*v1.Pod {
 	return nil
 }
 
+// ListPortsForPod is a stub for the PodList method of the same name; the mock
+// tracks no per-pod ports and always returns nil.
+func (m *mockPodList) ListPortsForPod() map[string][]int {
+	return nil
+}
+
 func newMockPodList(pods []*v1.Pod, indexes map[string][]*v1.Pod) *mockPodList {
 	if indexes == nil {
 		indexes = make(map[string][]*v1.Pod)
diff --git a/pkg/plugins/gateway/algorithms/least_request.go b/pkg/plugins/gateway/algorithms/least_request.go
@@ -17,8 +17,11 @@ limitations under the License.
 package routingalgorithms
 
 import (
+	"fmt"
 	"math"
 	"math/rand"
+	"strconv"
+	"strings"
 
 	"github.com/vllm-project/aibrix/pkg/cache"
 	"github.com/vllm-project/aibrix/pkg/metrics"
@@ -45,14 +48,19 @@ func NewLeastRequestRouter() (types.Router, error) {
 		return nil, err
 	}
 
-	return leastRequestRouter{
+	return &leastRequestRouter{
 		cache: c,
 	}, nil
 }
 
 // Route request based of least active request among input ready pods
-func (r leastRequestRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
+func (r *leastRequestRouter) Route(ctx *types.RoutingContext, readyPodList types.PodList) (string, error) {
 	readyPods := readyPodList.All()
+	// Use distributed DP-level API server routing when pods have multiple ports
+	if isMultiPortPods(readyPods) {
+		return r.apiServerRoute(ctx, readyPods, readyPodList.ListPortsForPod())
+	}
+	// Use default Pod-level routing
 	targetPod := selectTargetPodWithLeastRequestCount(r.cache, readyPods)
 
 	// Use fallback if no valid metrics
@@ -68,6 +76,20 @@ func (r leastRequestRouter) Route(ctx *types.RoutingContext, readyPodList types.
 	return ctx.TargetAddress(), nil
 }
 
+// apiServerRoute selects a (pod, port) pair through
+// selectTargetPodAndPortWithLeastRequestCount, records both on the routing
+// context and returns the resulting target address.
+//
+// An error is returned when no pod was selected or the selected port is 0.
+func (r *leastRequestRouter) apiServerRoute(ctx *types.RoutingContext, readyPods []*v1.Pod, portsMap map[string][]int) (string, error) {
+	pod, port := selectTargetPodAndPortWithLeastRequestCount(r.cache, readyPods, portsMap)
+	switch {
+	case pod == nil:
+		return "", fmt.Errorf("no target pod selected")
+	case port == 0:
+		return "", fmt.Errorf("target pod does not have a port")
+	}
+
+	// Pod first, then port: TargetAddress reads both from the context.
+	ctx.SetTargetPod(pod)
+	ctx.SetTargetPort(port)
+	return ctx.TargetAddress(), nil
+}
+
 func (r *leastRequestRouter) SubscribedMetrics() []string {
 	return []string{
 		metrics.RealtimeNumRequestsRunning,
@@ -95,13 +117,67 @@ func selectTargetPodWithLeastRequestCount(cache cache.Cache, readyPods []*v1.Pod
 	return targetPod
 }
 
+// selectTargetPodAndPortWithLeastRequestCount picks the API server (pod and
+// port) with the fewest running requests according to
+// getRequestCountsWithPort, choosing at random among ties.
+//
+// getRequestCountsWithPort produces two key shapes, and both are resolved:
+//   - "<pod>/<port>" for pods with several ports: the port is parsed from the key;
+//   - "<pod>" for pods with exactly one port: the port is taken from portsMap.
+//
+// It returns (nil, 0) when there is no candidate, the key is malformed, or the
+// pod is not part of readyPods, and (pod, 0) when the port part of the key is
+// not a valid integer.
+func selectTargetPodAndPortWithLeastRequestCount(cache cache.Cache, readyPods []*v1.Pod, portsMap map[string][]int) (*v1.Pod, int) {
+	podRequestCount := getRequestCountsWithPort(cache, readyPods, portsMap)
+	if len(podRequestCount) == 0 {
+		return nil, 0
+	}
+	klog.V(4).InfoS("selectTargetPodAndPortWithLeastRequestCount", "podRequestCount", podRequestCount)
+
+	// Collect every server sharing the minimum count.
+	minCount := math.MaxInt32
+	var targetApiServers []string
+	for servername, totalReq := range podRequestCount {
+		if totalReq < minCount {
+			minCount = totalReq
+			targetApiServers = []string{servername}
+		} else if totalReq == minCount {
+			targetApiServers = append(targetApiServers, servername)
+		}
+	}
+
+	if len(targetApiServers) == 0 {
+		return nil, 0
+	}
+
+	// Random selection among candidates
+	selectedServer := targetApiServers[rand.Intn(len(targetApiServers))]
+	podName, portStr, hasPort := strings.Cut(selectedServer, "/")
+	if hasPort && strings.Contains(portStr, "/") {
+		// More than one separator: not a "<pod>/<port>" key.
+		klog.ErrorS(nil, "Invalid server name format", "serverName", selectedServer)
+		return nil, 0
+	}
+
+	readyPodsMap := make(map[string]*v1.Pod, len(readyPods))
+	for _, pod := range readyPods {
+		readyPodsMap[pod.Name] = pod
+	}
+
+	targetPod, found := readyPodsMap[podName]
+	if !found {
+		klog.ErrorS(nil, "Selected pod not found in ready pods list", "podName", podName)
+		return nil, 0
+	}
+
+	if !hasPort {
+		// Bare pod-name key: emitted for pods that expose exactly one port,
+		// so that single port is the target.
+		ports := portsMap[podName]
+		if len(ports) != 1 {
+			klog.ErrorS(nil, "Invalid server name format", "serverName", selectedServer)
+			return nil, 0
+		}
+		return targetPod, ports[0]
+	}
+
+	targetPort, err := strconv.Atoi(portStr)
+	if err != nil {
+		klog.ErrorS(err, "Failed to parse port", "port", portStr)
+		return targetPod, 0
+	}
+
+	return targetPod, targetPort
+}
+
 // getRequestCounts returns running request count for each pod tracked by gateway.
 // Note: Currently, gateway instance tracks active running request counts for each pod locally,
 // if multiple gateway instances are active then state is not shared across them.
 // It is advised to run on leader gateway instance.
 // TODO: Support stateful information sync across gateway instances: https://github.com/vllm-project/aibrix/issues/761
 func getRequestCounts(cache cache.Cache, readyPods []*v1.Pod) map[string]int {
-	podRequestCount := map[string]int{}
+	podRequestCount := make(map[string]int, len(readyPods))
 	for _, pod := range readyPods {
 		runningReq, err := cache.GetMetricValueByPod(pod.Name, pod.Namespace, metrics.RealtimeNumRequestsRunning)
 		if err != nil {
@@ -112,3 +188,45 @@ func getRequestCounts(cache cache.Cache, readyPods []*v1.Pod) map[string]int {
 
 	return podRequestCount
 }
+
+// getRequestCountsWithPort returns running request count for each pod with port tracked by gateway
+func getRequestCountsWithPort(cache cache.Cache, readyPods []*v1.Pod, portsMap map[string][]int) map[string]int {
+	podRequestCount := make(map[string]int)
+	for _, pod := range readyPods {
+		podPorts, exists := portsMap[pod.Name]
+		if !exists || len(podPorts) == 0 {
+			continue
+		}
+
+		for _, port := range podPorts {
+			var metricName string
+			var keyName string
+
+			if len(podPorts) == 1 {
+				metricName = metrics.RealtimeNumRequestsRunning
+				keyName = pod.Name
+			} else {
+				metricName = metrics.RealtimeNumRequestsRunning + "/" + strconv.Itoa(port)
+				keyName = pod.Name + "/" + strconv.Itoa(port)
+			}
+
+			var count int
+			if val, err := cache.GetMetricValueByPod(pod.Name, pod.Namespace, metricName); err == nil && val != nil {
+				count = int(val.GetSimpleValue())
+			}
+			podRequestCount[keyName] = count
+		}
+	}
+
+	return podRequestCount
+}
+
+// isMultiPortPods reports whether at least one of pods is a data-parallel pod
+// according to utils.IsDataParallelPod.
+func isMultiPortPods(pods []*v1.Pod) bool {
+	for i := range pods {
+		if !utils.IsDataParallelPod(pods[i]) {
+			continue
+		}
+		return true
+	}
+	return false
+}
diff --git a/pkg/plugins/gateway/algorithms/prefix_cache_preble_test.go b/pkg/plugins/gateway/algorithms/prefix_cache_preble_test.go
@@ -48,6 +48,10 @@ func (m *MockPodList) ListByIndex(index string) []*v1.Pod {
 	return m.pods
 }
 
+// ListPortsForPod is a stub for the PodList method of the same name; the mock
+// tracks no per-pod ports and always returns nil.
+func (m *MockPodList) ListPortsForPod() map[string][]int {
+	return nil
+}
+
 func createTestRoutingContext(model, message, requestID string) *types.RoutingContext {
 	ctx := context.Background()
 	return types.NewRoutingContext(ctx, RouterPrefixCachePreble, model, message, requestID, "")
diff --git a/pkg/plugins/gateway/algorithms/vtc/vtc_basic_test.go b/pkg/plugins/gateway/algorithms/vtc/vtc_basic_test.go
@@ -106,6 +106,10 @@ func (p *SimplePodList) ListByIndex(index string) []*v1.Pod {
 	return p.pods
 }
 
+// ListPortsForPod is a stub for the PodList method of the same name; this test
+// pod list tracks no per-pod ports and always returns nil.
+func (p *SimplePodList) ListPortsForPod() map[string][]int {
+	return nil
+}
+
 func TestVTCRouterSimple(t *testing.T) {
 	trackerConfig := &VTCConfig{
 		InputTokenWeight:  1.0,
diff --git a/pkg/plugins/gateway/gateway.go b/pkg/plugins/gateway/gateway.go
@@ -180,7 +180,7 @@ func (s *Server) selectTargetPod(ctx *types.RoutingContext, pods types.PodList,
 	if len(readyPods) == 0 {
 		return "", fmt.Errorf("no ready pods for routing")
 	}
-	if len(readyPods) == 1 {
+	if len(readyPods) == 1 && len(utils.GetPortsForPod(readyPods[0])) <= 1 {
 		ctx.SetTargetPod(readyPods[0])
 		return ctx.TargetAddress(), nil
 	}
diff --git a/pkg/types/pod_list.go b/pkg/types/pod_list.go
@@ -30,4 +30,7 @@ type PodList interface {
 
 	// ListByIndex returns a slice of pods that match the given index.
 	ListByIndex(index string) []*v1.Pod
+
+	// ListPortsForPod returns, for each pod keyed by pod name, the list of ports bound to that pod.
+	ListPortsForPod() map[string][]int
 }
diff --git a/pkg/types/router_context.go b/pkg/types/router_context.go
@@ -71,6 +71,7 @@ type RoutingContext struct {
 
 	targetPodSet chan struct{}
 	targetPod    atomic.Pointer[v1.Pod]
+	targetPort   atomic.Int32
 	lastError    atomic.Pointer[error]
 	tokens       []int           // Cache of tokenized prompts
 	predictor    OutputPredictor // OutputPredictor gained from cache
@@ -204,6 +205,14 @@ func (r *RoutingContext) TargetPod() *v1.Pod {
 	return targetPod
 }
 
+// TargetPort atomically loads the target port recorded on the routing context.
+// It is 0 when no port has been stored.
+func (r *RoutingContext) TargetPort() int {
+	stored := r.targetPort.Load()
+	return int(stored)
+}
+
+// SetTargetPort atomically stores port, converted to int32, as the target port
+// of the routing context.
+func (r *RoutingContext) SetTargetPort(port int) {
+	narrowed := int32(port)
+	r.targetPort.Store(narrowed)
+}
+
 // GetError returns the error of the routing context.
 func (r *RoutingContext) GetError() error {
 	if r.TargetPod() == nil {
@@ -218,6 +227,11 @@ func (r *RoutingContext) TargetAddress() string {
 	if pod == nil {
 		return ""
 	}
+
+	port := r.TargetPort()
+	if port != 0 {
+		return r.targetAddressWithPort(pod.Status.PodIP, port)
+	}
 	return r.targetAddress(r.TargetPod())
 }
 
@@ -256,6 +270,10 @@ func (r *RoutingContext) targetAddress(pod *v1.Pod) string {
 	return fmt.Sprintf("%v:%v", pod.Status.PodIP, utils.GetModelPortForPod(r.RequestID, pod))
 }
 
+// targetAddressWithPort formats podIP and port as "<ip>:<port>".
+func (r *RoutingContext) targetAddressWithPort(podIP string, port int) string {
+	address := fmt.Sprintf("%s:%d", podIP, port)
+	return address
+}
+
 func (r *RoutingContext) getError() (err error) {
 	errAddr := r.lastError.Load()
 	if errAddr != nil {
diff --git a/pkg/utils/pod_array.go b/pkg/utils/pod_array.go
@@ -118,3 +118,22 @@ func (arr *PodArray) initDeployments() {
 	arr.deployments = deployments
 	arr.podsByDeployment = podsByDeployment
 }
+
+// ListPortsForPod maps each pod name in the array to the ports reported by
+// GetPortsForPod for that pod.
+//
+// It returns nil when the array holds no pods. A pod for which GetPortsForPod
+// reports no ports is mapped to an empty, non-nil slice.
+func (arr *PodArray) ListPortsForPod() map[string][]int {
+	all := arr.All()
+	if len(all) == 0 {
+		return nil
+	}
+
+	portsByPod := make(map[string][]int, len(all))
+	for _, pod := range all {
+		ports := GetPortsForPod(pod)
+		if len(ports) == 0 {
+			portsByPod[pod.Name] = []int{}
+			continue
+		}
+		// Appending to the existing entry copies the ports into the map's own slice.
+		portsByPod[pod.Name] = append(portsByPod[pod.Name], ports...)
+	}
+
+	return portsByPod
+}
diff --git a/pkg/utils/util.go b/pkg/utils/util.go

Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,7 @@ func (s Server) selectTargetPod(ctx types.RoutingContext, pods types.PodList,`
`180`	`180`	`if len(readyPods) == 0 {`
`181`	`181`	`return "", fmt.Errorf("no ready pods for routing")`
`182`	`182`	`}`
`183`		`- if len(readyPods) == 1 {`
	`183`	`+ if len(readyPods) == 1 && len(utils.GetPortsForPod(readyPods[0])) <= 1 {`
`184`	`184`	`ctx.SetTargetPod(readyPods[0])`
`185`	`185`	`return ctx.TargetAddress(), nil`
`186`	`186`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,4 +30,7 @@ type PodList interface {`
`30`	`30`
`31`	`31`	`// ListByIndex returns a slice of pods that match the given index.`
`32`	`32`	`ListByIndex(index string) []*v1.Pod`
	`33`	`+`
	`34`	`+ // ListPortsForPod returns a map of portList that bind with pod, key podname`
	`35`	`+ ListPortsForPod() map[string][]int`
`33`	`36`	`}`