@@ -43,24 +43,24 @@ func (f *filter) Name() string {
4343
4444func (f * filter ) Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
4545 if f == nil {
46- klog .V (2 ).Infof ("Running nil filter, returning all input pods by default" )
46+ klog .V (3 ).Infof ("Running nil filter, returning all input pods by default" )
4747 return pods , nil
4848 }
49- klog .V (2 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
49+ klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
5050
5151 filtered , err := f .filter (b , pods )
5252
5353 next := f .nextOnSuccessOrFailure
5454 if err == nil {
55- klog .V (2 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
55+ klog .V (3 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
5656 if f .nextOnSuccess != nil {
5757 next = f .nextOnSuccess
5858 }
5959 // On success, pass the filtered result to the next filter.
6060 return next .Filter (b , filtered )
6161 }
6262
63- klog .V (2 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
63+ klog .V (3 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
6464 if f .nextOnFailure != nil {
6565 next = f .nextOnFailure
6666 }
@@ -88,32 +88,57 @@ func toFilterFunc(pp podPredicate) filterFunc {
8888 }
8989}
9090
91+ // leastQueuingFilterFunc finds the max and min queue size of all pods, divides the whole range
92+ // (max-min) by the number of pods, and finds the pods that fall into the first range.
93+ // The intuition is that if there are multiple pods that share similar queue size in the low range,
94+ // we should consider them all instead of the absolute minimum one. This worked better than picking
95+ // the least one as it gives more choices for the next filter, which on aggregate gave better
96+ // results.
97+ // TODO: Compare this strategy with other strategies such as top K.
9198func leastQueuingFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
9299 min := math .MaxInt
100+ max := 0
93101 filtered := []* backend.PodMetrics {}
102+
94103 for _ , pod := range pods {
95- if pod .WaitingQueueSize < min {
104+ if pod .WaitingQueueSize <= min {
96105 min = pod .WaitingQueueSize
97- filtered = []* backend.PodMetrics {}
98106 }
99- if pod .WaitingQueueSize == min {
107+ if pod .WaitingQueueSize >= max {
108+ max = pod .WaitingQueueSize
109+ }
110+ }
111+
112+ for _ , pod := range pods {
113+ if pod .WaitingQueueSize >= min && pod .WaitingQueueSize <= min + (max - min )/ len (pods ) {
100114 filtered = append (filtered , pod )
101115 }
102116 }
103117 return filtered , nil
104118}
105119
120+ // leastKVCacheFilterFunc finds the max and min KV cache of all pods, divides the whole range
121+ // (max-min) by the number of pods, and finds the pods that fall into the first range.
122+ // The intuition is that if there are multiple pods that share similar KV cache in the low range, we
123+ // should consider them all instead of the absolute minimum one. This worked better than picking the
124+ // least one as it gives more choices for the next filter, which on aggregate gave better results.
125+ // TODO: Compare this strategy with other strategies such as top K.
106126func leastKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
107- min := math .MaxInt
127+ min := math .MaxFloat64
128+ max := math .SmallestNonzeroFloat64
108129 filtered := []* backend.PodMetrics {}
109- margin := 5
130+
110131 for _ , pod := range pods {
111- cur := int (pod .KVCacheUsagePercent ) / margin
112- if cur < min {
113- min = cur
114- filtered = []* backend.PodMetrics {}
132+ if pod .KVCacheUsagePercent <= min {
133+ min = pod .KVCacheUsagePercent
134+ }
135+ if pod .KVCacheUsagePercent >= max {
136+ max = pod .KVCacheUsagePercent
115137 }
116- if cur == min {
138+ }
139+
140+ for _ , pod := range pods {
141+ if pod .KVCacheUsagePercent >= min && pod .KVCacheUsagePercent <= min + (max - min )/ float64 (len (pods )) {
117142 filtered = append (filtered , pod )
118143 }
119144 }
0 commit comments