@@ -11,7 +11,7 @@ import (
1111
1212type Filter interface {
1313 Name () string
14- Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
14+ Filter (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
1515}
1616
1717// filter applies current filterFunc, and then recursively applies next filters depending on success or
@@ -41,42 +41,46 @@ func (f *filter) Name() string {
4141 return f .name
4242}
4343
44- func (f * filter ) Filter (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45- if f == nil {
46- klog .V (3 ).Infof ("Running nil filter, returning all input pods by default" )
47- return pods , nil
48- }
49- klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , b , len (pods ))
44+ func (f * filter ) Filter (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
45+ klog .V (3 ).Infof ("Running filter %q on request %v with %v pods" , f .name , req , len (pods ))
5046
51- filtered , err := f .filter (b , pods )
47+ filtered , err := f .filter (req , pods )
5248
5349 next := f .nextOnSuccessOrFailure
54- if err == nil {
55- klog .V (3 ).Infof ("onSuccess %v -> %v, filtered: %v" , f .name , next .Name (), len (filtered ))
50+ if err == nil && len (filtered ) > 0 {
51+ if f .nextOnSuccess == nil && f .nextOnSuccessOrFailure == nil {
52+ // No succeeding filters to run, return.
53+ return filtered , err
54+ }
5655 if f .nextOnSuccess != nil {
5756 next = f .nextOnSuccess
5857 }
58+ klog .V (3 ).Infof ("onSuccess %q -> %q, filtered: %v" , f .name , next .Name (), len (filtered ))
5959 // On success, pass the filtered result to the next filter.
60- return next .Filter (b , filtered )
61- }
62-
63- klog .V (3 ).Infof ("onFailure %v -> %v" , f .name , next .Name ())
64- if f .nextOnFailure != nil {
65- next = f .nextOnFailure
60+ return next .Filter (req , filtered )
61+ } else {
62+ if f .nextOnFailure == nil && f .nextOnSuccessOrFailure == nil {
63+ // No succeeding filters to run, return.
64+ return filtered , err
65+ }
66+ if f .nextOnFailure != nil {
67+ next = f .nextOnFailure
68+ }
69+ klog .V (3 ).Infof ("onFailure %q -> %q" , f .name , next .Name ())
70+ // On failure, pass the initial set of pods to the next filter.
71+ return next .Filter (req , pods )
6672 }
67- // On failure, pass the initial set of pods to the next filter.
68- return next .Filter (b , pods )
6973}
7074
7175// filterFunc filters a set of input pods to a subset.
72- type filterFunc func (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
76+ type filterFunc func (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error )
7377
7478// toFilterFunc is a helper function to convert a per-pod filter func to a filterFunc.
7579func toFilterFunc (pp podPredicate ) filterFunc {
76- return func (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
80+ return func (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
7781 filtered := []* backend.PodMetrics {}
7882 for _ , pod := range pods {
79- pass := pp (b , pod )
83+ pass := pp (req , pod )
8084 if pass {
8185 filtered = append (filtered , pod )
8286 }
@@ -95,7 +99,7 @@ func toFilterFunc(pp podPredicate) filterFunc {
9599// the least one as it gives more choices for the next filter, which on aggregate gave better
96100// results.
97101// TODO: Compare this strategy with other strategies such as top K.
98- func leastQueuingFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
102+ func leastQueuingFilterFunc (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
99103 min := math .MaxInt
100104 max := 0
101105 filtered := []* backend.PodMetrics {}
@@ -123,9 +127,9 @@ func leastQueuingFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
123127// should consider them all instead of the absolute minimum one. This worked better than picking the
124128// least one as it gives more choices for the next filter, which on aggregate gave better results.
125129// TODO: Compare this strategy with other strategies such as top K.
126- func leastKVCacheFilterFunc (b * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
130+ func leastKVCacheFilterFunc (req * LLMRequest , pods []* backend.PodMetrics ) ([]* backend.PodMetrics , error ) {
127131 min := math .MaxFloat64
128- max := math . SmallestNonzeroFloat64
132+ var max float64 = 0
129133 filtered := []* backend.PodMetrics {}
130134
131135 for _ , pod := range pods {
@@ -146,10 +150,21 @@ func leastKVCacheFilterFunc(b *LLMRequest, pods []*backend.PodMetrics) ([]*backe
146150}
147151
148152// podPredicate is a filter function to check whether a pod is desired.
149- type podPredicate func (b * LLMRequest , pod * backend.PodMetrics ) bool
153+ type podPredicate func (req * LLMRequest , pod * backend.PodMetrics ) bool
154+
155+ // We consider serving an adapter low cost if the adapter is active in the model server, or the
156+ // model server has room to load the adapter.
157+ func lowLoRACostPredicate (req * LLMRequest , pod * backend.PodMetrics ) bool {
158+ _ , ok := pod .ActiveModels [req .ResolvedTargetModel ]
159+ return ok || len (pod .ActiveModels ) < pod .MaxActiveModels
160+ }
150161
151- // loraAffinityPredicate return true if the pod have the requested LoRA adapter loaded.
152- func loraAffinityPredicate (b * LLMRequest , pod * backend.PodMetrics ) bool {
153- _ , ok := pod .CachedModels [b .ResolvedTargetModel ]
154- return ok
162+ func criticalRequestPredicate (req * LLMRequest , pod * backend.PodMetrics ) bool {
163+ return req .Critical
164+ }
165+
166+ func noQueueAndLessThanKVCacheThresholdPredicate (queueThreshold int , kvCacheThreshold float64 ) podPredicate {
167+ return func (req * LLMRequest , pod * backend.PodMetrics ) bool {
168+ return pod .WaitingQueueSize <= queueThreshold && pod .KVCacheUsagePercent <= kvCacheThreshold
169+ }
155170}
0 commit comments