@@ -35,19 +35,6 @@ import (
3535 logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
3636)
3737
38- type PodPredictionResult struct {
39- Pod schedulingtypes.Pod
40- TTFT float64
41- TPOT float64
42- TTFTValid bool
43- TPOTValid bool
44- IsValid bool
45- Error error
46- Headroom float64 // Headroom for the pod, if applicable
47- TTFTHeadroom float64 // TTFT headroom for the pod
48- PrefixCacheScore float64 // Prefix cache score for the pod
49- }
50-
5138type SLOAwareRouter struct {
5239 tn plugins.TypedName
5340 latencypredictor latencypredictor.PredictorInterface
@@ -126,6 +113,48 @@ func (s *SLOAwareRouter) epsilonGreedyAffinityGate(
126113 return eligible , true
127114}
128115
116+ // scoreWithoutPredictions is the fallback scorer used when latency predictions are unavailable: every pod starts at score 0,
117+ // and the single pod returned by selectFromCompositeScores (HeadroomStrategyCompositeOnly) is raised to 1; empty pods yields an empty map.
118+ func (s * SLOAwareRouter ) scoreWithoutPredictions (
119+ ctx context.Context ,
120+ state * schedulingtypes.CycleState ,
121+ pods []schedulingtypes.Pod ,
122+ r * rand.Rand ,
123+ ) map [schedulingtypes.Pod ]float64 {
124+ logger := log .FromContext (ctx )
125+ logger .V (logutil .TRACE ).Info ("Using composite-only scoring without predictions" )
126+
127+ scores := make (map [schedulingtypes.Pod ]float64 , len (pods ))
128+ for _ , pod := range pods {
129+ scores [pod ] = 0
130+ }
131+
132+ if len (pods ) == 0 {
133+ return scores
134+ }
135+
136+ // Build one result per pod that carries only the pod, its prefix cache score, and IsValid; no prediction fields are populated
137+ podResults := make ([]PodPredictionResult , 0 , len (pods ))
138+ for _ , pod := range pods {
139+ prefixScore := s .getPrefixCacheScoreForPod (ctx , state , pod )
140+ podResults = append (podResults , PodPredictionResult {
141+ Pod : pod ,
142+ PrefixCacheScore : prefixScore ,
143+ IsValid : true , // No predictions were made on this path, so no pod is marked invalid
144+ })
145+ }
146+
147+ // Pick a single pod with the composite-only strategy. NOTE(review): which inputs besides PrefixCacheScore feed the composite is decided inside selectFromCompositeScores - confirm there
148+ selectedPod := s .selectFromCompositeScores (ctx , podResults , r , HeadroomStrategyCompositeOnly )
149+
150+ if selectedPod != nil {
151+ scores [selectedPod ] = 1
152+ logger .V (logutil .TRACE ).Info ("Selected pod using composite-only scoring" , "pod" , selectedPod .GetPod ().String ())
153+ }
154+
155+ return scores
156+ }
157+
129158func (s * SLOAwareRouter ) Score (ctx context.Context , state * schedulingtypes.CycleState , request * schedulingtypes.LLMRequest , pods []schedulingtypes.Pod ) map [schedulingtypes.Pod ]float64 {
130159 logger := log .FromContext (ctx )
131160 if s .latencypredictor == nil {
@@ -158,11 +187,6 @@ func (s *SLOAwareRouter) Score(ctx context.Context, state *schedulingtypes.Cycle
158187 return nil
159188 }
160189
161- predictions := s .generatePredictions (ctx , state , request , sloCtx , pods )
162- s .updateRequestContextWithPredictions (sloCtx , predictions )
163-
164- allPreds := append ([]PodPredictionResult (nil ), predictions ... )
165-
166190 // Initialize scores map with all pods having score 0
167191 scores := make (map [schedulingtypes.Pod ]float64 , len (pods ))
168192 for _ , pod := range pods {
@@ -171,6 +195,17 @@ func (s *SLOAwareRouter) Score(ctx context.Context, state *schedulingtypes.Cycle
171195
172196 source := rand .NewSource (time .Now ().UnixNano ())
173197 r := rand .New (source )
198+
199+ predictions , err := s .generatePredictions (ctx , state , request , sloCtx , pods )
200+ if err != nil {
201+ logger .V (logutil .DEBUG ).Error (err , "SLOAwareRouter: Error generating predictions, falling back to composite-only scoring" )
202+ // Fall back to composite-only scoring using prefix cache scores
203+ return s .scoreWithoutPredictions (ctx , state , pods , r )
204+ }
205+ s .updateRequestContextWithPredictions (sloCtx , predictions )
206+
207+ allPreds := append ([]PodPredictionResult (nil ), predictions ... )
208+
174209 allPreds , sticky := s .epsilonGreedyAffinityGate (ctx , allPreds , r , "overall" , AffinityGateTauGlobal )
175210
176211 // Check if all pods are invalid and all have running requests
0 commit comments