@@ -36,7 +36,7 @@ var _ framework.ProfileHandler = &PdProfileHandler{}
3636// PdProfileHandlerFactory defines the factory function for the PdProfileHandler
3737func PdProfileHandlerFactory (name string , rawParameters json.RawMessage , _ plugins.Handle ) (plugins.Plugin , error ) {
3838 parameters := pdProfileHandlerParameters {
39- Threshold : 100 ,
39+ Threshold : 0 ,
4040 DecodeProfile : defaultDecodeProfile ,
4141 PrefillProfile : defaultPrefillProfile ,
4242 HashBlockSize : prefix .DefaultHashBlockSize ,
@@ -99,25 +99,27 @@ func (h *PdProfileHandler) Pick(ctx context.Context, cycleState *types.CycleStat
9999 return map [string ]* framework.SchedulerProfile {}
100100 }
101101
102- // if we're here that means decode profile ran successfully, and we have additional profile configured that didn't run yet,
103- // which means PD is enabled (otherwise, prefill profile is not configured at all and this profile handler is not used).
104- // inspect decode execution result to decide if prefill should run or not.
105- // if the request is short enough, use decode results only and don't run the prefill profile.
106- hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
107- prefixState , err := types .ReadCycleStateKey [* prefix.SchedulingContextState ](cycleState , prefix .PrefixCachePluginType )
108- if err != nil {
109- log .FromContext (ctx ).Error (err , "unable to read prefix state" )
110- } else {
111- decodePod := profileResults [h .decodeProfile ].TargetPods [0 ].GetPod ().NamespacedName
112- hitPrefix := max (prefixState .PrefixCacheServers [prefix .ServerID (decodePod )]- 1 , 0 ) // The first hit is always the model name
113- hitPercentagePrefix = float64 (hitPrefix * h .hashBlockSize ) / float64 (len (request .Prompt ))
114- log .FromContext (ctx ).V (logutil .DEBUG ).Info ("Computed hit percentage for prefix cache" , "hitPercentage" , hitPercentagePrefix ,
115- "promptLength" , len (request .Prompt ))
116- }
102+ if h .pdThreshold > 0 {
103+ // If we're here, the decode profile ran successfully and an additional profile is configured that hasn't run yet,
104+ // which means PD is enabled (otherwise the prefill profile is not configured at all and this profile handler is not used).
105+ // Inspect the decode execution result to decide whether the prefill profile should run.
106+ // If the non-cached part of the prompt is shorter than the threshold, use the decode results only and don't run the prefill profile.
107+ hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
108+ prefixState , err := types .ReadCycleStateKey [* prefix.SchedulingContextState ](cycleState , prefix .PrefixCachePluginType )
109+ if err != nil {
110+ log .FromContext (ctx ).Error (err , "unable to read prefix state" )
111+ } else {
112+ decodePod := profileResults [h .decodeProfile ].TargetPods [0 ].GetPod ().NamespacedName
113+ hitPrefix := max (prefixState .PrefixCacheServers [prefix .ServerID (decodePod )]- 1 , 0 ) // The first hit is always the model name
114+ hitPercentagePrefix = float64 (hitPrefix * h .hashBlockSize ) / float64 (len (request .Prompt ))
115+ log .FromContext (ctx ).V (logutil .DEBUG ).Info ("Computed hit percentage for prefix cache" , "hitPercentage" , hitPercentagePrefix ,
116+ "promptLength" , len (request .Prompt ))
117+ }
117118
118- if (1.0 - hitPercentagePrefix )* float64 (len (request .Prompt )) < float64 (h .pdThreshold ) {
119- log .FromContext (ctx ).Info ("Non-cached suffix is smaller than threshold, using decode profile only" , "hitPercentage" , hitPercentagePrefix )
120- return map [string ]* framework.SchedulerProfile {} // do not run prefill
119+ if (1.0 - hitPercentagePrefix )* float64 (len (request .Prompt )) < float64 (h .pdThreshold ) {
120+ log .FromContext (ctx ).Info ("Non-cached suffix is smaller than threshold, using decode profile only" , "hitPercentage" , hitPercentagePrefix )
121+ return map [string ]* framework.SchedulerProfile {} // do not run prefill
122+ }
121123 }
122124
123125 // run the prefill profile
0 commit comments