@@ -36,7 +36,7 @@ var _ framework.ProfileHandler = &PdProfileHandler{}
36
36
// PdProfileHandlerFactory defines the factory function for the PdProfileHandler
37
37
func PdProfileHandlerFactory (name string , rawParameters json.RawMessage , _ plugins.Handle ) (plugins.Plugin , error ) {
38
38
parameters := pdProfileHandlerParameters {
39
- Threshold : 100 ,
39
+ Threshold : 0 ,
40
40
DecodeProfile : defaultDecodeProfile ,
41
41
PrefillProfile : defaultPrefillProfile ,
42
42
HashBlockSize : prefix .DefaultHashBlockSize ,
@@ -99,25 +99,27 @@ func (h *PdProfileHandler) Pick(ctx context.Context, cycleState *types.CycleStat
99
99
return map [string ]* framework.SchedulerProfile {}
100
100
}
101
101
102
- // if we're here that means decode profile ran successfully, and we have additional profile configured that didn't run yet,
103
- // which means PD is enabled (otherwise, prefill profile is not configured at all and this profile handler is not used).
104
- // inspect decode execution result to decide if prefill should run or not.
105
- // if the request is short enough, use decode results only and don't run the prefill profile.
106
- hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
107
- prefixState , err := types .ReadCycleStateKey [* prefix.SchedulingContextState ](cycleState , prefix .PrefixCachePluginType )
108
- if err != nil {
109
- log .FromContext (ctx ).Error (err , "unable to read prefix state" )
110
- } else {
111
- decodePod := profileResults [h .decodeProfile ].TargetPods [0 ].GetPod ().NamespacedName
112
- hitPrefix := max (prefixState .PrefixCacheServers [prefix .ServerID (decodePod )]- 1 , 0 ) // The first hit is always the model name
113
- hitPercentagePrefix = float64 (hitPrefix * h .hashBlockSize ) / float64 (len (request .Prompt ))
114
- log .FromContext (ctx ).V (logutil .DEBUG ).Info ("Computed hit percentage for prefix cache" , "hitPercentage" , hitPercentagePrefix ,
115
- "promptLength" , len (request .Prompt ))
116
- }
102
+ if h .pdThreshold > 0 {
103
+ // if we're here that means decode profile ran successfully, and we have additional profile configured that didn't run yet,
104
+ // which means PD is enabled (otherwise, prefill profile is not configured at all and this profile handler is not used).
105
+ // inspect decode execution result to decide if prefill should run or not.
106
+ // if the request is short enough, use decode results only and don't run the prefill profile.
107
+ hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
108
+ prefixState , err := types .ReadCycleStateKey [* prefix.SchedulingContextState ](cycleState , prefix .PrefixCachePluginType )
109
+ if err != nil {
110
+ log .FromContext (ctx ).Error (err , "unable to read prefix state" )
111
+ } else {
112
+ decodePod := profileResults [h .decodeProfile ].TargetPods [0 ].GetPod ().NamespacedName
113
+ hitPrefix := max (prefixState .PrefixCacheServers [prefix .ServerID (decodePod )]- 1 , 0 ) // The first hit is always the model name
114
+ hitPercentagePrefix = float64 (hitPrefix * h .hashBlockSize ) / float64 (len (request .Prompt ))
115
+ log .FromContext (ctx ).V (logutil .DEBUG ).Info ("Computed hit percentage for prefix cache" , "hitPercentage" , hitPercentagePrefix ,
116
+ "promptLength" , len (request .Prompt ))
117
+ }
117
118
118
- if (1.0 - hitPercentagePrefix )* float64 (len (request .Prompt )) < float64 (h .pdThreshold ) {
119
- log .FromContext (ctx ).Info ("Non-cached suffix is smaller than threshold, using decode profile only" , "hitPercentage" , hitPercentagePrefix )
120
- return map [string ]* framework.SchedulerProfile {} // do not run prefill
119
+ if (1.0 - hitPercentagePrefix )* float64 (len (request .Prompt )) < float64 (h .pdThreshold ) {
120
+ log .FromContext (ctx ).Info ("Non-cached suffix is smaller than threshold, using decode profile only" , "hitPercentage" , hitPercentagePrefix )
121
+ return map [string ]* framework.SchedulerProfile {} // do not run prefill
122
+ }
121
123
}
122
124
123
125
// run the prefill profile
0 commit comments