Skip to content

Commit 7ea73b7

Browse files
authored
disable prefix-cache-aware decision making for P/D by default (#253)
1 parent 7d3d912 commit 7ea73b7

File tree

1 file changed

+21
-19
lines changed

1 file changed

+21
-19
lines changed

pkg/plugins/profile/pd_profile_handler.go

Lines changed: 21 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ var _ framework.ProfileHandler = &PdProfileHandler{}
3636
// PdProfileHandlerFactory defines the factory function for the PdProfileHandler
3737
func PdProfileHandlerFactory(name string, rawParameters json.RawMessage, _ plugins.Handle) (plugins.Plugin, error) {
3838
parameters := pdProfileHandlerParameters{
39-
Threshold: 100,
39+
Threshold: 0,
4040
DecodeProfile: defaultDecodeProfile,
4141
PrefillProfile: defaultPrefillProfile,
4242
HashBlockSize: prefix.DefaultHashBlockSize,
@@ -99,25 +99,27 @@ func (h *PdProfileHandler) Pick(ctx context.Context, cycleState *types.CycleStat
9999
return map[string]*framework.SchedulerProfile{}
100100
}
101101

102-
// if we're here that means decode profile ran successfully, and we have additional profile configured that didn't run yet,
103-
// which means PD is enabled (otherwise, prefill profile is not configured at all and this profile handler is not used).
104-
// inspect decode execution result to decide if prefill should run or not.
105-
// if the request is short enough, use decode results only and don't run the prefill profile.
106-
hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
107-
prefixState, err := types.ReadCycleStateKey[*prefix.SchedulingContextState](cycleState, prefix.PrefixCachePluginType)
108-
if err != nil {
109-
log.FromContext(ctx).Error(err, "unable to read prefix state")
110-
} else {
111-
decodePod := profileResults[h.decodeProfile].TargetPods[0].GetPod().NamespacedName
112-
hitPrefix := max(prefixState.PrefixCacheServers[prefix.ServerID(decodePod)]-1, 0) // The first hit is always the model name
113-
hitPercentagePrefix = float64(hitPrefix*h.hashBlockSize) / float64(len(request.Prompt))
114-
log.FromContext(ctx).V(logutil.DEBUG).Info("Computed hit percentage for prefix cache", "hitPercentage", hitPercentagePrefix,
115-
"promptLength", len(request.Prompt))
116-
}
102+
if h.pdThreshold > 0 {
103+
// if we're here that means decode profile ran successfully, and we have additional profile configured that didn't run yet,
104+
// which means PD is enabled (otherwise, prefill profile is not configured at all and this profile handler is not used).
105+
// inspect decode execution result to decide if prefill should run or not.
106+
// if the request is short enough, use decode results only and don't run the prefill profile.
107+
hitPercentagePrefix := 0.0 // default to 0, meaning no prefix cache hit
108+
prefixState, err := types.ReadCycleStateKey[*prefix.SchedulingContextState](cycleState, prefix.PrefixCachePluginType)
109+
if err != nil {
110+
log.FromContext(ctx).Error(err, "unable to read prefix state")
111+
} else {
112+
decodePod := profileResults[h.decodeProfile].TargetPods[0].GetPod().NamespacedName
113+
hitPrefix := max(prefixState.PrefixCacheServers[prefix.ServerID(decodePod)]-1, 0) // The first hit is always the model name
114+
hitPercentagePrefix = float64(hitPrefix*h.hashBlockSize) / float64(len(request.Prompt))
115+
log.FromContext(ctx).V(logutil.DEBUG).Info("Computed hit percentage for prefix cache", "hitPercentage", hitPercentagePrefix,
116+
"promptLength", len(request.Prompt))
117+
}
117118

118-
if (1.0-hitPercentagePrefix)*float64(len(request.Prompt)) < float64(h.pdThreshold) {
119-
log.FromContext(ctx).Info("Non-cached suffix is smaller than threshold, using decode profile only", "hitPercentage", hitPercentagePrefix)
120-
return map[string]*framework.SchedulerProfile{} // do not run prefill
119+
if (1.0-hitPercentagePrefix)*float64(len(request.Prompt)) < float64(h.pdThreshold) {
120+
log.FromContext(ctx).Info("Non-cached suffix is smaller than threshold, using decode profile only", "hitPercentage", hitPercentagePrefix)
121+
return map[string]*framework.SchedulerProfile{} // do not run prefill
122+
}
121123
}
122124

123125
// run the prefill profile

0 commit comments

Comments
 (0)