- 
                Notifications
    You must be signed in to change notification settings 
- Fork 37
Change time-to-first-token parameter to be based on number of request tokens #137 #165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
0b7c39e
              e0d61de
              a199aea
              cecb32c
              0c80d58
              18d3075
              1fd0a9a
              1e8f33d
              dff8d3d
              0910dbf
              5f9fe1b
              049c10e
              9886b94
              4ae89f2
              904e18d
              4078dbd
              91e702f
              8430ea3
              a5305c8
              b74b3aa
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -495,7 +495,7 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) { | |
| model: displayModel, | ||
| doRemotePrefill: req.IsDoRemotePrefill(), | ||
| }, | ||
| responseTokens, toolCalls, finishReason, usageDataToSend, | ||
| usageDataToSend.PromptTokens, responseTokens, toolCalls, finishReason, usageDataToSend, | ||
| ) | ||
| } else { | ||
| if req.IsDoRemoteDecode() { | ||
|  | @@ -646,8 +646,9 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques | |
| } | ||
|  | ||
| // calculate how long to wait before returning the response, time is based on number of tokens | ||
| numOfTokens := usageData.CompletionTokens | ||
| totalMillisToWait := s.getTimeToFirstToken(doRemotePrefill) + s.getTotalInterTokenLatency(numOfTokens) | ||
| nPromptTokens := usageData.PromptTokens | ||
| nGenTokens := usageData.CompletionTokens | ||
| totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens) | ||
| time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond) | ||
|  | ||
| ctx.Response.Header.SetContentType("application/json") | ||
|  | @@ -665,13 +666,19 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques | |
| } | ||
|  | ||
| // returns time to first token based on the current request's doRemotePrefill | ||
| func (s *VllmSimulator) getTimeToFirstToken(doRemotePrefill bool) int { | ||
| mean := float64(s.config.TimeToFirstToken) | ||
| stddev := float64(s.config.TimeToFirstTokenStdDev) | ||
| if doRemotePrefill { | ||
| mean = float64(s.config.KVCacheTransferLatency) | ||
| stddev = float64(s.config.KVCacheTransferLatencyStdDev) | ||
| func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int { | ||
| if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 { | ||
| return s.calcPrefillOverhead(nPromptTokens, doRemotePrefill) | ||
| } | ||
|  | ||
| if !doRemotePrefill { | ||
| mean := float64(s.config.TimeToFirstToken) | ||
| stddev := float64(s.config.TimeToFirstTokenStdDev) | ||
| return int(common.RandomNorm(mean, stddev)) | ||
| } | ||
|  | ||
| mean := float64(s.config.KVCacheTransferLatency) | ||
| stddev := float64(s.config.KVCacheTransferLatencyStdDev) | ||
| return int(common.RandomNorm(mean, stddev)) | ||
| } | ||
|  | ||
|  | @@ -691,6 +698,30 @@ func (s *VllmSimulator) getTotalInterTokenLatency(numOfTokens int) int { | |
| return total | ||
| } | ||
|  | ||
| // calc the prefill overhead against number of tokens | ||
| func (s *VllmSimulator) calcPrefillOverhead(nPromptTokens int, doRemotePrefill bool) int { | ||
| if !doRemotePrefill { | ||
| constOverhead := s.config.PrefillOverhead | ||
|          | ||
| ptpt := s.config.PrefillTimePerToken | ||
| prefillTime := constOverhead + nPromptTokens*ptpt | ||
|  | ||
| stdDev := s.config.PrefillTimeStdDev | ||
| return int(common.RandomNorm(float64(prefillTime), float64(stdDev))) | ||
| } | ||
|  | ||
| if s.config.KVCacheTransferLatency != 0 || s.config.KVCacheTransferLatencyStdDev != 0 { | ||
|          | ||
| mean := float64(s.config.KVCacheTransferLatency) | ||
| stddev := float64(s.config.KVCacheTransferLatencyStdDev) | ||
| return int(common.RandomNorm(mean, stddev)) | ||
| } | ||
|  | ||
| kvCacheTransTPT := s.config.KVCacheTransferTimePerToken | ||
| kvCacheTransT := kvCacheTransTPT * nPromptTokens | ||
|  | ||
| stdDev := s.config.KVCacheTransferTimeStdDev | ||
| return int(common.RandomNorm(float64(kvCacheTransT), float64(stdDev))) | ||
| } | ||
|  | ||
| // createModelsResponse creates and returns ModelResponse for the current state, returned array of models contains the base model + LoRA adapters if exist | ||
| func (s *VllmSimulator) createModelsResponse() *vllmapi.ModelsResponse { | ||
| modelsResp := vllmapi.ModelsResponse{Object: "list", Data: []vllmapi.ModelsResponseModelInfo{}} | ||
|  | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It could stay as the old code, where mean and stddev are calculated according to doRemotePrefill and randomInt is called from one place.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is resolved in the change below.