Skip to content

Commit 639b40e

Browse files
authored
Take cached prompt tokens into account in prefill time calculation (#184)
* Take cached prompt tokens into account in prefill time calculation

  Signed-off-by: Ira <[email protected]>

* Review comments

  Signed-off-by: Ira <[email protected]>

---------

Signed-off-by: Ira <[email protected]>
1 parent 5821371 commit 639b40e

File tree

8 files changed

+127
-67
lines changed

8 files changed

+127
-67
lines changed

pkg/common/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,9 @@ func (c *Configuration) validate() error {
340340
if c.KVCacheTransferTimeStdDev < 0 {
341341
return errors.New("kv-cache tranfer time standard deviation cannot be negative")
342342
}
343+
if float32(c.KVCacheTransferTimeStdDev) > 0.3*float32(c.KVCacheTransferTimePerToken) {
344+
return errors.New("kv-cache tranfer time standard deviation cannot be more than 30% of kv-cache tranfer time")
345+
}
343346

344347
if c.KVCacheTransferLatency < 0 {
345348
return errors.New("kv-cache tranfer time cannot be negative")

pkg/kv-cache/block_cache.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,14 @@ func (b *blockCache) start(ctx context.Context) {
7676
}
7777

7878
// startRequest adds a request with its associated block hashes to the cache
79-
func (bc *blockCache) startRequest(requestID string, blocks []uint64) error {
79+
// and returns the number of blocks that were already in the cache
80+
func (bc *blockCache) startRequest(requestID string, blocks []uint64) (int, error) {
8081
bc.mu.Lock()
8182
defer bc.mu.Unlock()
8283

8384
if _, exists := bc.requestToBlocks[requestID]; exists {
8485
// request with the same id already exists
85-
return fmt.Errorf("request already exists for id %s", requestID)
86+
return 0, fmt.Errorf("request already exists for id %s", requestID)
8687
}
8788

8889
// divide list of blocks to three lists:
@@ -107,7 +108,7 @@ func (bc *blockCache) startRequest(requestID string, blocks []uint64) error {
107108
}
108109

109110
if len(bc.usedBlocks)+len(blocksToAdd)+len(blockToMoveToUsed) > bc.maxBlocks {
110-
return errors.New(capacityError)
111+
return 0, errors.New(capacityError)
111112
}
112113

113114
// for blocks that are already in use - update the reference
@@ -148,7 +149,7 @@ func (bc *blockCache) startRequest(requestID string, blocks []uint64) error {
148149
bc.requestToBlocks[requestID] = make([]uint64, len(blocks))
149150
copy(bc.requestToBlocks[requestID], blocks)
150151

151-
return nil
152+
return len(blockAreadyInUse) + len(blockToMoveToUsed), nil
152153
}
153154

154155
// finishRequest processes the completion of a request, decreasing reference counts
@@ -159,7 +160,7 @@ func (bc *blockCache) finishRequest(requestID string) error {
159160
// Get blocks associated with this request
160161
blockHashes, exists := bc.requestToBlocks[requestID]
161162
if !exists {
162-
return errors.New("request not found")
163+
return nil
163164
}
164165

165166
now := time.Now()

pkg/kv-cache/kv_cache.go

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type KVCacheHelper struct {
3232
tokensProcessor kvblock.TokenProcessor // turns tokens to kv block keys
3333
logger logr.Logger
3434
blockCache *blockCache
35+
blockSize int
3536
}
3637

3738
func NewKVCacheHelper(config *common.Configuration, logger logr.Logger) (*KVCacheHelper, error) {
@@ -59,6 +60,7 @@ func NewKVCacheHelper(config *common.Configuration, logger logr.Logger) (*KVCach
5960
tokensProcessor: tokensProcessor,
6061
blockCache: blockCache,
6162
logger: logger,
63+
blockSize: config.TokenBlockSize,
6264
}, nil
6365
}
6466

@@ -78,7 +80,7 @@ func (h *KVCacheHelper) OnRequestStart(vllmReq openaiserverapi.CompletionRequest
7880
tokens, _, err := h.tokenizer.Encode(prompt, modelName)
7981
if err != nil {
8082
h.logger.Info("Prompt tokenization failed", "error", err.Error())
81-
return h.blockCache.startRequest(requestID, make([]uint64, 0))
83+
return err
8284
}
8385

8486
// get block keys
@@ -90,7 +92,9 @@ func (h *KVCacheHelper) OnRequestStart(vllmReq openaiserverapi.CompletionRequest
9092
blockHashes[i] = key.ChunkHash
9193
}
9294

93-
return h.blockCache.startRequest(requestID, blockHashes)
95+
nExistingBlocks, err := h.blockCache.startRequest(requestID, blockHashes)
96+
vllmReq.SetNumberOfCachedPromptTokens(nExistingBlocks * h.blockSize)
97+
return err
9498
}
9599

96100
func (h *KVCacheHelper) OnRequestEnd(vllmReq openaiserverapi.CompletionRequest) error {

pkg/kv-cache/kv_cache_test.go

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ var _ = Describe("KV cache", Ordered, func() {
237237
var err error
238238
switch action.action {
239239
case actionStartRequest:
240-
err = blockCache.startRequest(action.request.id, action.request.blocks)
240+
_, err = blockCache.startRequest(action.request.id, action.request.blocks)
241241
case actionFinishRequest:
242242
err = blockCache.finishRequest(action.request.id)
243243
}
@@ -344,17 +344,21 @@ var _ = Describe("KV cache", Ordered, func() {
344344
req4 := testRequest{"req4", []uint64{5, 6}}
345345

346346
// blocks 1 and 2 stored
347-
err = blockCache.startRequest(req1.id, req1.blocks)
347+
alreadyInCache, err := blockCache.startRequest(req1.id, req1.blocks)
348348
Expect(err).NotTo(HaveOccurred())
349+
Expect(alreadyInCache).To(Equal(0))
349350
// blocks 3 and 4 stored
350-
err = blockCache.startRequest(req2.id, req2.blocks)
351+
alreadyInCache, err = blockCache.startRequest(req2.id, req2.blocks)
351352
Expect(err).NotTo(HaveOccurred())
353+
Expect(alreadyInCache).To(Equal(0))
352354
// no new blocks stored, reuse of 1 and 3
353-
err = blockCache.startRequest(req3.id, req3.blocks)
355+
alreadyInCache, err = blockCache.startRequest(req3.id, req3.blocks)
354356
Expect(err).NotTo(HaveOccurred())
357+
Expect(alreadyInCache).To(Equal(2))
355358
// no space left - should fail
356-
err = blockCache.startRequest(req4.id, req4.blocks)
359+
alreadyInCache, err = blockCache.startRequest(req4.id, req4.blocks)
357360
Expect(err).To(HaveOccurred())
361+
Expect(alreadyInCache).To(Equal(0))
358362

359363
err = blockCache.finishRequest(req1.id)
360364
Expect(err).NotTo(HaveOccurred())
@@ -363,8 +367,9 @@ var _ = Describe("KV cache", Ordered, func() {
363367
// now 2 and 4 are not in use
364368

365369
// blocks 2 and 4 should be removed, and 5 and 6 stored
366-
err = blockCache.startRequest(req4.id, req4.blocks)
370+
alreadyInCache, err = blockCache.startRequest(req4.id, req4.blocks)
367371
Expect(err).NotTo(HaveOccurred())
372+
Expect(alreadyInCache).To(Equal(0))
368373
}()
369374

370375
removedBlocks := make([]uint64, 0)
@@ -431,7 +436,7 @@ var _ = Describe("KV cache", Ordered, func() {
431436
reqID := fmt.Sprintf("req_%d_%d", id, j)
432437
blocks := createRandomArray(testCase.minBlockLen, testCase.maxBlockLen, testCase.maxHashValue)
433438

434-
err := blockCache.startRequest(reqID, blocks)
439+
_, err := blockCache.startRequest(reqID, blocks)
435440
if err != nil {
436441
// some operations may fail due to cache being full, which is expected
437442
Expect(err.Error()).To(Equal(capacityError))

pkg/llm-d-inference-sim/simulator.go

Lines changed: 21 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -382,7 +382,6 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
382382
if s.config.EnableKVCache && !isChatCompletion {
383383
err := s.kvcacheHelper.OnRequestEnd(vllmReq)
384384
if err != nil {
385-
// TODO should it be an error with http response error or just a warning?
386385
s.logger.Error(err, "kv cache failed to process request end")
387386
}
388387
}
@@ -391,8 +390,7 @@ func (s *VllmSimulator) handleCompletions(ctx *fasthttp.RequestCtx, isChatComple
391390
// kv cache is currently supported for /completion API only
392391
err = s.kvcacheHelper.OnRequestStart(vllmReq)
393392
if err != nil {
394-
// TODO should it be an error with http response error or just a warning?
395-
s.logger.Error(err, "kv cache failed to process request start")
393+
s.sendCompletionError(ctx, openaiserverapi.NewCompletionError(err.Error(), fasthttp.StatusInternalServerError, nil), false)
396394
}
397395
}
398396

@@ -490,28 +488,22 @@ func (s *VllmSimulator) reqProcessingWorker(ctx context.Context, id int) {
490488
}
491489
s.sendStreamingResponse(
492490
&streamingContext{
493-
ctx: reqCtx.HTTPReqCtx,
494-
isChatCompletion: reqCtx.IsChatCompletion,
495-
model: displayModel,
496-
doRemotePrefill: req.IsDoRemotePrefill(),
491+
ctx: reqCtx.HTTPReqCtx,
492+
isChatCompletion: reqCtx.IsChatCompletion,
493+
model: displayModel,
494+
doRemotePrefill: req.IsDoRemotePrefill(),
495+
nPromptTokens: usageData.PromptTokens,
496+
nCachedPromptTokens: reqCtx.CompletionReq.GetNumberOfCachedPromptTokens(),
497497
},
498-
usageData.PromptTokens, responseTokens, toolCalls, finishReason, usageDataToSend,
498+
responseTokens, toolCalls, finishReason, usageDataToSend,
499499
)
500500
} else {
501501
if req.IsDoRemoteDecode() {
502502
// in case this is prefill pod processing, return special finish reason
503503
finishReason = common.RemoteDecodeFinishReason
504504
}
505505

506-
s.sendResponse(reqCtx.IsChatCompletion,
507-
reqCtx.HTTPReqCtx,
508-
responseTokens,
509-
toolCalls,
510-
displayModel,
511-
finishReason,
512-
&usageData,
513-
req.IsDoRemoteDecode(),
514-
req.IsDoRemotePrefill())
506+
s.sendResponse(reqCtx, responseTokens, toolCalls, displayModel, finishReason, &usageData)
515507
}
516508
}
517509
reqCtx.Wg.Done()
@@ -628,17 +620,19 @@ func (s *VllmSimulator) createCompletionResponse(isChatCompletion bool, respToke
628620
}
629621

630622
// sendResponse sends response for completion API, supports both completions (text and chat)
631-
// according the value of isChatCompletion
623+
// according the value of isChatCompletion in reqCtx
632624
// respTokens - tokenized content to be sent in the response
633625
// toolCalls - tool calls to be sent in the response
634626
// modelName - display name returned to the client and used in metrics. It is either the first alias
635627
// from --served-model-name (for a base-model request) or the LoRA adapter name (for a LoRA request).
636628
// finishReason - a pointer to string that represents finish reason, can be nil, stop, length, or tools
637629
// usageData - usage (tokens statistics) for this response
638-
func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.RequestCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
639-
modelName string, finishReason string, usageData *openaiserverapi.Usage, doRemoteDecode bool, doRemotePrefill bool) {
640-
resp := s.createCompletionResponse(isChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName, doRemoteDecode)
630+
func (s *VllmSimulator) sendResponse(reqCtx *openaiserverapi.CompletionReqCtx, respTokens []string, toolCalls []openaiserverapi.ToolCall,
631+
modelName string, finishReason string, usageData *openaiserverapi.Usage) {
632+
resp := s.createCompletionResponse(reqCtx.IsChatCompletion, respTokens, toolCalls, &finishReason, usageData, modelName,
633+
reqCtx.CompletionReq.IsDoRemoteDecode())
641634

635+
ctx := reqCtx.HTTPReqCtx
642636
data, err := json.Marshal(resp)
643637
if err != nil {
644638
ctx.Error("Response body creation failed, "+err.Error(), fasthttp.StatusInternalServerError)
@@ -647,8 +641,10 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
647641

648642
// calculate how long to wait before returning the response, time is based on number of tokens
649643
nPromptTokens := usageData.PromptTokens
644+
nCachedPromptTokens := reqCtx.CompletionReq.GetNumberOfCachedPromptTokens()
650645
nGenTokens := usageData.CompletionTokens
651-
totalMillisToWait := s.getTimeToFirstToken(nPromptTokens, doRemotePrefill) + s.getTotalInterTokenLatency(nGenTokens)
646+
ttft := s.getTimeToFirstToken(nPromptTokens, nCachedPromptTokens, reqCtx.CompletionReq.IsDoRemotePrefill())
647+
totalMillisToWait := ttft + s.getTotalInterTokenLatency(nGenTokens)
652648
time.Sleep(time.Duration(totalMillisToWait) * time.Millisecond)
653649

654650
ctx.Response.Header.SetContentType("application/json")
@@ -666,7 +662,7 @@ func (s *VllmSimulator) sendResponse(isChatCompletion bool, ctx *fasthttp.Reques
666662
}
667663

668664
// returns time to first token based on the current request's doRemotePrefill
669-
func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill bool) int {
665+
func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, nCachedPromptTokens int, doRemotePrefill bool) int {
670666
if doRemotePrefill {
671667
if s.config.KVCacheTransferLatency == 0 && s.config.KVCacheTransferLatencyStdDev == 0 {
672668
// is disaggregated PD and ttft is calculated using number of prompt tokens
@@ -677,8 +673,8 @@ func (s *VllmSimulator) getTimeToFirstToken(nPromptTokens int, doRemotePrefill b
677673
return int(common.RandomNorm(float64(s.config.KVCacheTransferLatency), float64(s.config.KVCacheTransferLatencyStdDev)))
678674
}
679675
if s.config.TimeToFirstToken == 0 && s.config.TimeToFirstTokenStdDev == 0 {
680-
// is aggregated PD and ttft is calculated using number of prompt tokens
681-
prefillTime := s.config.PrefillOverhead + nPromptTokens*s.config.PrefillTimePerToken
676+
// is aggregated PD and ttft is calculated using number of prompt tokens that are not in kv cache
677+
prefillTime := s.config.PrefillOverhead + (nPromptTokens-nCachedPromptTokens)*s.config.PrefillTimePerToken
682678
return int(common.RandomNorm(float64(prefillTime), float64(s.config.PrefillTimeStdDev)))
683679
}
684680
// is aggregated PD and *not* using number of prompt tokens

pkg/llm-d-inference-sim/simulator_test.go

Lines changed: 45 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ var _ = Describe("Simulator", func() {
807807
simulator.config.TimeToFirstTokenStdDev = timeToFirstTokenStdDev
808808
simulator.config.KVCacheTransferLatency = kvCacheLatency
809809
simulator.config.KVCacheTransferLatencyStdDev = kvCacheLatencyStdDev
810-
timeToFirst := simulator.getTimeToFirstToken(1, doREmotePrefill)
810+
timeToFirst := simulator.getTimeToFirstToken(1, 0, doREmotePrefill)
811811
if doREmotePrefill {
812812
Expect(timeToFirst).To(BeNumerically(">=", int(float32(kvCacheLatency)*0.3)))
813813
Expect(timeToFirst).To(BeNumerically("<=", int(float32(kvCacheLatency)*1.7)))
@@ -838,7 +838,7 @@ var _ = Describe("Simulator", func() {
838838
simulator.config.PrefillTimePerToken = 200
839839
simulator.config.PrefillTimeStdDev = 80
840840

841-
ttft := simulator.getTimeToFirstToken(128, false)
841+
ttft := simulator.getTimeToFirstToken(128, 0, false)
842842

843843
Expect(ttft).To(BeNumerically("==", timeToFirstToken))
844844
})
@@ -851,33 +851,60 @@ var _ = Describe("Simulator", func() {
851851
simulator.config.PrefillTimePerToken = 200
852852
simulator.config.PrefillTimeStdDev = 80
853853

854-
ttft := simulator.getTimeToFirstToken(128, false)
854+
ttft := simulator.getTimeToFirstToken(128, 0, false)
855855
Expect(ttft).NotTo(BeNumerically("==", 0))
856856
})
857857

858-
DescribeTable("time to first token is against number of prompt tokens",
859-
func(prefillOverhead int, prefillTimePerToken int, stdDev int, nTokens int) {
858+
DescribeTable("time to first token is against number of prompt tokens with std",
859+
func(prefillOverhead int, prefillTimePerToken int, stdDev int, nTokens int, nCachedTokens int) {
860860
simulator.config.TimeToFirstToken = 0
861861
simulator.config.PrefillOverhead = prefillOverhead
862862
simulator.config.PrefillTimePerToken = prefillTimePerToken
863863
simulator.config.PrefillTimeStdDev = stdDev
864864

865-
ttft := simulator.getTimeToFirstToken(nTokens, false)
865+
ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
866866

867-
expectedTTFT := prefillOverhead + prefillTimePerToken*nTokens
867+
expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
868868
Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))
869869
Expect(ttft).To(BeNumerically("<=", int(float64(expectedTTFT)*1.7)))
870+
},
871+
func(prefillOverhead int, prefillTimePerToken, stdDev int, nTokens int, nCachedTokens int) string {
872+
return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d nCachedTokens: %d",
873+
prefillOverhead, prefillTimePerToken, stdDev, nTokens, nCachedTokens)
874+
},
875+
Entry("single token", 100, 50, 10, 1, 0),
876+
Entry("single token big std", 100, 50, 70, 1, 0),
877+
Entry("stddev is 0", 100, 50, 0, 1, 0),
878+
Entry("medium overhead, 512 tokens", 200, 1000, 150, 512, 0),
879+
Entry("large overhead, 1024 tokens", 2000, 3000, 800, 1024, 0),
880+
Entry("very long prompt", 150, 200, 70, 20000, 0),
881+
Entry("medium overhead, 512 tokens, 256 cached", 200, 1000, 150, 512, 256),
882+
Entry("large overhead, 1024 tokens, 1008 cached", 2000, 3000, 800, 1024, 1008),
883+
Entry("very long prompt, 1024 cached", 150, 200, 70, 20000, 1024),
884+
)
885+
886+
DescribeTable("time to first token is against number of prompt tokens",
887+
func(prefillOverhead int, prefillTimePerToken int, nTokens int, nCachedTokens int) {
888+
simulator.config.TimeToFirstToken = 0
889+
simulator.config.PrefillOverhead = prefillOverhead
890+
simulator.config.PrefillTimePerToken = prefillTimePerToken
891+
simulator.config.PrefillTimeStdDev = 0
870892

893+
ttft := simulator.getTimeToFirstToken(nTokens, nCachedTokens, false)
894+
expectedTTFT := prefillOverhead + prefillTimePerToken*(nTokens-nCachedTokens)
895+
Expect(ttft).To(Equal(expectedTTFT))
871896
},
872-
func(prefillOverhead int, prefillTimePerToken, stdDev int, nTokens int) string {
873-
return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, stdDev: %d, nTokens: %d",
874-
prefillOverhead, prefillTimePerToken, stdDev, nTokens)
897+
func(prefillOverhead int, prefillTimePerToken, nTokens int, nCachedTokens int) string {
898+
return fmt.Sprintf("prefillOverhead: %d, prefillTimePerToken: %d, nTokens: %d nCachedTokens: %d",
899+
prefillOverhead, prefillTimePerToken, nTokens, nCachedTokens)
875900
},
876-
Entry("single token", 100, 50, 70, 1),
877-
Entry("stddev is 0", 100, 50, 0, 1),
878-
Entry("medium overhead, 512 tokens", 200, 1000, 150, 512),
879-
Entry("large overhead, 1024 tokens", 2000, 3000, 1800, 1024),
880-
Entry("very long prompt", 150, 200, 100, 20000),
901+
Entry("single token", 100, 50, 1, 0),
902+
Entry("medium overhead, 512 tokens", 200, 1000, 512, 0),
903+
Entry("large overhead, 1024 tokens", 2000, 3000, 1024, 0),
904+
Entry("very long prompt", 150, 200, 20000, 0),
905+
Entry("medium overhead, 512 tokens, 256 cached", 200, 1000, 512, 256),
906+
Entry("large overhead, 1024 tokens, 128 cached", 2000, 3000, 1024, 128),
907+
Entry("very long prompt, 1024 cached", 150, 200, 20000, 1024),
881908
)
882909

883910
It("when <kv-cache-transfer-latency> not 0, ignore <kv-cache-transfer-overhead>", func() {
@@ -887,7 +914,7 @@ var _ = Describe("Simulator", func() {
887914
simulator.config.KVCacheTransferTimePerToken = 100
888915
simulator.config.KVCacheTransferTimeStdDev = 0
889916

890-
ttft := simulator.getTimeToFirstToken(128, true)
917+
ttft := simulator.getTimeToFirstToken(128, 0, true)
891918
Expect(ttft).To(BeNumerically("==", 200))
892919
})
893920

@@ -898,7 +925,7 @@ var _ = Describe("Simulator", func() {
898925
simulator.config.KVCacheTransferTimePerToken = 100
899926
simulator.config.KVCacheTransferTimeStdDev = 0
900927

901-
ttft := simulator.getTimeToFirstToken(128, true)
928+
ttft := simulator.getTimeToFirstToken(128, 0, true)
902929
Expect(ttft).To(BeNumerically("==", 12800))
903930
})
904931

@@ -909,7 +936,7 @@ var _ = Describe("Simulator", func() {
909936
simulator.config.KVCacheTransferTimePerToken = kvCacheTransTPT
910937
simulator.config.KVCacheTransferTimeStdDev = stddev
911938

912-
ttft := simulator.getTimeToFirstToken(nTokens, true)
939+
ttft := simulator.getTimeToFirstToken(nTokens, 0, true)
913940

914941
expectedTTFT := kvCacheTransTPT * nTokens
915942
Expect(ttft).To(BeNumerically(">=", int(float64(expectedTTFT)*0.3)))

0 commit comments

Comments
 (0)