Skip to content

Commit 302f3ac

Browse files
authored
[v1][KVCacheManager] Change prefix caching metric from counting blocks to counting tokens (#18003)
Signed-off-by: Chen Zhang <[email protected]>
1 parent e9c730c commit 302f3ac

File tree

3 files changed

+9
-9
lines changed

3 files changed

+9
-9
lines changed

vllm/v1/core/kv_cache_manager.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -161,22 +161,22 @@ def get_computed_blocks(self,
161161

162162
computed_blocks = (
163163
self.single_type_manager.find_longest_cache_hit(block_hashes))
164+
# NOTE(woosuk): Since incomplete blocks are not eligible for
165+
# sharing, `num_computed_tokens` is always a multiple of
166+
# `block_size`.
167+
num_computed_tokens = len(computed_blocks) * self.block_size
164168

165169
if self.log_stats:
166170
assert self.prefix_cache_stats is not None
167-
self.prefix_cache_stats.queries += len(block_hashes)
168-
self.prefix_cache_stats.hits += len(computed_blocks)
171+
self.prefix_cache_stats.queries += request.num_tokens
172+
self.prefix_cache_stats.hits += num_computed_tokens
169173

170174
if last_block_hash is not None:
171175
# Add back the last block hash if it was removed.
172176
# NOTE: Because block_hashes is cached in req_to_block_hashes,
173177
# we shouldn't modify it directly.
174178
block_hashes.append(last_block_hash)
175179

176-
# NOTE(woosuk): Since incomplete blocks are not eligible for
177-
# sharing, `num_computed_tokens` is always a multiple of
178-
# `block_size`.
179-
num_computed_tokens = len(computed_blocks) * self.block_size
180180
return KVCacheBlocks(computed_blocks), num_computed_tokens
181181

182182
def allocate_slots(

vllm/v1/metrics/loggers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,13 +183,13 @@ def __init__(self, vllm_config: VllmConfig, engine_index: int = 0):
183183
self.counter_gpu_prefix_cache_queries = prometheus_client.Counter(
184184
name="vllm:gpu_prefix_cache_queries",
185185
documentation=
186-
"GPU prefix cache queries, in terms of number of queried blocks.",
186+
"GPU prefix cache queries, in terms of number of queried tokens.",
187187
labelnames=labelnames).labels(*labelvalues)
188188

189189
self.counter_gpu_prefix_cache_hits = prometheus_client.Counter(
190190
name="vllm:gpu_prefix_cache_hits",
191191
documentation=
192-
"GPU prefix cache hits, in terms of number of cached blocks.",
192+
"GPU prefix cache hits, in terms of number of cached tokens.",
193193
labelnames=labelnames).labels(*labelvalues)
194194

195195
#

vllm/v1/metrics/stats.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class PrefixCacheStats:
1919
# The number of requests in this update.
2020
requests: int = 0
2121
# The number of queries in these requests. Note that "queries" here
22-
# means the number of blocks that were queried from the cache.
22+
# means the number of tokens that were queried from the cache.
2323
queries: int = 0
2424
# The number of hits in these requests.
2525
hits: int = 0

0 commit comments

Comments (0)