
Commit ae122b1

[V1][Metrics] Implement max_num_generation_tokens, request_params_n, and request_params_max_tokens metrics (vllm-project#14055)
Signed-off-by: Mark McLoughlin <[email protected]>
1 parent 872db2b commit ae122b1

5 files changed: +111, -2 lines

tests/entrypoints/openai/test_metrics.py

Lines changed: 6 additions & 0 deletions
@@ -239,6 +239,12 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
         "vllm:request_generation_tokens_sum",
         "vllm:request_generation_tokens_bucket",
         "vllm:request_generation_tokens_count",
+        "vllm:request_params_n_sum",
+        "vllm:request_params_n_bucket",
+        "vllm:request_params_n_count",
+        "vllm:request_params_max_tokens_sum",
+        "vllm:request_params_max_tokens_bucket",
+        "vllm:request_params_max_tokens_count",
         "vllm:time_to_first_token_seconds_sum",
         "vllm:time_to_first_token_seconds_bucket",
         "vllm:time_to_first_token_seconds_count",

vllm/v1/engine/output_processor.py

Lines changed: 13 additions & 0 deletions
@@ -36,6 +36,7 @@ def __init__(
         prompt_token_ids: list[int],
         logprobs_processor: LogprobsProcessor,
         detokenizer: IncrementalDetokenizer,
+        max_tokens_param: Optional[int],
         arrival_time: float,
         queue: Optional[asyncio.Queue[RequestOutput]],
         log_stats: bool,
@@ -50,6 +51,7 @@ def __init__(
         self.prompt_len = len(prompt_token_ids)
         self.logprobs_processor = logprobs_processor
         self.detokenizer = detokenizer
+        self.max_tokens_param = max_tokens_param
         self.is_prefilling = True
         self.queue = queue

@@ -83,6 +85,8 @@ def from_new_request(
                 tokenizer=tokenizer,
                 request=request,
             ),
+            max_tokens_param=(request.sampling_params.max_tokens if
+                              request.sampling_params is not None else None),
             arrival_time=request.arrival_time,
             queue=queue,
             log_stats=log_stats,
@@ -198,6 +202,8 @@ def abort_requests(
             req_state = self.request_states.pop(request_id, None)
             if req_state is not None:
                 self.lora_states.abort_request(req_state)
+                if req_state.parent_req is not None:
+                    req_state.parent_req.finish_child_request(request_id)

     def add_request(
         self,
@@ -310,6 +316,8 @@ def process_outputs(
                 # If req not finished in EngineCore, but Detokenizer
                 # detected stop string, abort needed in EngineCore.
                 reqs_to_abort.append(req_id)
+                if req_state.parent_req is not None:
+                    req_state.parent_req.finish_child_request(req_id)

             # Track per-request stats
             self._update_stats_from_finished(req_state, finish_reason,
@@ -350,5 +358,10 @@ def _update_stats_from_finished(self, req_state: RequestState,
         iteration_stats.update_from_finished_request(
             finish_reason=finish_reason,
             num_prompt_tokens=len(req_state.prompt_token_ids),
+            max_tokens_param=req_state.max_tokens_param,
             req_stats=req_state.stats)
         self.lora_states.finish_request(req_state)
+
+        ParentRequest.observe_finished_request(
+            req_state.parent_req, iteration_stats,
+            req_state.stats.num_generation_tokens)

vllm/v1/engine/parallel_sampling.py

Lines changed: 37 additions & 2 deletions
@@ -6,6 +6,7 @@
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.v1.metrics.stats import IterationStats


 class ParentRequest:
@@ -18,9 +19,15 @@ class ParentRequest:
     request_id: str
     sampling_params: SamplingParams

+    # To track the completion of child requests
+    child_requests: set[str]
+
     # To aggregate child completions when not streaming
     output_aggregator: Optional[RequestOutput]

+    # To find the max number of generated tokens across all children
+    max_num_generation_tokens: int
+
     # To efficiently obtain child sampling params
     cached_child_sampling_params: Optional[SamplingParams]

@@ -29,7 +36,9 @@ def __init__(self, request_id: str,
         self.request_id = request_id
         self.sampling_params = sampling_params

+        self.child_requests = set()
         self.output_aggregator = None
+        self.max_num_generation_tokens = 0
         self.cached_child_sampling_params = None

     @classmethod
@@ -82,8 +91,12 @@ def get_child_info(self, index: int) -> tuple[str, SamplingParams]:
         Returns:
             (request ID, sampling_params) tuple
         """
-        return (f"{index}_{self.request_id}",
-                self._get_child_sampling_params(index))
+        child_req_id = f"{index}_{self.request_id}"
+        self.child_requests.add(child_req_id)
+        return (child_req_id, self._get_child_sampling_params(index))
+
+    def finish_child_request(self, req_id: str):
+        self.child_requests.remove(req_id)

     @property
     def n(self) -> int:
@@ -117,3 +130,25 @@ def make_request_output(
         request_output.outputs = sorted(request_output.outputs,
                                         key=lambda x: x.index)
         return request_output
+
+    def observe_num_generation_tokens(self, num_generation_tokens: int):
+        self.max_num_generation_tokens = max(num_generation_tokens,
+                                             self.max_num_generation_tokens)
+        return self.max_num_generation_tokens
+
+    @staticmethod
+    def observe_finished_request(parent_req: Optional['ParentRequest'],
+                                 iteration_stats: IterationStats,
+                                 num_generation_tokens: int):
+
+        n_param = parent_req.n if parent_req is not None else 1
+
+        if parent_req is not None:
+            num_generation_tokens = parent_req.observe_num_generation_tokens(
+                num_generation_tokens)
+
+        # Child requests finished, we can now record to iteration stats
+        if parent_req is None or not parent_req.child_requests:
+            iteration_stats.max_num_generation_tokens_iter.append(
+                num_generation_tokens)
+            iteration_stats.n_params_iter.append(n_param)
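
The new bookkeeping records to the iteration stats only once per parent request: each finishing child updates the running maximum via observe_num_generation_tokens, and the append happens only when child_requests has emptied. A standalone toy of that flow (plain Python stand-ins, not vLLM's classes):

# With n=3, each finishing child updates the running maximum, but the
# iteration stats receive a single (max, n) observation, when the last
# child finishes.
child_requests = {"0_req", "1_req", "2_req"}   # ids as produced by get_child_info
max_num_generation_tokens = 0
n_param = 3
recorded = []                                  # stand-in for the IterationStats lists

for child_id, num_tokens in [("0_req", 12), ("1_req", 40), ("2_req", 25)]:
    child_requests.remove(child_id)            # finish_child_request
    max_num_generation_tokens = max(max_num_generation_tokens, num_tokens)
    if not child_requests:                     # all children done
        recorded.append((max_num_generation_tokens, n_param))

print(recorded)                                # [(40, 3)]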

vllm/v1/metrics/loggers.py

Lines changed: 50 additions & 0 deletions
@@ -106,6 +106,9 @@ def __init__(self, vllm_config: VllmConfig):

         max_model_len = vllm_config.model_config.max_model_len

+        #
+        # Scheduler state
+        #
         self.gauge_scheduler_running = prometheus_client.Gauge(
             name="vllm:num_requests_running",
             documentation="Number of requests in model execution batches.",
@@ -116,6 +119,9 @@ def __init__(self, vllm_config: VllmConfig):
             documentation="Number of requests waiting to be processed.",
             labelnames=labelnames).labels(*labelvalues)

+        #
+        # GPU cache
+        #
         self.gauge_gpu_cache_usage = prometheus_client.Gauge(
             name="vllm:gpu_cache_usage_perc",
             documentation="GPU KV-cache usage. 1 means 100 percent usage.",
@@ -133,6 +139,9 @@ def __init__(self, vllm_config: VllmConfig):
             "GPU prefix cache hits, in terms of number of cached blocks.",
             labelnames=labelnames).labels(*labelvalues)

+        #
+        # Counters
+        #
         self.counter_num_preempted_reqs = prometheus_client.Counter(
             name="vllm:num_preemptions_total",
             documentation="Cumulative number of preemption from the engine.",
@@ -159,6 +168,9 @@ def __init__(self, vllm_config: VllmConfig):
                 reason] = counter_request_success_base.labels(*(labelvalues +
                                                                 [str(reason)]))

+        #
+        # Histograms of counts
+        #
         self.histogram_num_prompt_tokens_request = \
             prometheus_client.Histogram(
                 name="vllm:request_prompt_tokens",
@@ -180,6 +192,31 @@ def __init__(self, vllm_config: VllmConfig):
                 buckets=build_cudagraph_buckets(vllm_config),
                 labelnames=labelnames).labels(*labelvalues)

+        self.histogram_max_num_generation_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_max_num_generation_tokens",
+                documentation=
+                "Histogram of maximum number of requested generation tokens.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_n_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_params_n",
+                documentation="Histogram of the n request parameter.",
+                buckets=[1, 2, 5, 10, 20],
+                labelnames=labelnames).labels(*labelvalues)
+
+        self.histogram_max_tokens_request = \
+            prometheus_client.Histogram(
+                name="vllm:request_params_max_tokens",
+                documentation="Histogram of the max_tokens request parameter.",
+                buckets=build_1_2_5_buckets(max_model_len),
+                labelnames=labelnames).labels(*labelvalues)
+
+        #
+        # Histogram of timing intervals
+        #
         self.histogram_time_to_first_token = \
             prometheus_client.Histogram(
                 name="vllm:time_to_first_token_seconds",
@@ -239,6 +276,9 @@ def __init__(self, vllm_config: VllmConfig):
                 buckets=request_latency_buckets,
                 labelnames=labelnames).labels(*labelvalues)

+        #
+        # LoRA metrics
+        #
         self.gauge_lora_info: Optional[prometheus_client.Gauge] = None
         if vllm_config.lora_config is not None:
             self.labelname_max_lora = "max_lora"
@@ -255,6 +295,9 @@ def __init__(self, vllm_config: VllmConfig):
                 self.labelname_running_lora_adapters,
             ])

+        #
+        # Cache config info metric
+        #
         self.log_metrics_info("cache_config", vllm_config.cache_config)

     def log_metrics_info(self, type: str, config_obj: SupportsMetricsInfo):
@@ -296,6 +339,11 @@ def record(self, scheduler_stats: SchedulerStats,
             iteration_stats.num_prompt_tokens + \
             iteration_stats.num_generation_tokens)

+        for max_gen_tokens in iteration_stats.max_num_generation_tokens_iter:
+            self.histogram_max_num_generation_tokens_request.observe(
+                max_gen_tokens)
+        for n_param in iteration_stats.n_params_iter:
+            self.histogram_n_request.observe(n_param)
         for ttft in iteration_stats.time_to_first_tokens_iter:
             self.histogram_time_to_first_token.observe(ttft)
         for tpot in iteration_stats.time_per_output_tokens_iter:
@@ -317,6 +365,8 @@ def record(self, scheduler_stats: SchedulerStats,
                 finished_request.num_prompt_tokens)
             self.histogram_num_generation_tokens_request.observe(
                 finished_request.num_generation_tokens)
+            self.histogram_max_tokens_request.observe(
+                finished_request.max_tokens_param)

         if self.gauge_lora_info is not None:
             running_lora_adapters = \
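
The n histogram uses fixed buckets [1, 2, 5, 10, 20], while the two max-tokens histograms reuse build_1_2_5_buckets(max_model_len). A standalone sketch of the 1-2-5 ladder those buckets are assumed to follow (re-implemented here for illustration, not copied from vLLM):

# Assumed behaviour of build_1_2_5_buckets: a 1-2-5 mantissa ladder over
# increasing powers of ten, capped at max_value (here the model context length).
def one_two_five_buckets(max_value: int) -> list[int]:
    buckets, exponent = [], 0
    while True:
        for mantissa in (1, 2, 5):
            value = mantissa * 10**exponent
            if value > max_value:
                return buckets
            buckets.append(value)
        exponent += 1

print(one_two_five_buckets(4096))
# [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000]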

vllm/v1/metrics/stats.py

Lines changed: 5 additions & 0 deletions
@@ -66,6 +66,7 @@ class FinishedRequestStats:
     e2e_latency: float = 0.0
     num_prompt_tokens: int = 0
     num_generation_tokens: int = 0
+    max_tokens_param: Optional[int] = None
     queued_time: float = 0.0
     prefill_time: float = 0.0
     inference_time: float = 0.0
@@ -81,6 +82,8 @@ def __init__(self):
         self.num_prompt_tokens = 0
         self.num_preempted_reqs = 0
         self.finished_requests: list[FinishedRequestStats] = []
+        self.max_num_generation_tokens_iter: list[int] = []
+        self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
         self.time_per_output_tokens_iter: list[float] = []
         self.waiting_lora_adapters: dict[str, int] = {}
@@ -150,6 +153,7 @@ def update_from_events(self, req_id: str, events: list["EngineCoreEvent"],

     def update_from_finished_request(self, finish_reason: "FinishReason",
                                      num_prompt_tokens: int,
+                                     max_tokens_param: Optional[int],
                                      req_stats: RequestStateStats):
         e2e_latency = self._time_since(req_stats.arrival_time)

@@ -173,6 +177,7 @@ def update_from_finished_request(self, finish_reason: "FinishReason",
             e2e_latency=e2e_latency,
             num_prompt_tokens=num_prompt_tokens,
             num_generation_tokens=req_stats.num_generation_tokens,
+            max_tokens_param=max_tokens_param,
             queued_time=queued_time,
             prefill_time=prefill_time,
             inference_time=inference_time,
