Skip to content

Commit a35a8a8

Browse files
authored
[V1][Spec Decode] Avoid logging useless nan metrics (#16023)
Signed-off-by: Mark McLoughlin <[email protected]>
1 parent 4ef0bb1 commit a35a8a8

File tree

2 files changed

+28
-14
lines changed

2 files changed

+28
-14
lines changed

tests/v1/core/test_scheduler.py

Lines changed: 9 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -671,10 +671,7 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
671671
assert running_req.num_tokens_with_spec == 2 + len(spec_tokens[i])
672672

673673
# No draft or accepted tokens counted yet
674-
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None
675-
stats = engine_core_outputs.scheduler_stats.spec_decoding_stats
676-
assert stats.num_draft_tokens == 0
677-
assert stats.num_accepted_tokens == 0
674+
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is None
678675

679676
# Schedule the speculated tokens for validation
680677
output = scheduler.schedule()
@@ -702,7 +699,11 @@ def test_schedule_spec_decoding_stats(spec_tokens, output_tokens, expected):
702699
engine_core_outputs = scheduler.update_from_output(output,
703700
model_runner_output)
704701

705-
assert engine_core_outputs.scheduler_stats.spec_decoding_stats is not None
706-
stats = engine_core_outputs.scheduler_stats.spec_decoding_stats
707-
assert stats.num_draft_tokens == expected[0]
708-
assert stats.num_accepted_tokens == expected[1]
702+
scheduler_stats = engine_core_outputs.scheduler_stats
703+
if expected[0] == 0:
704+
assert scheduler_stats.spec_decoding_stats is None
705+
else:
706+
assert scheduler_stats.spec_decoding_stats is not None
707+
stats = scheduler_stats.spec_decoding_stats
708+
assert stats.num_draft_tokens == expected[0]
709+
assert stats.num_accepted_tokens == expected[1]

vllm/v1/core/sched/scheduler.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -553,11 +553,11 @@ def update_from_output(
553553
spec_token_ids = model_runner_output.spec_token_ids
554554
logprobs = model_runner_output.logprobs
555555
prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
556-
spec_decoding_stats = SpecDecodingStats() if self.log_stats else None
557556
num_scheduled_tokens = scheduler_output.num_scheduled_tokens
558557

559558
new_running: list[Request] = []
560559
outputs: list[EngineCoreOutput] = []
560+
spec_decoding_stats: Optional[SpecDecodingStats] = None
561561

562562
# NOTE(woosuk): As len(self.running) can be up to 1K or more, the below
563563
# loop can be a performance bottleneck. We should do our best to avoid
@@ -585,11 +585,10 @@ def update_from_output(
585585
num_tokens_rejected = (len(scheduled_spec_token_ids) + 1 -
586586
len(generated_token_ids))
587587
request.num_computed_tokens -= num_tokens_rejected
588-
589-
if spec_decoding_stats is not None:
590-
spec_decoding_stats.observe(
591-
num_draft_tokens=len(scheduled_spec_token_ids),
592-
num_accepted_tokens=len(generated_token_ids) - 1)
588+
spec_decoding_stats = self.make_spec_decoding_stats(
589+
spec_decoding_stats,
590+
num_draft_tokens=len(scheduled_spec_token_ids),
591+
num_accepted_tokens=len(generated_token_ids) - 1)
593592

594593
cached_encoder_input_ids = (
595594
self.encoder_cache_manager.get_cached_input_ids(request))
@@ -744,3 +743,17 @@ def make_stats(
744743
prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
745744
spec_decoding_stats=spec_decoding_stats,
746745
)
746+
747+
def make_spec_decoding_stats(
748+
self,
749+
spec_decoding_stats: Optional[SpecDecodingStats],
750+
num_draft_tokens: int,
751+
num_accepted_tokens: int,
752+
) -> Optional[SpecDecodingStats]:
753+
if not self.log_stats:
754+
return None
755+
if spec_decoding_stats is None:
756+
spec_decoding_stats = SpecDecodingStats()
757+
spec_decoding_stats.observe(num_draft_tokens=num_draft_tokens,
758+
num_accepted_tokens=num_accepted_tokens)
759+
return spec_decoding_stats

0 commit comments

Comments
 (0)