
Commit faa0275

[V1] Optimize the overhead of rewinding (#14905)
Signed-off-by: Woosuk Kwon <[email protected]>
1 parent: 8a5a9b7

File tree: 1 file changed, +5 −6 lines changed


vllm/v1/worker/gpu_model_runner.py

Lines changed: 5 additions & 6 deletions
@@ -1032,17 +1032,16 @@ def execute_model(
 
         # TODO(woosuk): The following loop can be slow since it iterates over
         # the requests one by one. Optimize.
-        for i, req_id in enumerate(self.input_batch.req_ids):
+        for i, generator in self.input_batch.generators.items():
+            req_id = self.input_batch.req_ids[i]
             req_state = self.requests[req_id]
             seq_len = (req_state.num_computed_tokens +
                        scheduler_output.num_scheduled_tokens[req_id])
             if seq_len < req_state.num_tokens:
-                # Ignore the sampled token.
+                # Ignore the sampled token for partial prefills.
                 # Rewind the generator state as if the token was not sampled.
-                generator = self.input_batch.generators.get(i)
-                if generator is not None:
-                    # This relies on cuda-specific torch-internal impl details
-                    generator.set_offset(generator.get_offset() - 4)
+                # This relies on cuda-specific torch-internal impl details
+                generator.set_offset(generator.get_offset() - 4)
 
         # NOTE: GPU -> CPU Sync happens here.
         # Move as many CPU operations as possible before this sync point.
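Why this helps, in brief: self.input_batch.generators only holds entries for requests that use seeded sampling, so it is typically far sparser than req_ids. The old loop enumerated every request in the batch and probed the dict with .get(i) each time; the new loop iterates the dict directly and visits only the requests that actually need a rewind. The sketch below illustrates the pattern; FakeGenerator is a hypothetical stand-in for the CUDA RNG generator (only its get_offset/set_offset mimic the torch API used above), and this is not vLLM code.

    # Minimal sketch of the access-pattern change in this commit.
    # FakeGenerator is a hypothetical stand-in for a CUDA generator;
    # only get_offset/set_offset mimic the real torch API.

    class FakeGenerator:
        def __init__(self) -> None:
            self._offset = 0

        def get_offset(self) -> int:
            return self._offset

        def set_offset(self, offset: int) -> None:
            self._offset = offset


    req_ids = [f"req-{i}" for i in range(1024)]
    # Only seeded requests carry a generator, so the dict is sparse.
    generators = {7: FakeGenerator(), 300: FakeGenerator()}

    # Before: enumerate every request and probe the dict each iteration.
    for i, req_id in enumerate(req_ids):
        generator = generators.get(i)
        if generator is not None:
            generator.set_offset(generator.get_offset() - 4)

    # After: touch only the requests that actually have a generator.
    for i, generator in generators.items():
        req_id = req_ids[i]
        generator.set_offset(generator.get_offset() - 4)

With 1024 requests and two seeded ones, the old loop performs 1024 dict probes while the new one performs two iterations. The rewind itself is unchanged: the generator offset is moved back by 4 so the state looks as if the token had never been sampled, which (per the in-code comment) relies on CUDA-specific torch-internal implementation details.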
