File tree Expand file tree Collapse file tree 1 file changed +5
-6
lines changed Expand file tree Collapse file tree 1 file changed +5
-6
lines changed Original file line number Diff line number Diff line change @@ -1032,17 +1032,16 @@ def execute_model(
1032
1032
1033
1033
# TODO(woosuk): The following loop can be slow since it iterates over
1034
1034
# the requests one by one. Optimize.
1035
- for i, req_id in enumerate(self.input_batch.req_ids):
1035
+ for i, generator in self.input_batch.generators.items():
1036
+ req_id = self.input_batch.req_ids[i]
1036
1037
req_state = self.requests[req_id]
1037
1038
seq_len = (req_state.num_computed_tokens +
1038
1039
scheduler_output.num_scheduled_tokens[req_id])
1039
1040
if seq_len < req_state.num_tokens:
1040
- # Ignore the sampled token.
1041
+ # Ignore the sampled token for partial prefills.
1041
1042
# Rewind the generator state as if the token was not sampled.
1042
- generator = self.input_batch.generators.get(i)
1043
- if generator is not None:
1044
- # This relies on CUDA-specific torch-internal impl details
1045
- generator.set_offset(generator.get_offset() - 4)
1043
+ # This relies on CUDA-specific torch-internal impl details
1044
+ generator.set_offset(generator.get_offset() - 4)
1046
1045
1047
1046
# NOTE: GPU -> CPU Sync happens here.
1048
1047
# Move as many CPU operations as possible before this sync point.
You can’t perform that action at this time.
0 commit comments