File tree (Expand / Collapse): 1 file changed, +5 −6 lines changed
lines changed Original file line number Diff line number Diff line change @@ -1032,17 +1032,16 @@ def execute_model(
1032 1032
1033 1033     # TODO(woosuk): The following loop can be slow since it iterates over
1034 1034     # the requests one by one. Optimize.
1035-     for i, req_id in enumerate(self.input_batch.req_ids):
1035+     for i, generator in self.input_batch.generators.items():
1036+         req_id = self.input_batch.req_ids[i]
1036 1037         req_state = self.requests[req_id]
1037 1038         seq_len = (req_state.num_computed_tokens +
1038 1039                    scheduler_output.num_scheduled_tokens[req_id])
1039 1040         if seq_len < req_state.num_tokens:
1040-             # Ignore the sampled token.
1041+             # Ignore the sampled token for partial prefills.
1041 1042             # Rewind the generator state as if the token was not sampled.
1042-             generator = self.input_batch.generators.get(i)
1043-             if generator is not None:
1044-                 # This relies on cuda-specific torch-internal impl details
1045-                 generator.set_offset(generator.get_offset() - 4)
1043+             # This relies on cuda-specific torch-internal impl details
1044+             generator.set_offset(generator.get_offset() - 4)
1046 1045
10471046 # NOTE: GPU -> CPU Sync happens here.
10481047 # Move as many CPU operations as possible before this sync point.
You can’t perform that action at this time.
0 commit comments