[Bugfix] Qwen-vl output is inconsistent in speculative decoding (#10350)

skylee-01 · web-flow · commit 2ec88272881a · 2024-11-15T05:40:10.000Z
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
@@ -353,6 +353,7 @@ def _create_single_target_seq_group_metadata(
         seq_data = seq_group_metadata.seq_data[seq_id]
         prompt_token_ids = seq_data.prompt_token_ids_array
         new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids]
+        mrope_position_delta = seq_data.mrope_position_delta
 
         new_seq_data_dict = {
             target_seq_id:
@@ -368,6 +369,7 @@ def _create_single_target_seq_group_metadata(
         # the kv cache is filled by a previous batch in the batch expansion.
         for data in new_seq_data_dict.values():
             data.update_num_computed_tokens(data.get_len() - 1)
+            data.mrope_position_delta = mrope_position_delta
 
         return SequenceGroupMetadata(
             request_id=seq_group_metadata.request_id,