
Commit 6192bc9

Authored by pjgao
[Bugfix] fix tensor not same device in qwen2_5_vl_without_padding (vllm-project#2051)
Bugfix cherry-picked from v0.9.1-dev (vllm-project#2007).

### What this PR does / why we need it?

Minimal reproducing code:

```python
# test.py
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="Qwen2.5-VL-7B-Instruct", max_model_len=26240)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

```bash
export USE_OPTIMIZED_MODEL=0
python test.py
```

The exception is as follows:

```
[rank0]:   File "/home/xxx/vllm_ascend/models/qwen2_5_vl_without_padding.py", line 84, in forward
[rank0]:     q = torch_npu.npu_rotary_mul(q, cos, sin)
[rank0]:   File "/home/anaconda3/envs/xxx/lib/python3.10/site-packages/torch/_ops.py", line 1116, in __call__
[rank0]:     return self._op(*args, **(kwargs or {}))
[rank0]: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, npu:0 and cpu! (when checking argument for argument r1 in method wrapper__npu_rotary_mul)
```

In `AscendQwen2_5_VisionAttention_Without_Padding`, `torch_npu.npu_rotary_mul(q, cos, sin)` fails because `cos`/`sin` are on the CPU while `q` is on the NPU.

`qwen2_5_vl_without_padding.py` needs this bugfix because `AscendQwen2_5_VisionTransformer_Without_Padding.rot_pos_emb` in qwen2_5_vl_without_padding.py comes from vLLM, where `inv_freq` is created on the CPU:
https://github.com/vllm-project/vllm/blob/40d86ee412eeeca93e0c37432db6b96829cb64e2/vllm/model_executor/models/qwen2_5_vl.py#L482

```python
inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float, device='cpu') / dim))
```

`qwen2_5_vl.py` does not need the fix, because `AscendQwen2_5_VisionRotaryEmbedding` in qwen2_5_vl.py overrides the rotary embedding and creates `inv_freq` on the device:

```python
inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float) / dim))
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

CI passed with newly added and existing tests.

- vLLM version: v0.10.0
- vLLM main: vllm-project/vllm@18cc33d

Signed-off-by: pjgao <[email protected]>
Co-authored-by: pjgao <[email protected]>
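For context, here is a minimal, self-contained sketch (plain PyTorch, not the vLLM or vLLM-Ascend source; the class and parameter names are illustrative) of why the device of `inv_freq` matters: whatever device it is created on propagates to the cos/sin tables that `torch_npu.npu_rotary_mul` later consumes.

```python
import torch
import torch.nn as nn


class ToyVisionRotaryEmbedding(nn.Module):
    """Toy illustration of how inv_freq's device decides where the rotary
    cos/sin tables end up. Not the real vLLM/vLLM-Ascend class."""

    def __init__(self, dim: int, theta: float = 10000.0,
                 device: str | None = None) -> None:
        super().__init__()
        # device='cpu' mirrors the upstream vLLM construction; device=None
        # lets inv_freq follow the current default device (the Ascend fix).
        inv_freq = 1.0 / (theta**(
            torch.arange(0, dim, 2, dtype=torch.float, device=device) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)

    def forward(self, seqlen: int) -> torch.Tensor:
        # The outer product (and the cos/sin later derived from it) inherits
        # inv_freq's device, so a CPU-resident inv_freq yields CPU cos/sin
        # while q sits on the NPU -> the "same device" RuntimeError above.
        seq = torch.arange(seqlen, device=self.inv_freq.device,
                           dtype=self.inv_freq.dtype)
        return torch.outer(seq, self.inv_freq)


rotary = ToyVisionRotaryEmbedding(dim=40, device="cpu")
print(rotary(8).device)  # cpu -> would clash with an NPU-resident q
```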
1 parent: 72eceff

File tree

2 files changed: +12 additions, −1 deletion


tests/ut/models/test_qwen2_5_vl_without_padding.py

Lines changed: 7 additions & 1 deletion
@@ -231,6 +231,8 @@ def init_vision_transformer(
         vision_config.in_channels = 3
         vision_config.hidden_act = "gelu"
         vision_config.depth = 0
+        vision_config.hidden_size = 1280
+        vision_config.num_heads = 16
 
         mocker.patch("torch.nn.Module.__setattr__")
         mocker.patch("torch.nn.Module.__getattr__")
@@ -239,6 +241,10 @@ def init_vision_transformer(
             "vllm.model_executor.models.qwen2_5_vl.Qwen2_5_VisionTransformer.__init__",
             return_value=None,
         )
+        mocker_vision_rotary_embedding = mocker.patch(
+            "vllm_ascend.models.qwen2_5_vl.AscendQwen2_5_VisionRotaryEmbedding.__init__",
+            return_value=None,
+        )
         mocker.patch(
             "vllm_ascend.models.qwen2_5_vl_without_padding.AscendQwen2_5_VisionBlock_Without_Padding.__init__",
             return_value=None,
@@ -264,7 +270,7 @@ def init_vision_transformer(
         args, kwargs = mocker_vit.call_args
         assert args == (vision_config, norm_eps, None, "")
         assert not kwargs
-
+        mocker_vision_rotary_embedding.assert_called_once()
         return vision_transformer
 
     def test_init_vision_transformer(self, mocker: MockerFixture):
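The assertion added above follows a common pytest-mock pattern: patch a class's `__init__`, build the object under test, then check the patched constructor ran exactly once. A standalone sketch of that pattern (all names here are illustrative, not the real test classes; assumes `pytest` and `pytest-mock` are installed):

```python
from pytest_mock import MockerFixture


class RotaryEmbedding:
    def __init__(self, dim: int) -> None:
        self.dim = dim


class VisionTransformer:
    def __init__(self, hidden_size: int, num_heads: int) -> None:
        # Mirrors the wiring this PR adds: rotary dim is half the head dim.
        head_dim = hidden_size // num_heads
        self.rotary_pos_emb = RotaryEmbedding(head_dim // 2)


def test_rotary_embedding_is_constructed(mocker: MockerFixture) -> None:
    # Patch the constructor so no real initialization happens.
    mocked_init = mocker.patch(f"{__name__}.RotaryEmbedding.__init__",
                               return_value=None)
    VisionTransformer(hidden_size=1280, num_heads=16)
    mocked_init.assert_called_once()
```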

vllm_ascend/models/qwen2_5_vl_without_padding.py

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,8 @@
 from vllm.model_executor.models.utils import maybe_prefix
 from vllm.multimodal import MULTIMODAL_REGISTRY
 
+from vllm_ascend.models.qwen2_5_vl import AscendQwen2_5_VisionRotaryEmbedding
+
 
 class AscendQwen2_5_VisionAttention_Without_Padding(Qwen2_5_VisionAttention):
 
@@ -160,6 +162,9 @@ def __init__(
         super().__init__(vision_config, norm_eps, quant_config, prefix)
         norm_layer = partial(RMSNorm, eps=norm_eps)
         self.interleaved = interleaved
+        head_dim = self.hidden_size // self.num_heads
+        self.rotary_pos_emb = AscendQwen2_5_VisionRotaryEmbedding(head_dim //
+                                                                  2)
         self.patch_embed = AscendQwen2_5_VisionPatchEmbed_Without_Padding(
             patch_size=vision_config.patch_size,
             temporal_patch_size=vision_config.temporal_patch_size,
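To make the new wiring concrete, a quick worked example using the values the updated unit test configures (hidden_size=1280, num_heads=16):

```python
# Values taken from the updated unit test configuration.
hidden_size, num_heads = 1280, 16
head_dim = hidden_size // num_heads   # 80
rotary_dim = head_dim // 2            # 40, i.e. AscendQwen2_5_VisionRotaryEmbedding(40)
print(head_dim, rotary_dim)           # 80 40
```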
