Skip to content

Commit 5ff10c8

Browse files
authored
[Model] Qwen2.5VL support --use-cudagraph and unit testing (#4087)
* [BugFix] qwen2.5vl enable_thinking=true and image_patch_id bug fix * [Docs] offline infer add apply_chat_template add_generation_prompt parameter * [Model] qwen2.5VL support --use-cudagraph * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v2 * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v3 * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v4 * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v5 * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v6 * [Model] qwen2.5VL support --use-cudagraph buffer and qwenvl test v7
1 parent 18f4977 commit 5ff10c8

File tree

5 files changed

+1048
-39
lines changed

5 files changed

+1048
-39
lines changed

docs/offline_inference.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ messages = [
107107
}
108108
]
109109

110-
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
110+
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
111111
images, videos = [], []
112112
for message in messages:
113113
content = message["content"]

docs/zh/offline_inference.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ messages = [
107107
}
108108
]
109109

110-
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
110+
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
111111
images, videos = [], []
112112
for message in messages:
113113
content = message["content"]

fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py

Lines changed: 40 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from paddleformers.utils.log import logger
2828

2929
from fastdeploy.config import FDConfig
30+
from fastdeploy.model_executor.forward_meta import ForwardMeta
3031
from fastdeploy.model_executor.graph_optimization.decorator import (
3132
support_graph_optimization,
3233
)
@@ -39,12 +40,6 @@
3940
ModelRegistry,
4041
)
4142
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer
42-
from fastdeploy.platforms import current_platform
43-
44-
if current_platform.is_cuda():
45-
from fastdeploy.model_executor.ops.gpu import extract_text_token_output
46-
47-
from fastdeploy.model_executor.forward_meta import ForwardMeta
4843

4944

5045
@support_graph_optimization
@@ -108,31 +103,17 @@ def load_state_dict(self, state_dict):
108103
logger.info(f"Start load layer {i}")
109104
self.layers[i].load_state_dict(state_dict)
110105

106+
def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
107+
return self.embed_tokens(ids_remove_padding=ids_remove_padding)
108+
111109
def forward(
112110
self,
111+
input_embeddings: paddle.Tensor,
113112
ids_remove_padding: paddle.Tensor,
114113
image_features: Optional[paddle.Tensor],
115114
forward_meta: ForwardMeta,
116115
):
117-
118-
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
119-
120-
# -----------------------
121-
# 将 image_embeds 替换 input_embeds 里的 image video 占位符
122-
image_mask = ids_remove_padding == self.image_token_id
123-
image_token_num = image_mask.sum()
124-
125-
video_mask = ids_remove_padding == self.video_token_id
126-
video_token_num = video_mask.sum()
127-
128-
# 由于框架只有 image_features,所以目前不支持图片和视频混合
129-
# TODO(wangyafeng) 后续考虑支持传入 video_features
130-
if image_token_num > 0:
131-
hidden_states[image_mask] = image_features.cast(self._dtype)
132-
if video_token_num > 0:
133-
hidden_states[video_mask] = image_features.cast(self._dtype)
134-
135-
# -----------------------
116+
hidden_states = input_embeddings
136117

137118
residual = None
138119
for i in range(self.num_layers):
@@ -144,18 +125,6 @@ def forward(
144125

145126
hidden_states = hidden_states + residual
146127

147-
# -----------------------
148-
max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
149-
hidden_states = extract_text_token_output(
150-
max_seq_len,
151-
max_seq_len_index.cast("int32"),
152-
image_token_num.cast("int32"),
153-
forward_meta.seq_lens_this_time,
154-
forward_meta.cu_seqlens_q,
155-
hidden_states.cast("float32"),
156-
).cast(self._dtype)
157-
# -----------------------
158-
159128
out = self.norm(hidden_states)
160129

161130
return out
@@ -183,6 +152,12 @@ def __init__(self, fd_config: FDConfig):
183152
# ----------- language model -------------
184153
self.model = Qwen2_5_VLModel(fd_config=fd_config)
185154

155+
# Persistent buffers for CUDA graphs.
156+
self._input_embeddings = paddle.zeros(
157+
[fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
158+
dtype=fd_config.model_config.dtype,
159+
)
160+
186161
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
187162

188163
self.lm_head = ParallelLMHead(
@@ -246,14 +221,42 @@ def empty_input_forward(self):
246221
self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
247222
self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)
248223

224+
def get_input_embeddings(
225+
self,
226+
ids_remove_padding: paddle.Tensor,
227+
image_features: Optional[paddle.Tensor] = None,
228+
) -> paddle.Tensor:
229+
230+
input_embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)
231+
232+
image_mask = ids_remove_padding == self.model.image_token_id
233+
image_token_num = image_mask.sum()
234+
235+
video_mask = ids_remove_padding == self.model.video_token_id
236+
video_token_num = video_mask.sum()
237+
238+
# 由于框架只有 image_features,所以目前不支持图片和视频混合
239+
# TODO(wangyafeng) 后续考虑支持传入 video_features
240+
if image_token_num > 0:
241+
input_embeddings[image_mask] = image_features.cast(self.model._dtype)
242+
if video_token_num > 0:
243+
input_embeddings[video_mask] = image_features.cast(self.model._dtype)
244+
245+
return input_embeddings
246+
249247
def forward(
250248
self,
251249
ids_remove_padding: paddle.Tensor,
252250
image_features: Optional[paddle.Tensor],
253251
forward_meta: ForwardMeta,
254252
):
253+
input_embeddings = self.get_input_embeddings(
254+
ids_remove_padding=ids_remove_padding, image_features=image_features
255+
)
256+
self._input_embeddings.copy_(input_embeddings, False)
255257

256258
hidden_states = self.model(
259+
input_embeddings=self._input_embeddings,
257260
ids_remove_padding=ids_remove_padding,
258261
image_features=image_features,
259262
forward_meta=forward_meta,

0 commit comments

Comments
 (0)