27
27
from paddleformers .utils .log import logger
28
28
29
29
from fastdeploy .config import FDConfig
30
+ from fastdeploy .model_executor .forward_meta import ForwardMeta
30
31
from fastdeploy .model_executor .graph_optimization .decorator import (
31
32
support_graph_optimization ,
32
33
)
39
40
ModelRegistry ,
40
41
)
41
42
from fastdeploy .model_executor .models .qwen2 import Qwen2DecoderLayer
42
- from fastdeploy .platforms import current_platform
43
-
44
- if current_platform .is_cuda ():
45
- from fastdeploy .model_executor .ops .gpu import extract_text_token_output
46
-
47
- from fastdeploy .model_executor .forward_meta import ForwardMeta
48
43
49
44
50
45
@support_graph_optimization
@@ -108,31 +103,17 @@ def load_state_dict(self, state_dict):
108
103
logger .info (f"Start load layer { i } " )
109
104
self .layers [i ].load_state_dict (state_dict )
110
105
106
def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
    """Embed the flattened (padding-removed) token ids via the model's token embedding layer."""
    token_embeddings = self.embed_tokens(ids_remove_padding=ids_remove_padding)
    return token_embeddings
111
109
def forward (
112
110
self ,
111
+ input_embeddings : paddle .Tensor ,
113
112
ids_remove_padding : paddle .Tensor ,
114
113
image_features : Optional [paddle .Tensor ],
115
114
forward_meta : ForwardMeta ,
116
115
):
117
-
118
- hidden_states = self .embed_tokens (ids_remove_padding = ids_remove_padding )
119
-
120
- # -----------------------
121
- # 将 image_embeds 替换 input_embeds 里的 image video 占位符
122
- image_mask = ids_remove_padding == self .image_token_id
123
- image_token_num = image_mask .sum ()
124
-
125
- video_mask = ids_remove_padding == self .video_token_id
126
- video_token_num = video_mask .sum ()
127
-
128
- # 由于框架只有 image_features,所以目前不支持图片和视频混合
129
- # TODO(wangyafeng) 后续考虑支持传入 video_features
130
- if image_token_num > 0 :
131
- hidden_states [image_mask ] = image_features .cast (self ._dtype )
132
- if video_token_num > 0 :
133
- hidden_states [video_mask ] = image_features .cast (self ._dtype )
134
-
135
- # -----------------------
116
+ hidden_states = input_embeddings
136
117
137
118
residual = None
138
119
for i in range (self .num_layers ):
@@ -144,18 +125,6 @@ def forward(
144
125
145
126
hidden_states = hidden_states + residual
146
127
147
- # -----------------------
148
- max_seq_len , max_seq_len_index = paddle .topk (forward_meta .seq_lens_this_time , k = 1 )
149
- hidden_states = extract_text_token_output (
150
- max_seq_len ,
151
- max_seq_len_index .cast ("int32" ),
152
- image_token_num .cast ("int32" ),
153
- forward_meta .seq_lens_this_time ,
154
- forward_meta .cu_seqlens_q ,
155
- hidden_states .cast ("float32" ),
156
- ).cast (self ._dtype )
157
- # -----------------------
158
-
159
128
out = self .norm (hidden_states )
160
129
161
130
return out
@@ -183,6 +152,12 @@ def __init__(self, fd_config: FDConfig):
183
152
# ----------- language model -------------
184
153
self .model = Qwen2_5_VLModel (fd_config = fd_config )
185
154
155
+ # Persistent buffers for CUDA graphs.
156
+ self ._input_embeddings = paddle .zeros (
157
+ [fd_config .parallel_config .max_model_len , fd_config .model_config .hidden_size ],
158
+ dtype = fd_config .model_config .dtype ,
159
+ )
160
+
186
161
self .ori_vocab_size = fd_config .model_config .ori_vocab_size
187
162
188
163
self .lm_head = ParallelLMHead (
@@ -246,14 +221,42 @@ def empty_input_forward(self):
246
221
self .ernie .layers [i ].mlp .text_fused_moe (fake_hidden_states )
247
222
self .ernie .layers [i ].mlp .image_fused_moe (fake_hidden_states )
248
223
224
def get_input_embeddings(
    self,
    ids_remove_padding: paddle.Tensor,
    image_features: Optional[paddle.Tensor] = None,
) -> paddle.Tensor:
    """Build input embeddings for the language model, splicing visual
    features into the positions occupied by image/video placeholder tokens.

    Args:
        ids_remove_padding: Flattened token ids with padding removed.
        image_features: Projected visual features to write into the
            placeholder positions. May be ``None`` when the batch contains
            no image/video tokens.

    Returns:
        Token embeddings in which placeholder positions are overwritten by
        ``image_features`` cast to the language model dtype.

    Raises:
        ValueError: If image/video placeholder tokens are present but
            ``image_features`` is ``None`` (previously this surfaced as an
            opaque ``AttributeError`` on ``None.cast``).
    """
    input_embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)

    image_mask = ids_remove_padding == self.model.image_token_id
    image_token_num = image_mask.sum()

    video_mask = ids_remove_padding == self.model.video_token_id
    video_token_num = video_mask.sum()

    # The framework currently only carries image_features, so mixed
    # image + video inputs are not supported; video placeholders are also
    # filled from image_features.
    # TODO(wangyafeng): support a separate video_features input later.
    if (image_token_num > 0 or video_token_num > 0) and image_features is None:
        raise ValueError("image/video placeholder tokens present but image_features is None")
    if image_token_num > 0:
        input_embeddings[image_mask] = image_features.cast(self.model._dtype)
    if video_token_num > 0:
        input_embeddings[video_mask] = image_features.cast(self.model._dtype)

    return input_embeddings
+
249
247
def forward (
250
248
self ,
251
249
ids_remove_padding : paddle .Tensor ,
252
250
image_features : Optional [paddle .Tensor ],
253
251
forward_meta : ForwardMeta ,
254
252
):
253
+ input_embeddings = self .get_input_embeddings (
254
+ ids_remove_padding = ids_remove_padding , image_features = image_features
255
+ )
256
+ self ._input_embeddings .copy_ (input_embeddings , False )
255
257
256
258
hidden_states = self .model (
259
+ input_embeddings = self ._input_embeddings ,
257
260
ids_remove_padding = ids_remove_padding ,
258
261
image_features = image_features ,
259
262
forward_meta = forward_meta ,
0 commit comments