PaddlePaddle · chang-wenbin · Mar 30, 2025 · Apr 1, 2025 · Apr 1, 2025 · Apr 1, 2025
diff --git a/deploy/deepseek_vl2/deepseek_vl2_infer.py b/deploy/deepseek_vl2/deepseek_vl2_infer.py
@@ -17,22 +17,21 @@
 from dataclasses import dataclass, field
 from typing import Dict, List
 
-import PIL.Image
-
 import numpy as np
 import paddle
+import PIL.Image
 from paddlenlp.generation import GenerationConfig
 from paddlenlp.trainer import PdArgumentParser
-from paddlenlp.transformers import AutoInferenceModelForCausalLM
+from paddlenlp.transformers import AutoInferenceModelForCausalLM, DeepseekTokenizerFast
 from paddlenlp.trl import llm_utils
-from paddlenlp.transformers import DeepseekTokenizerFast
 
 from paddlemix.models.deepseek_vl2 import DeepseekVLV2Config, DeepseekVLV2ForCausalLM
 from paddlemix.processors.deepseek_vl2_processing import DeepseekVLV2Processor
 
 sys.path.append("PaddleNLP/llm/predict")
 from predictor import ModelArgument, PredictorArgument
 
+
 def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
     """
 
@@ -75,6 +74,7 @@ class Mix_PredictorArgument(PredictorArgument):
 class Mix_ModelArgument(ModelArgument):
     pass
 
+
 def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     assert len(inputs_embeds.shape) == 3
     batch_size = inputs_embeds.shape[0]
@@ -101,7 +101,6 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     model_inputs["min_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.min_length, dtype="int64")
     model_inputs["max_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.max_length, dtype="int64")
 
-
     model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
     model_inputs["is_block_step"] = paddle.full(shape=[batch_size], fill_value=False, dtype="bool")
 
@@ -116,7 +115,7 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
         # for mla's absorption
         assert cache_v_shapes is None
         cache_kvs = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in cache_k_shapes]
- 
+
     model_inputs["cache_kvs"] = cache_kvs
 
     block_nums = arg_config.total_max_length // arg_config.block_size
@@ -127,14 +126,17 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1))
     model_inputs["seq_lens_decoder"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int32")
     model_inputs["step_idx"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64")
-    model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool").cpu() # must at cpu place, paddlenlp_ops bug: update_inputs_v2
+    model_inputs["not_need_stop"] = paddle.full(
+        shape=[1], fill_value=True, dtype="bool"
+    ).cpu()  # must stay on CPU: works around a paddlenlp_ops bug in update_inputs_v2
     model_inputs["stop_flags"] = paddle.full(shape=[batch_size, 1], fill_value=False, dtype="bool")
     model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=batch_size, dtype="int64")
     model_inputs["pre_ids"] = paddle.full(shape=[batch_size, arg_config.max_length], fill_value=-1, dtype="int64")
     model_inputs["next_tokens"] = paddle.full(shape=[batch_size, 1], fill_value=-1, dtype="int64")
 
     return model_inputs
 
+
 def run_model(predictor_args):
     conversation = [
         {
@@ -145,11 +147,8 @@ def run_model(predictor_args):
         {"role": "<|Assistant|>", "content": ""},
     ]
 
-
     pil_images = load_pil_images(conversation)
-    prepare_inputs = processor(
-        conversations=conversation, images=pil_images, force_batchify=True, system_prompt=""
-    )
+    prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True, system_prompt="")
     prepare_inputs.images = prepare_inputs.images.astype(predictor_args.dtype)
     with paddle.no_grad():
         inputs_embeds = vl_model.prepare_inputs_embeds(**prepare_inputs)
@@ -195,10 +194,8 @@ def run_model(predictor_args):
 
 # register llm config
 llm_config = config.language_config
-llm_config.architectures = ['DeepseekVLV2ForCausalLM']
-llm_config.rope_scaling = {
-    "factor": 1
-}
+llm_config.architectures = ["DeepseekVLV2ForCausalLM"]
+llm_config.rope_scaling = {"factor": 1}
 llm_config.rope_scaling_type = {}
 llm_config.qk_rope_head_dim = 64
 llm_config.rope_theta = 10000
@@ -239,7 +236,7 @@ def run_model(predictor_args):
     sumtime = 0.0
     times = repeat_times + warm_up
     for i in range(times):
-        print("run",i)
+        print("run", i)
         if i > 2:
             paddle.device.synchronize()
             starttime = datetime.datetime.now()
@@ -265,4 +262,4 @@ def run_model(predictor_args):
 
 else:
     generated_text = run_model(predictor_args)
-    print("Final output_text:\n", generated_text[0])
+    print("Final output_text:\n", generated_text[0])
diff --git a/deploy/deepseek_vl2/shell/run.sh b/deploy/deepseek_vl2/shell/run.sh
@@ -1,6 +1,21 @@
-export CUDA_VISIBLE_DEVICES=0
-export FLAGS_cascade_attention_max_partition_size=163840
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"  # honor the caller's GPU selection; default to device 0
 export FLAGS_mla_use_tensorcore=1
+export FLAGS_cascade_attention_max_partition_size=128
+export FLAGS_cascade_attention_deal_each_time=16
 
 python deploy/deepseek_vl2/deepseek_vl2_infer.py \
     --model_name_or_path deepseek-ai/deepseek-vl2-small \
@@ -16,5 +31,5 @@ python deploy/deepseek_vl2/deepseek_vl2_infer.py \
     --append_attn True \
     --mode dynamic \
     --dtype bfloat16 \
-    --mla_use_matrix_absorption \
-    --benchmark
+    --enable_stream_output False \
+    --benchmark
diff --git a/deploy/qwen2_5_vl/README.md b/deploy/qwen2_5_vl/README.md
@@ -14,8 +14,7 @@
 
 
 ## 2 环境准备
-1）
-[安装PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
+1） [安装PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
 - **python >= 3.10**
 - **paddlepaddle-gpu 要求develop版本**
 ```bash
@@ -61,6 +60,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --output_via_mq False \
     --benchmark True
 ```
 
@@ -82,6 +82,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --output_via_mq False \
     --quant_type "weight_only_int8" \
     --benchmark True
 ```
@@ -104,6 +105,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3" deploy/qwen2_5_vl/qwen2_5_v
     --mode dynamic \
     --append_attn 1 \
     --dtype bfloat16 \
+    --output_via_mq False \
     --benchmark True
 ```
 
@@ -136,7 +138,7 @@ sh deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
 |  output_tokens_len |  128 tokens    |
 
 |             model           | Paddle Inference wint8 | Paddle Inference|    PyTorch     | VLLM     |
-| --------------------------- | ---------------------  | --------------- | -------------- |-------------- | 
+| --------------------------- | ---------------------  | --------------- | -------------- |-------------- |
 | Qwen/Qwen2.5-VL-3B-Instruct |          0.994 s       |     1.247 s     |      4.92 s    | 1.39s     |
 | Qwen/Qwen2.5-VL-7B-Instruct |          1.244 s       |     1.768 s     |      3.89 s    | 1.92s     |
 | Qwen/Qwen2.5-VL-72B-Instruct|             -          |     4.806 s     |        -       | -        |
diff --git a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
@@ -12,8 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-export CUDA_VISIBLE_DEVICES=0
+export PYTHONPATH="$(pwd)/PaddleNLP${PYTHONPATH:+:${PYTHONPATH}}"  # PaddleNLP checkout under the repo root; run this script from there
+
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"  # honor the caller's GPU selection; default to device 0
+export USE_FASTER_TOP_P_SAMPLING=1
+
 #fp16  高性能推理
+
 python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
     --question "Describe this image." \
@@ -27,9 +32,12 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --block_attn True \
     --inference_model True \
     --mode dynamic \
+    --append_attn 1 \
     --dtype bfloat16 \
-    --benchmark True 
-
+    --output_via_mq False \
+    --benchmark True
+
+
 
 # # weight only int8 量化推理
 # python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
@@ -46,6 +54,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 #     --inference_model True \
 #     --mode dynamic \
 #     --dtype bfloat16 \
+#     --output_via_mq False \
 #     --quant_type "weight_only_int8" \
 #     --benchmark True 
 
@@ -67,4 +76,5 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 #     --mode dynamic \
 #     --append_attn 1 \
 #     --dtype bfloat16 \
+#     --output_via_mq False \
 #     --benchmark True 
diff --git a/deploy/qwen2_vl/README.md b/deploy/qwen2_vl/README.md
@@ -50,6 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --output_via_mq False \
     --benchmark True
 
 ### 3.2. 文本&视频输入高性能推理

diff --git a/deploy/qwen2_vl/scripts/qwen2_vl.sh b/deploy/qwen2_vl/scripts/qwen2_vl.sh
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+export PYTHONPATH="$(pwd)/PaddleNLP${PYTHONPATH:+:${PYTHONPATH}}"  # PaddleNLP checkout under the repo root; run this script from there
 
 export CUDA_VISIBLE_DEVICES=0
 #fp16  高性能推理
@@ -29,6 +30,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --output_via_mq False \
     --benchmark True 
 
 
@@ -48,6 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
 #     --inference_model True \
 #     --mode dynamic \
 #     --dtype bfloat16 \
+#     --output_via_mq False \
 #     --quant_type "weight_only_int8" \
 #     --benchmark True
 
@@ -67,4 +70,5 @@ python deploy/qwen2_vl/single_image_infer.py\
 #     --inference_model True \
 #     --mode dynamic \
 #     --dtype bfloat16 \
+#     --output_via_mq False \
 #     --benchmark True 
diff --git a/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -622,6 +622,11 @@ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
 
         self.mlp = Qwen2_5_VLMLP(config, bias=True)
 
+    @paddle.incubate.jit.inference(
+        save_model_dir="./tmp/qwen2_5_VL",
+        enable_new_ir=True,
+        cache_static_model=True,
+    )
     def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb