From 49b7217be6e8cd5726495edab32b5037629b1c76 Mon Sep 17 00:00:00 2001
From: chang-wenbin
Date: Sun, 30 Mar 2025 14:29:22 +0800
Subject: [PATCH 1/4] Optimize Qwen2.5-VL inference

---
 deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh            | 12 +++++++++---
 paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py |  5 +++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
index c09a0f0bb..bace67a82 100644
--- a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
+++ b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-export CUDA_VISIBLE_DEVICES=0
+
+export CUDA_VISIBLE_DEVICES=2
+export USE_FASTER_TOP_P_SAMPLING=1
+
 # fp16 high-performance inference
+
 python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
     --question "Describe this image." \
@@ -27,9 +31,11 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --block_attn True \
     --inference_model True \
     --mode dynamic \
+    --append_attn 1 \
     --dtype bfloat16 \
-    --benchmark True
-
+    --benchmark True
+
+
 #
 # weight-only int8 quantized inference
 # python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
diff --git a/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py
index fc84aa3ad..fd037a515 100644
--- a/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py
+++ b/paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py
@@ -622,6 +622,11 @@ def __init__(self, config, attn_implementation: str = "sdpa") -> None:
 
         self.mlp = Qwen2_5_VLMLP(config, bias=True)
 
+    @paddle.incubate.jit.inference(
+        save_model_dir="./tmp/qwen2_5_VL",
+        enable_new_ir=True,
+        cache_static_model=True,
+    )
     def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor:
         hidden_states = hidden_states + self.attn(
             self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
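The key change in patch 1 is the `paddle.incubate.jit.inference` decorator on the vision block's `forward`: on the first call it traces the dynamic-graph method into a static Paddle Inference program saved under `save_model_dir`, and with `cache_static_model=True` later runs load the cached program instead of re-tracing. A minimal sketch of the same pattern on a toy layer, assuming a recent paddlepaddle-gpu develop build as the README requires (the layer, shapes, and save path are illustrative, not from the patch):

```python
import paddle


class ToyBlock(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(64, 64)

    # Same decorator usage as the patch: the first call converts forward()
    # into a static inference model under save_model_dir; with
    # cache_static_model=True, subsequent runs reuse the saved model
    # instead of converting again.
    @paddle.incubate.jit.inference(
        save_model_dir="./tmp/toy_block",  # illustrative path
        enable_new_ir=True,
        cache_static_model=True,
    )
    def forward(self, x):
        return self.linear(x)


block = ToyBlock()
out = block(paddle.randn([1, 64], dtype="float32"))  # first call traces; later calls hit the cache
```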
From 8a1c01f7bf1b69ace891d8e17f700e7e7dcdf33c Mon Sep 17 00:00:00 2001
From: chang-wenbin
Date: Tue, 1 Apr 2025 20:14:09 +0800
Subject: [PATCH 2/4] Fix stream output bug

---
 deploy/qwen2_5_vl/README.md             | 8 +++++---
 deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh | 3 +++
 deploy/qwen2_vl/README.md               | 1 +
 deploy/qwen2_vl/scripts/qwen2_vl.sh     | 3 +++
 4 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/deploy/qwen2_5_vl/README.md b/deploy/qwen2_5_vl/README.md
index 24f2ca9be..144776c0e 100644
--- a/deploy/qwen2_5_vl/README.md
+++ b/deploy/qwen2_5_vl/README.md
@@ -14,8 +14,7 @@
 
 ## 2 Environment Setup
 
-1)
-[Install PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
+1) [Install PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
 - **python >= 3.10**
 - **paddlepaddle-gpu: the develop build is required**
 ```bash
@@ -61,6 +60,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --benchmark True
 ```
 
@@ -82,6 +82,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --quant_type "weight_only_int8" \
     --benchmark True
 ```
@@ -104,6 +105,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3" deploy/qwen2_5_vl/qwen2_5_v
     --mode dynamic \
     --append_attn 1 \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --benchmark True
 ```
@@ -136,7 +138,7 @@ sh deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
 
 | output_tokens_len | 128 tokens |
 | model | Paddle Inference wint8 | Paddle Inference| PyTorch | VLLM |
-| --------------------------- | --------------------- | --------------- | -------------- |-------------- | 
+| --------------------------- | --------------------- | --------------- | -------------- |-------------- |
 | Qwen/Qwen2.5-VL-3B-Instruct | 0.994 s | 1.247 s | 4.92 s | 1.39s |
 | Qwen/Qwen2.5-VL-7B-Instruct | 1.244 s | 1.768 s | 3.89 s | 1.92s |
 | Qwen/Qwen2.5-VL-72B-Instruct| - | 4.806 s | - | - |
diff --git a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
index bace67a82..55e4a9e97 100644
--- a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
+++ b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
@@ -33,6 +33,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --mode dynamic \
     --append_attn 1 \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --benchmark True
 
 
@@ -52,6 +53,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
+# --enable_stream_output False \
 # --quant_type "weight_only_int8" \
 # --benchmark True
 
@@ -73,4 +75,5 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 # --mode dynamic \
 # --append_attn 1 \
 # --dtype bfloat16 \
+# --enable_stream_output False \
 # --benchmark True
diff --git a/deploy/qwen2_vl/README.md b/deploy/qwen2_vl/README.md
index 57706b688..597ad6a21 100644
--- a/deploy/qwen2_vl/README.md
+++ b/deploy/qwen2_vl/README.md
@@ -50,6 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --benchmark True
 
 ### 3.2. High-performance inference with text & video input
diff --git a/deploy/qwen2_vl/scripts/qwen2_vl.sh b/deploy/qwen2_vl/scripts/qwen2_vl.sh
index 910bd66f5..216856dcc 100644
--- a/deploy/qwen2_vl/scripts/qwen2_vl.sh
+++ b/deploy/qwen2_vl/scripts/qwen2_vl.sh
@@ -29,6 +29,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
+    --enable_stream_output False \
     --benchmark True
 
 
@@ -48,6 +49,7 @@ python deploy/qwen2_vl/single_image_infer.py\
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
+# --enable_stream_output False \
 # --quant_type "weight_only_int8" \
 # --benchmark True
 
@@ -67,4 +69,5 @@ python deploy/qwen2_vl/single_image_infer.py\
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
+# --enable_stream_output False \
 # --benchmark True
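Patch 2 sidesteps the stream-output bug by passing `--enable_stream_output False` in every benchmark command, so the predictor returns the generated ids in one batch rather than streaming them through a side channel. As a rough illustration of the kind of branch such a flag typically gates (the predictor methods below are hypothetical, not PaddleNLP's actual API):

```python
from typing import List


def generate(predictor, inputs, enable_stream_output: bool) -> List[int]:
    """Hypothetical sketch of how a stream-output switch is usually wired."""
    if enable_stream_output:
        # Streaming path: tokens are consumed one by one as they arrive.
        # This is the path the benchmark commands now disable.
        tokens: List[int] = []
        for token_id in predictor.stream_generate(inputs):  # hypothetical method
            tokens.append(token_id)
        return tokens
    # Batched path: block until generation finishes, return all ids at once.
    return predictor.generate(inputs)  # hypothetical method
```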
From 3c8d7f857da5cd2765912c5dc588e7d184125bf9 Mon Sep 17 00:00:00 2001
From: chang-wenbin
Date: Tue, 1 Apr 2025 20:16:24 +0800
Subject: [PATCH 3/4] Update DeepSeek-VL2 inference deployment

---
 deploy/deepseek_vl2/deepseek_vl2_infer.py | 31 ++++++++++-------------
 deploy/deepseek_vl2/shell/run.sh          | 23 ++++++++++++++---
 2 files changed, 33 insertions(+), 21 deletions(-)

diff --git a/deploy/deepseek_vl2/deepseek_vl2_infer.py b/deploy/deepseek_vl2/deepseek_vl2_infer.py
index 62fc2600f..fc811179c 100644
--- a/deploy/deepseek_vl2/deepseek_vl2_infer.py
+++ b/deploy/deepseek_vl2/deepseek_vl2_infer.py
@@ -17,15 +17,13 @@
 from dataclasses import dataclass, field
 from typing import Dict, List
 
-import PIL.Image
-
 import numpy as np
 import paddle
+import PIL.Image
 from paddlenlp.generation import GenerationConfig
 from paddlenlp.trainer import PdArgumentParser
-from paddlenlp.transformers import AutoInferenceModelForCausalLM
+from paddlenlp.transformers import AutoInferenceModelForCausalLM, DeepseekTokenizerFast
 from paddlenlp.trl import llm_utils
-from paddlenlp.transformers import DeepseekTokenizerFast
 
 from paddlemix.models.deepseek_vl2 import DeepseekVLV2Config, DeepseekVLV2ForCausalLM
 from paddlemix.processors.deepseek_vl2_processing import DeepseekVLV2Processor
@@ -33,6 +31,7 @@
 sys.path.append("PaddleNLP/llm/predict")
 from predictor import ModelArgument, PredictorArgument
 
+
 def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
     """
 
@@ -75,6 +74,7 @@ class Mix_PredictorArgument(PredictorArgument):
 class Mix_ModelArgument(ModelArgument):
     pass
 
+
 def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     assert len(inputs_embeds.shape) == 3
     batch_size = inputs_embeds.shape[0]
@@ -101,7 +101,6 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     model_inputs["min_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.min_length, dtype="int64")
     model_inputs["max_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.max_length, dtype="int64")
-
     model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
     model_inputs["is_block_step"] = paddle.full(shape=[batch_size], fill_value=False, dtype="bool")
@@ -116,7 +115,7 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
         # for mla's absorption
         assert cache_v_shapes is None
         cache_kvs = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in cache_k_shapes]
-    
+
     model_inputs["cache_kvs"] = cache_kvs
 
     block_nums = arg_config.total_max_length // arg_config.block_size
@@ -127,7 +126,9 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
     model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1))
     model_inputs["seq_lens_decoder"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int32")
     model_inputs["step_idx"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64")
-    model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool").cpu()  # must at cpu place, paddlenlp_ops bug: update_inputs_v2
+    model_inputs["not_need_stop"] = paddle.full(
+        shape=[1], fill_value=True, dtype="bool"
+    ).cpu()  # must stay on CPU; paddlenlp_ops bug: update_inputs_v2
     model_inputs["stop_flags"] = paddle.full(shape=[batch_size, 1], fill_value=False, dtype="bool")
     model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=batch_size, dtype="int64")
     model_inputs["pre_ids"] = paddle.full(shape=[batch_size, arg_config.max_length], fill_value=-1, dtype="int64")
@@ -135,6 +136,7 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
 
     return model_inputs
 
+
 def run_model(predictor_args):
     conversation = [
         {
@@ -145,11 +147,8 @@ def run_model(predictor_args):
         {"role": "<|Assistant|>", "content": ""},
     ]
 
-
     pil_images = load_pil_images(conversation)
-    prepare_inputs = processor(
-        conversations=conversation, images=pil_images, force_batchify=True, system_prompt=""
-    )
+    prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True, system_prompt="")
     prepare_inputs.images = prepare_inputs.images.astype(predictor_args.dtype)
     with paddle.no_grad():
         inputs_embeds = vl_model.prepare_inputs_embeds(**prepare_inputs)
@@ -195,10 +194,8 @@
 
 # register llm config
 llm_config = config.language_config
-llm_config.architectures = ['DeepseekVLV2ForCausalLM']
-llm_config.rope_scaling = {
-    "factor": 1
-}
+llm_config.architectures = ["DeepseekVLV2ForCausalLM"]
+llm_config.rope_scaling = {"factor": 1}
 llm_config.rope_scaling_type = {}
 llm_config.qk_rope_head_dim = 64
 llm_config.rope_theta = 10000
@@ -239,7 +236,7 @@ def run_model(predictor_args):
     sumtime = 0.0
     times = repeat_times + warm_up
     for i in range(times):
-        print("run",i)
+        print("run", i)
         if i > 2:
             paddle.device.synchronize()
             starttime = datetime.datetime.now()
@@ -265,4 +262,4 @@
     else:
         generated_text = run_model(predictor_args)
 
-    print("Final output_text:\n", generated_text[0])
\ No newline at end of file
+    print("Final output_text:\n", generated_text[0])
diff --git a/deploy/deepseek_vl2/shell/run.sh b/deploy/deepseek_vl2/shell/run.sh
index e519eba86..fde5660ba 100644
--- a/deploy/deepseek_vl2/shell/run.sh
+++ b/deploy/deepseek_vl2/shell/run.sh
@@ -1,6 +1,21 @@
-export CUDA_VISIBLE_DEVICES=0
-export FLAGS_cascade_attention_max_partition_size=163840
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export CUDA_VISIBLE_DEVICES=1
 export FLAGS_mla_use_tensorcore=1
+export FLAGS_cascade_attention_max_partition_size=128
+export FLAGS_cascade_attention_deal_each_time=16
 
 python deploy/deepseek_vl2/deepseek_vl2_infer.py \
     --model_name_or_path deepseek-ai/deepseek-vl2-small \
@@ -16,5 +31,5 @@ python deploy/deepseek_vl2/deepseek_vl2_infer.py \
     --append_attn True \
     --mode dynamic \
     --dtype bfloat16 \
-    --mla_use_matrix_absorption \
-    --benchmark
\ No newline at end of file
+    --enable_stream_output False \
+    --benchmark
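In patch 3, `init_llm_model_inputs` sizes the block KV cache directly from the predictor config: `block_nums = total_max_length // block_size`, with one zero-initialized cache tensor per layer built from the shapes the model reports (`cache_k_shapes`). A short sketch of that arithmetic with made-up numbers; the shapes and values below are illustrative assumptions, only the sizing formula comes from the patch:

```python
import paddle

# Illustrative config values, not from the patch.
total_max_length = 4096  # maximum tokens the cache must be able to hold
block_size = 64          # tokens stored per cache block

# Same sizing formula as init_llm_model_inputs:
block_nums = total_max_length // block_size  # 4096 // 64 = 64 blocks

# One zero-filled cache tensor per layer. The shape here is a hypothetical
# stand-in for what cache_k_shapes would report for a real model.
num_layers, num_heads, head_dim = 4, 8, 128
cache_kvs = [
    paddle.zeros([block_nums, num_heads, block_size, head_dim], dtype="bfloat16")
    for _ in range(num_layers)
]
print(block_nums, cache_kvs[0].shape)  # 64 [64, 8, 64, 128]
```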
From 9254cb4b83d797face5750e2284547a37281d10b Mon Sep 17 00:00:00 2001
From: chang-wenbin
Date: Wed, 2 Apr 2025 16:15:00 +0800
Subject: [PATCH 4/4] Rename enable_stream_output to output_via_mq

---
 deploy/qwen2_5_vl/README.md             | 6 +++---
 deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh | 7 ++++---
 deploy/qwen2_vl/README.md               | 2 +-
 deploy/qwen2_vl/scripts/qwen2_vl.sh     | 7 ++++---
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/deploy/qwen2_5_vl/README.md b/deploy/qwen2_5_vl/README.md
index 144776c0e..a5635f201 100644
--- a/deploy/qwen2_5_vl/README.md
+++ b/deploy/qwen2_5_vl/README.md
@@ -60,7 +60,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --benchmark True
 ```
 
@@ -82,7 +82,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --quant_type "weight_only_int8" \
     --benchmark True
 ```
 
@@ -105,7 +105,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3" deploy/qwen2_5_vl/qwen2_5_v
     --mode dynamic \
     --append_attn 1 \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --benchmark True
 ```
 
diff --git a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
index 55e4a9e97..2c69fe4b5 100644
--- a/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
+++ b/deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+export PYTHONPATH=/root/paddlejob/workspace/env_run/output/changwenbin/PaddleMIX/PaddleNLP
 export CUDA_VISIBLE_DEVICES=2
 export USE_FASTER_TOP_P_SAMPLING=1
 
@@ -34,7 +35,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
     --mode dynamic \
     --append_attn 1 \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --benchmark True
 
@@ -54,7 +55,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
-# --enable_stream_output False \
+# --output_via_mq False \
 # --quant_type "weight_only_int8" \
 # --benchmark True
 
@@ -76,5 +77,5 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
 # --mode dynamic \
 # --append_attn 1 \
 # --dtype bfloat16 \
-# --enable_stream_output False \
+# --output_via_mq False \
 # --benchmark True
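The rename is mechanical: every `--enable_stream_output` becomes `--output_via_mq`, and the same substitution continues in the qwen2_vl files below. When a CLI flag is renamed like this, a deprecated alias can keep old scripts working during the transition; a hypothetical argparse sketch, not part of this patch or of PaddleNLP:

```python
import argparse

parser = argparse.ArgumentParser()
# New flag name, with the old one kept as a deprecated alias so existing
# scripts keep working. Purely illustrative; the patch itself only renames.
parser.add_argument(
    "--output_via_mq",
    "--enable_stream_output",  # deprecated alias
    dest="output_via_mq",
    type=lambda s: s.lower() in {"true", "1"},
    default=True,
)

args = parser.parse_args(["--output_via_mq", "False"])
print(args.output_via_mq)  # False
```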
diff --git a/deploy/qwen2_vl/README.md b/deploy/qwen2_vl/README.md
index 597ad6a21..8f98fe3a7 100644
--- a/deploy/qwen2_vl/README.md
+++ b/deploy/qwen2_vl/README.md
@@ -50,7 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --benchmark True
 
 ### 3.2. High-performance inference with text & video input
diff --git a/deploy/qwen2_vl/scripts/qwen2_vl.sh b/deploy/qwen2_vl/scripts/qwen2_vl.sh
index 216856dcc..8dd0b57ab 100644
--- a/deploy/qwen2_vl/scripts/qwen2_vl.sh
+++ b/deploy/qwen2_vl/scripts/qwen2_vl.sh
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+export PYTHONPATH=/root/paddlejob/workspace/env_run/output/changwenbin/PaddleMIX/PaddleNLP
 export CUDA_VISIBLE_DEVICES=0
 
 # fp16 high-performance inference
@@ -29,7 +30,7 @@ python deploy/qwen2_vl/single_image_infer.py\
     --inference_model True \
     --mode dynamic \
     --dtype bfloat16 \
-    --enable_stream_output False \
+    --output_via_mq False \
     --benchmark True
 
 
@@ -49,7 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
-# --enable_stream_output False \
+# --output_via_mq False \
 # --quant_type "weight_only_int8" \
 # --benchmark True
 
@@ -69,5 +70,5 @@ python deploy/qwen2_vl/single_image_infer.py\
 # --inference_model True \
 # --mode dynamic \
 # --dtype bfloat16 \
-# --enable_stream_output False \
+# --output_via_mq False \
 # --benchmark True
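A closing note on the runtime flags these patches tune in the shell scripts (`FLAGS_mla_use_tensorcore`, `FLAGS_cascade_attention_max_partition_size`, `FLAGS_cascade_attention_deal_each_time`): Paddle reads `FLAGS_*` values from the environment at startup, so they can also be set from Python before paddle initializes rather than in a wrapper script. A small sketch, assuming the same values as the updated run.sh:

```python
import os

# Must be set before paddle is imported/initialized, mirroring run.sh.
os.environ["FLAGS_mla_use_tensorcore"] = "1"
os.environ["FLAGS_cascade_attention_max_partition_size"] = "128"
os.environ["FLAGS_cascade_attention_deal_each_time"] = "16"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import paddle  # noqa: E402  (imported after the flags are in place)

print(paddle.device.get_device())
```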