Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 14 additions & 17 deletions deploy/deepseek_vl2/deepseek_vl2_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,21 @@
from dataclasses import dataclass, field
from typing import Dict, List

import PIL.Image

import numpy as np
import paddle
import PIL.Image
from paddlenlp.generation import GenerationConfig
from paddlenlp.trainer import PdArgumentParser
from paddlenlp.transformers import AutoInferenceModelForCausalLM
from paddlenlp.transformers import AutoInferenceModelForCausalLM, DeepseekTokenizerFast
from paddlenlp.trl import llm_utils
from paddlenlp.transformers import DeepseekTokenizerFast

from paddlemix.models.deepseek_vl2 import DeepseekVLV2Config, DeepseekVLV2ForCausalLM
from paddlemix.processors.deepseek_vl2_processing import DeepseekVLV2Processor

sys.path.append("PaddleNLP/llm/predict")
from predictor import ModelArgument, PredictorArgument


def load_pil_images(conversations: List[Dict[str, str]]) -> List[PIL.Image.Image]:
"""

Expand Down Expand Up @@ -75,6 +74,7 @@ class Mix_PredictorArgument(PredictorArgument):
class Mix_ModelArgument(ModelArgument):
pass


def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
assert len(inputs_embeds.shape) == 3
batch_size = inputs_embeds.shape[0]
Expand All @@ -101,7 +101,6 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
model_inputs["min_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.min_length, dtype="int64")
model_inputs["max_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.max_length, dtype="int64")


model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64")
model_inputs["is_block_step"] = paddle.full(shape=[batch_size], fill_value=False, dtype="bool")

Expand All @@ -116,7 +115,7 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
# for mla's absorption
assert cache_v_shapes is None
cache_kvs = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in cache_k_shapes]

model_inputs["cache_kvs"] = cache_kvs

block_nums = arg_config.total_max_length // arg_config.block_size
Expand All @@ -127,14 +126,17 @@ def init_llm_model_inputs(inputs_embeds, arg_config: Mix_PredictorArgument):
model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1))
model_inputs["seq_lens_decoder"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int32")
model_inputs["step_idx"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64")
model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool").cpu() # must at cpu place, paddlenlp_ops bug: update_inputs_v2
model_inputs["not_need_stop"] = paddle.full(
shape=[1], fill_value=True, dtype="bool"
).cpu() # must at cpu place, paddlenlp_ops bug: update_inputs_v2
model_inputs["stop_flags"] = paddle.full(shape=[batch_size, 1], fill_value=False, dtype="bool")
model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=batch_size, dtype="int64")
model_inputs["pre_ids"] = paddle.full(shape=[batch_size, arg_config.max_length], fill_value=-1, dtype="int64")
model_inputs["next_tokens"] = paddle.full(shape=[batch_size, 1], fill_value=-1, dtype="int64")

return model_inputs


def run_model(predictor_args):
conversation = [
{
Expand All @@ -145,11 +147,8 @@ def run_model(predictor_args):
{"role": "<|Assistant|>", "content": ""},
]


pil_images = load_pil_images(conversation)
prepare_inputs = processor(
conversations=conversation, images=pil_images, force_batchify=True, system_prompt=""
)
prepare_inputs = processor(conversations=conversation, images=pil_images, force_batchify=True, system_prompt="")
prepare_inputs.images = prepare_inputs.images.astype(predictor_args.dtype)
with paddle.no_grad():
inputs_embeds = vl_model.prepare_inputs_embeds(**prepare_inputs)
Expand Down Expand Up @@ -195,10 +194,8 @@ def run_model(predictor_args):

# register llm config
llm_config = config.language_config
llm_config.architectures = ['DeepseekVLV2ForCausalLM']
llm_config.rope_scaling = {
"factor": 1
}
llm_config.architectures = ["DeepseekVLV2ForCausalLM"]
llm_config.rope_scaling = {"factor": 1}
llm_config.rope_scaling_type = {}
llm_config.qk_rope_head_dim = 64
llm_config.rope_theta = 10000
Expand Down Expand Up @@ -239,7 +236,7 @@ def run_model(predictor_args):
sumtime = 0.0
times = repeat_times + warm_up
for i in range(times):
print("run",i)
print("run", i)
if i > 2:
paddle.device.synchronize()
starttime = datetime.datetime.now()
Expand All @@ -265,4 +262,4 @@ def run_model(predictor_args):

else:
generated_text = run_model(predictor_args)
print("Final output_text:\n", generated_text[0])
print("Final output_text:\n", generated_text[0])
23 changes: 19 additions & 4 deletions deploy/deepseek_vl2/shell/run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
export CUDA_VISIBLE_DEVICES=0
export FLAGS_cascade_attention_max_partition_size=163840
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export CUDA_VISIBLE_DEVICES=1
export FLAGS_mla_use_tensorcore=1
export FLAGS_cascade_attention_max_partition_size=128
export FLAGS_cascade_attention_deal_each_time=16

python deploy/deepseek_vl2/deepseek_vl2_infer.py \
--model_name_or_path deepseek-ai/deepseek-vl2-small \
Expand All @@ -16,5 +31,5 @@ python deploy/deepseek_vl2/deepseek_vl2_infer.py \
--append_attn True \
--mode dynamic \
--dtype bfloat16 \
--mla_use_matrix_absorption \
--benchmark
--enable_stream_output False \
--benchmark
8 changes: 5 additions & 3 deletions deploy/qwen2_5_vl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@


## 2 环境准备
1)
[安装PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
1) [安装PaddlePaddle](https://github.com/PaddlePaddle/PaddleMIX?tab=readme-ov-file#3-%EF%B8%8F%E5%AE%89%E8%A3%85paddlepaddle)
- **python >= 3.10**
- **paddlepaddle-gpu 要求develop版本**
```bash
Expand Down Expand Up @@ -61,6 +60,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
--inference_model True \
--mode dynamic \
--dtype bfloat16 \
--output_via_mq False \
--benchmark True
```

Expand All @@ -82,6 +82,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
--inference_model True \
--mode dynamic \
--dtype bfloat16 \
--output_via_mq False \
--quant_type "weight_only_int8" \
--benchmark True
```
Expand All @@ -104,6 +105,7 @@ python -m paddle.distributed.launch --gpus "0,1,2,3" deploy/qwen2_5_vl/qwen2_5_v
--mode dynamic \
--append_attn 1 \
--dtype bfloat16 \
--output_via_mq False \
--benchmark True
```

Expand Down Expand Up @@ -136,7 +138,7 @@ sh deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
| output_tokens_len | 128 tokens |

| model | Paddle Inference wint8 | Paddle Inference| PyTorch | VLLM |
| --------------------------- | --------------------- | --------------- | -------------- |-------------- |
| --------------------------- | --------------------- | --------------- | -------------- |-------------- |
| Qwen/Qwen2.5-VL-3B-Instruct | 0.994 s | 1.247 s | 4.92 s | 1.39s |
| Qwen/Qwen2.5-VL-7B-Instruct | 1.244 s | 1.768 s | 3.89 s | 1.92s |
| Qwen/Qwen2.5-VL-72B-Instruct| - | 4.806 s | - | - |
16 changes: 13 additions & 3 deletions deploy/qwen2_5_vl/scripts/qwen2_5_vl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.

export CUDA_VISIBLE_DEVICES=0
export PYTHONPATH=/root/paddlejob/workspace/env_run/output/changwenbin/PaddleMIX/PaddleNLP

export CUDA_VISIBLE_DEVICES=2
export USE_FASTER_TOP_P_SAMPLING=1

#fp16 高性能推理

python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
--model_name_or_path Qwen/Qwen2.5-VL-7B-Instruct \
--question "Describe this image." \
Expand All @@ -27,9 +32,12 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
--block_attn True \
--inference_model True \
--mode dynamic \
--append_attn 1 \
--dtype bfloat16 \
--benchmark True

--output_via_mq False \
--benchmark True



# # weight only int8 量化推理
# python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
Expand All @@ -46,6 +54,7 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
# --inference_model True \
# --mode dynamic \
# --dtype bfloat16 \
# --output_via_mq False \
# --quant_type "weight_only_int8" \
# --benchmark True

Expand All @@ -67,4 +76,5 @@ python deploy/qwen2_5_vl/qwen2_5_vl_infer.py \
# --mode dynamic \
# --append_attn 1 \
# --dtype bfloat16 \
# --output_via_mq False \
# --benchmark True
1 change: 1 addition & 0 deletions deploy/qwen2_vl/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
--inference_model True \
--mode dynamic \
--dtype bfloat16 \
--output_via_mq False \
--benchmark True

### 3.2. 文本&视频输入高性能推理
Expand Down
4 changes: 4 additions & 0 deletions deploy/qwen2_vl/scripts/qwen2_vl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

export PYTHONPATH=/root/paddlejob/workspace/env_run/output/changwenbin/PaddleMIX/PaddleNLP

export CUDA_VISIBLE_DEVICES=0
#fp16 高性能推理
Expand All @@ -29,6 +30,7 @@ python deploy/qwen2_vl/single_image_infer.py\
--inference_model True \
--mode dynamic \
--dtype bfloat16 \
--output_via_mq False \
--benchmark True


Expand All @@ -48,6 +50,7 @@ python deploy/qwen2_vl/single_image_infer.py\
# --inference_model True \
# --mode dynamic \
# --dtype bfloat16 \
# --output_via_mq False \
# --quant_type "weight_only_int8" \
# --benchmark True

Expand All @@ -67,4 +70,5 @@ python deploy/qwen2_vl/single_image_infer.py\
# --inference_model True \
# --mode dynamic \
# --dtype bfloat16 \
# --output_via_mq False \
# --benchmark True
5 changes: 5 additions & 0 deletions paddlemix/models/qwen2_5_vl/modeling_qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,6 +622,11 @@ def __init__(self, config, attn_implementation: str = "sdpa") -> None:

self.mlp = Qwen2_5_VLMLP(config, bias=True)

@paddle.incubate.jit.inference(
save_model_dir="./tmp/qwen2_5_VL",
enable_new_ir=True,
cache_static_model=True,
)
def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> paddle.Tensor:
hidden_states = hidden_states + self.attn(
self.norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
Expand Down