Skip to content

Commit c72ee5e

Browse files
committed
[Iluvatar GPU] Adapt VL model
1 parent 7c91907 commit c72ee5e

File tree

12 files changed

+341
-7
lines changed

12 files changed

+341
-7
lines changed

custom_ops/setup_ops.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,10 @@ def find_end_files(directory, end_str):
535535
"gpu_ops/token_penalty_multi_scores.cu",
536536
"gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
537537
"gpu_ops/sample_kernels/top_k_renorm_probs.cu",
538+
"gpu_ops/text_image_index_out.cu",
539+
"gpu_ops/text_image_gather_scatter.cu",
540+
"gpu_ops/extract_text_token_output.cu",
541+
"gpu_ops/set_data_ipc.cu",
538542
"iluvatar_ops/moe_dispatch.cu",
539543
"iluvatar_ops/moe_reduce.cu",
540544
"iluvatar_ops/paged_attn.cu",

docs/get_started/installation/iluvatar_gpu.md

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,3 +409,148 @@ Accuracy: 0.962
409409
Invaild: 0.000
410410
Latency: 17332.728 s
411411
```
412+
413+
# Run ERNIE-4.5-VL-28B-A3B-Paddle model on iluvatar machine
414+
415+
## Machine Preparation
416+
Running the ERNIE-4.5-VL-28B-A3B-Paddle model requires `TP=2`, so you need to prepare a machine with the following configuration:
417+
418+
| CPU | Memory | Card | Hard Disk|
419+
| :---: | :---: | :---: | :---: |
420+
| x86 | 1TB| 2xBI150| 1TB|
421+
422+
## Image Preparation
423+
Pull the Docker image
424+
425+
```bash
426+
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
427+
```
428+
429+
## Container Preparation
430+
### Start Container
431+
432+
```bash
433+
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
434+
docker exec -it paddle_infer bash
435+
```
436+
437+
/home/paddle contains the model files, *.whl packages, and scripts.
438+
439+
### Install paddle
440+
441+
```bash
442+
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
443+
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
444+
```
445+
For the latest Paddle version on Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
446+
447+
### Install FastDeploy
448+
```bash
449+
pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
450+
```
451+
452+
## Prepare the inference demo script
453+
454+
The scripts are listed below:
455+
456+
`run_demo_vl.sh`:
457+
458+
```bash
459+
#!/bin/bash
460+
export PADDLE_XCCL_BACKEND=iluvatar_gpu
461+
export INFERENCE_MSG_QUEUE_ID=232132
462+
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
463+
export FD_SAMPLING_CLASS=rejection
464+
export FD_DEBUG=1
465+
python3 run_demo_vl.py
466+
```
467+
468+
`run_demo_vl.py`:
469+
470+
```python
471+
import io
472+
import requests
473+
from PIL import Image
474+
475+
from fastdeploy.entrypoints.llm import LLM
476+
from fastdeploy.engine.sampling_params import SamplingParams
477+
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
478+
479+
480+
PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
481+
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
482+
483+
messages = [
484+
{
485+
"role": "user",
486+
"content": [
487+
{"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
488+
{"type":"text", "text":"图中的文物属于哪个年代"}
489+
]
490+
}
491+
]
492+
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
493+
images, videos = [], []
494+
for message in messages:
495+
content = message["content"]
496+
if not isinstance(content, list):
497+
continue
498+
for part in content:
499+
if part["type"] == "image_url":
500+
url = part["image_url"]["url"]
501+
image_bytes = requests.get(url).content
502+
img = Image.open(io.BytesIO(image_bytes))
503+
images.append(img)
504+
elif part["type"] == "video_url":
505+
url = part["video_url"]["url"]
506+
video_bytes = requests.get(url).content
507+
videos.append({
508+
"video": video_bytes,
509+
"max_frames": 30
510+
})
511+
512+
sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
513+
llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
514+
outputs = llm.generate(prompts={
515+
"prompt": prompt,
516+
"multimodal_data": {
517+
"image": images,
518+
"video": videos
519+
}
520+
}, sampling_params=sampling_params)
521+
# Print the results
522+
for output in outputs:
523+
prompt = output.prompt
524+
generated_text = output.outputs.text
525+
reasoning_text = output.outputs.reasoning_content
526+
print(f"generated_text={generated_text}")
527+
```
528+
529+
## Run the demo
530+
531+
```bash
532+
./run_demo_vl.sh
533+
```
534+
535+
The following logs will be printed:
536+
537+
```
538+
[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
539+
[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
540+
[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
541+
[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
542+
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
543+
unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
544+
warnings.warn(
545+
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
546+
`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn(
547+
INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
548+
Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it]
549+
Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s]
550+
INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
551+
prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
552+
generated_text=
553+
图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。
554+
555+
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
556+
```

docs/zh/get_started/installation/iluvatar_gpu.md

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,3 +409,148 @@ Accuracy: 0.962
409409
Invaild: 0.000
410410
Latency: 17332.728 s
411411
```
412+
413+
# 如何在天数机器上运行ERNIE-4.5-VL-28B-A3B-Paddle模型
414+
415+
## 准备机器
416+
首先运行ERNIE-4.5-VL-28B-A3B-Paddle模型需要`TP=2`, 所以您需要准备以下配置的机器:
417+
418+
| CPU | Memory | Card | Hard Disk|
419+
| :---: | :---: | :---: | :---: |
420+
| x86 | 1TB| 2xBI150| 1TB|
421+
422+
## 准备镜像
423+
拉取镜像:
424+
425+
```bash
426+
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
427+
```
428+
429+
## 准备容器
430+
### 启动容器
431+
432+
```bash
433+
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
434+
docker exec -it paddle_infer bash
435+
```
436+
437+
/home/paddle 为模型文件、whl包、脚本所在目录。
438+
439+
### 安装Paddle
440+
441+
```bash
442+
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
443+
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
444+
```
445+
获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
446+
447+
### 安装FastDeploy
448+
```bash
449+
pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
450+
```
451+
452+
## 准备推理demo脚本
453+
454+
脚本列表如下所示:
455+
456+
`run_demo_vl.sh`:
457+
458+
```bash
459+
#!/bin/bash
460+
export PADDLE_XCCL_BACKEND=iluvatar_gpu
461+
export INFERENCE_MSG_QUEUE_ID=232132
462+
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
463+
export FD_SAMPLING_CLASS=rejection
464+
export FD_DEBUG=1
465+
python3 run_demo_vl.py
466+
```
467+
468+
`run_demo_vl.py`:
469+
470+
```python
471+
import io
472+
import requests
473+
from PIL import Image
474+
475+
from fastdeploy.entrypoints.llm import LLM
476+
from fastdeploy.engine.sampling_params import SamplingParams
477+
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
478+
479+
480+
PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
481+
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
482+
483+
messages = [
484+
{
485+
"role": "user",
486+
"content": [
487+
{"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
488+
{"type":"text", "text":"图中的文物属于哪个年代"}
489+
]
490+
}
491+
]
492+
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
493+
images, videos = [], []
494+
for message in messages:
495+
content = message["content"]
496+
if not isinstance(content, list):
497+
continue
498+
for part in content:
499+
if part["type"] == "image_url":
500+
url = part["image_url"]["url"]
501+
image_bytes = requests.get(url).content
502+
img = Image.open(io.BytesIO(image_bytes))
503+
images.append(img)
504+
elif part["type"] == "video_url":
505+
url = part["video_url"]["url"]
506+
video_bytes = requests.get(url).content
507+
videos.append({
508+
"video": video_bytes,
509+
"max_frames": 30
510+
})
511+
512+
sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
513+
llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
514+
outputs = llm.generate(prompts={
515+
"prompt": prompt,
516+
"multimodal_data": {
517+
"image": images,
518+
"video": videos
519+
}
520+
}, sampling_params=sampling_params)
521+
# 输出结果
522+
for output in outputs:
523+
prompt = output.prompt
524+
generated_text = output.outputs.text
525+
reasoning_text = output.outputs.reasoning_content
526+
print(f"generated_text={generated_text}")
527+
```
528+
529+
## 运行demo
530+
531+
```bash
532+
./run_demo_vl.sh
533+
```
534+
535+
打印如下log:
536+
537+
```
538+
[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
539+
[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
540+
[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
541+
[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
542+
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
543+
unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
544+
warnings.warn(
545+
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
546+
`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn(
547+
INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
548+
Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it]
549+
Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s]
550+
INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
551+
prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
552+
generated_text=
553+
图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。
554+
555+
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
556+
```

fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,19 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_
8686
self.scale = 1.0 / sqrt(head_dim)
8787
self.num_layers = fd_config.model_config.num_hidden_layers
8888
self.dtype = paddle.get_default_dtype()
89+
self.enable_mm = fd_config.model_config.enable_mm
8990

9091
def init_attention_metadata(self, forward_meta: ForwardMeta):
9192
"""Initialize attention metadata so that all layers in the forward pass can reuse it."""
92-
self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
93-
self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
93+
if self.enable_mm:
94+
# VL: TODO: The first 0 may need to be replaced with batch_id
95+
# of max_num_seqs when running multiple batch case later
96+
self.rope_cos = forward_meta.rotary_embs[0, 0, 0, :, :, :]
97+
self.rope_sin = forward_meta.rotary_embs[0, 1, 0, :, :, :]
98+
else:
99+
# text
100+
self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
101+
self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
94102
self.prefill_info_dict = {}
95103
self.decode_info_dict = {}
96104
self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
@@ -115,7 +123,9 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
115123
self.prefill_info_dict["cu_seqlens_q"][1:] = forward_meta.seq_lens_encoder[
116124
self.prefill_info_dict["batch_ids"], 0
117125
]
118-
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])
126+
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(
127+
self.prefill_info_dict["cu_seqlens_q"], dtype="int32"
128+
)
119129

120130
self.tmp_buffer = paddle.zeros(
121131
[self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype

fastdeploy/model_executor/layers/rotary_embedding.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,9 @@ def __call__(self, position_ids):
411411
rot_emb[0] = cos_thw
412412
rot_emb[1] = sin_thw
413413

414+
if current_platform.is_iluvatar():
415+
rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, self.max_position, 1, self.rotary_dim])
416+
414417
return rot_emb
415418

416419

fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
from fastdeploy.model_executor.layers.utils import divide, get_tensor
3737
from fastdeploy.model_executor.utils import set_weight_attrs
38+
from fastdeploy.platforms import current_platform
3839

3940
from .activation import ACT2FN
4041
from .configuration import DFNRopeVisionTransformerConfig
@@ -174,7 +175,7 @@ def __init__(
174175
mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(),
175176
weight_attr=None,
176177
has_bias=True,
177-
fuse_matmul_bias=True,
178+
fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
178179
gather_output=False,
179180
)
180181
self.proj = RowParallelLinear(

fastdeploy/model_executor/models/ernie4_5_vl/image_op.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,11 @@
2626
text_image_gather_scatter,
2727
text_image_index_out,
2828
)
29+
elif current_platform.is_iluvatar():
30+
from fastdeploy.model_executor.ops.iluvatar import (
31+
text_image_gather_scatter,
32+
text_image_index_out,
33+
)
2934
else:
3035
raise ImportError("Unsupported platform, only support CUDA and XPU")
3136

fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
scatter_axis,
3232
)
3333
from fastdeploy.model_executor.utils import set_weight_attrs
34+
from fastdeploy.platforms import current_platform
3435

3536

3637
class ScatterOp(PyLayer):
@@ -172,7 +173,7 @@ def __init__(
172173
self.spatial_dim,
173174
input_is_parallel=True,
174175
has_bias=True,
175-
fuse_matmul_bias=True,
176+
fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
176177
)
177178
if self.tensor_parallel_degree > 1
178179
else nn.Linear(self.spatial_dim, self.spatial_dim)

0 commit comments

Comments
 (0)