4 changes: 4 additions & 0 deletions custom_ops/setup_ops.py
@@ -535,6 +535,10 @@ def find_end_files(directory, end_str):
"gpu_ops/token_penalty_multi_scores.cu",
"gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
"gpu_ops/sample_kernels/top_k_renorm_probs.cu",
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/extract_text_token_output.cu",
"gpu_ops/set_data_ipc.cu",
"iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu",
145 changes: 145 additions & 0 deletions docs/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```

# Run the ERNIE-4.5-VL-28B-A3B-Paddle model on an Iluvatar machine

## Machine Preparation
Running the ERNIE-4.5-VL-28B-A3B-Paddle model requires `TP=2`, so you need to prepare a machine with the following configuration:

| CPU | Memory | Card | Hard Disk|
| :---: | :---: | :---: | :---: |
| x86 | 1TB| 2xBI150| 1TB|

## Image Preparation
Pull the Docker image:

```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
```

## Container Preparation
### Start Container

```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```

`/home/paddle` contains the model files, `*.whl` packages, and scripts.
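
For reference, a layout such as the following works; the exact names below are only illustrative:

```bash
# Illustrative layout only -- the names below are examples, not requirements
ls /home/paddle
# ERNIE-4.5-VL-28B-A3B-Paddle/    <- model weights and tokenizer files
# run_demo_vl.sh  run_demo_vl.py  <- demo scripts prepared below
```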

### Install Paddle

```bash
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
```
For the latest Paddle build for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
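
Optionally, verify that the Iluvatar plugin is picked up before moving on. This is a minimal sanity-check sketch: `paddle.utils.run_check()` is Paddle's built-in self-test, and the custom-device name assumed here is `iluvatar_gpu` (the exact string reported by your build may differ).

```python
import paddle

# List the custom device types registered by the paddle-iluvatar-gpu plugin,
# e.g. something like ["iluvatar_gpu"].
print(paddle.device.get_all_custom_device_type())

# Run Paddle's built-in installation self-check.
paddle.utils.run_check()
```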

### Install FastDeploy
```bash
pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
```
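
As a quick import check (a minimal sketch, assuming the wheel exposes `__version__` like most Python packages):

```bash
python3 -c "import fastdeploy; print(fastdeploy.__version__)"
```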

## Prepare the inference demo script

The scripts are listed below.

`run_demo_vl.sh`:

```bash
#!/bin/bash
# Select the Iluvatar custom-device backend for Paddle
export PADDLE_XCCL_BACKEND=iluvatar_gpu
# Message-queue ID used by the FastDeploy engine and worker processes
export INFERENCE_MSG_QUEUE_ID=232132
# Preload the Iluvatar CoreX CUDA-compatible driver library
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
# Use FastDeploy's rejection sampling implementation
export FD_SAMPLING_CLASS=rejection
# Enable verbose debug logging
export FD_DEBUG=1
python3 run_demo_vl.py
```

`run_demo_vl.py`:

```python
import io
import requests
from PIL import Image

from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer


PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
            {"type": "text", "text": "图中的文物属于哪个年代"}
        ]
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
images, videos = [], []
for message in messages:
    content = message["content"]
    if not isinstance(content, list):
        continue
    for part in content:
        if part["type"] == "image_url":
            url = part["image_url"]["url"]
            image_bytes = requests.get(url).content
            img = Image.open(io.BytesIO(image_bytes))
            images.append(img)
        elif part["type"] == "video_url":
            url = part["video_url"]["url"]
            video_bytes = requests.get(url).content
            videos.append({
                "video": video_bytes,
                "max_frames": 30
            })

sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
# tensor_parallel_size=2 matches the two BI150 cards (TP=2); "wint8" enables weight-only INT8 quantization
llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
outputs = llm.generate(prompts={
    "prompt": prompt,
    "multimodal_data": {
        "image": images,
        "video": videos
    }
}, sampling_params=sampling_params)
# Print the results
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    reasoning_text = output.outputs.reasoning_content
    print(f"generated_text={generated_text}")
```

## Run the demo

```bash
./run_demo_vl.sh
```

The following logs will be printed:

```
[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
warnings.warn(
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn(
INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it]
Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s]
INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
generated_text=
图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
```
145 changes: 145 additions & 0 deletions docs/zh/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
Invaild: 0.000
Latency: 17332.728 s
```

# How to run the ERNIE-4.5-VL-28B-A3B-Paddle model on an Iluvatar machine

## Machine Preparation
Running the ERNIE-4.5-VL-28B-A3B-Paddle model requires `TP=2`, so you need to prepare a machine with the following configuration:

| CPU | Memory | Card | Hard Disk|
| :---: | :---: | :---: | :---: |
| x86 | 1TB| 2xBI150| 1TB|

## Image Preparation
Pull the Docker image:

```bash
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
```

## Container Preparation
### Start Container

```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```

`/home/paddle` contains the model files, `*.whl` packages, and scripts.

### Install Paddle

```bash
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
```
For the latest Paddle build for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
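
Optionally, verify that the Iluvatar plugin is picked up before moving on. This is a minimal sanity-check sketch: `paddle.utils.run_check()` is Paddle's built-in self-test, and the custom-device name assumed here is `iluvatar_gpu` (the exact string reported by your build may differ).

```python
import paddle

# List the custom device types registered by the paddle-iluvatar-gpu plugin,
# e.g. something like ["iluvatar_gpu"].
print(paddle.device.get_all_custom_device_type())

# Run Paddle's built-in installation self-check.
paddle.utils.run_check()
```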

### Install FastDeploy
```bash
pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
```
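
As a quick import check (a minimal sketch, assuming the wheel exposes `__version__` like most Python packages):

```bash
python3 -c "import fastdeploy; print(fastdeploy.__version__)"
```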

## Prepare the inference demo script

The scripts are listed below.

`run_demo_vl.sh`:

```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 run_demo_vl.py
```

`run_demo_vl.py`:

```python
import io
import requests
from PIL import Image

from fastdeploy.entrypoints.llm import LLM
from fastdeploy.engine.sampling_params import SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer


PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
            {"type": "text", "text": "图中的文物属于哪个年代"}
        ]
    }
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
images, videos = [], []
for message in messages:
    content = message["content"]
    if not isinstance(content, list):
        continue
    for part in content:
        if part["type"] == "image_url":
            url = part["image_url"]["url"]
            image_bytes = requests.get(url).content
            img = Image.open(io.BytesIO(image_bytes))
            images.append(img)
        elif part["type"] == "video_url":
            url = part["video_url"]["url"]
            video_bytes = requests.get(url).content
            videos.append({
                "video": video_bytes,
                "max_frames": 30
            })

sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
outputs = llm.generate(prompts={
    "prompt": prompt,
    "multimodal_data": {
        "image": images,
        "video": videos
    }
}, sampling_params=sampling_params)
# Print the results
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs.text
    reasoning_text = output.outputs.reasoning_content
    print(f"generated_text={generated_text}")
```

## Run the demo

```bash
./run_demo_vl.sh
```

The following logs will be printed:

```
[2025-09-23 10:13:10,844] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,844] [ INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
[2025-09-23 10:13:10,845] [ INFO] - Using download source: huggingface
[2025-09-23 10:13:10,845] [ INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
warnings.warn(
/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed. warnings.warn(
INFO 2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00, 1.41s/it]
Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00, 6.65it/s]
INFO 2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
generated_text=
图中的文物是**北齐释迦牟尼佛像**,属于**北齐(公元550年-577年)**的文物。

这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
```
@@ -86,11 +86,19 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_
self.scale = 1.0 / sqrt(head_dim)
self.num_layers = fd_config.model_config.num_hidden_layers
self.dtype = paddle.get_default_dtype()
self.enable_mm = fd_config.model_config.enable_mm

def init_attention_metadata(self, forward_meta: ForwardMeta):
"""Initialize attntion metadata hence all layers in the forward pass can reuse it."""
self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
if self.enable_mm:
# VL TODO: the first 0 may need to be replaced with a batch_id
# (up to max_num_seqs) when running the multi-batch case later
self.rope_cos = forward_meta.rotary_embs[0, 0, 0, :, :, :]
self.rope_sin = forward_meta.rotary_embs[0, 1, 0, :, :, :]
else:
# text
self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
self.prefill_info_dict = {}
self.decode_info_dict = {}
self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
@@ -115,7 +123,9 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
self.prefill_info_dict["cu_seqlens_q"][1:] = forward_meta.seq_lens_encoder[
self.prefill_info_dict["batch_ids"], 0
]
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])
self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(
self.prefill_info_dict["cu_seqlens_q"], dtype="int32"
)

self.tmp_buffer = paddle.zeros(
[self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype
3 changes: 3 additions & 0 deletions fastdeploy/model_executor/layers/rotary_embedding.py
@@ -411,6 +411,9 @@ def __call__(self, position_ids):
rot_emb[0] = cos_thw
rot_emb[1] = sin_thw

if current_platform.is_iluvatar():
rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, self.max_position, 1, self.rotary_dim])

return rot_emb


@@ -35,6 +35,7 @@

from fastdeploy.model_executor.layers.utils import divide, get_tensor
from fastdeploy.model_executor.utils import set_weight_attrs
from fastdeploy.platforms import current_platform

from .activation import ACT2FN
from .configuration import DFNRopeVisionTransformerConfig
@@ -174,7 +175,7 @@ def __init__(
mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(),
weight_attr=None,
has_bias=True,
fuse_matmul_bias=True,
fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
gather_output=False,
)
self.proj = RowParallelLinear(
5 changes: 5 additions & 0 deletions fastdeploy/model_executor/models/ernie4_5_vl/image_op.py
@@ -26,6 +26,11 @@
text_image_gather_scatter,
text_image_index_out,
)
elif current_platform.is_iluvatar():
from fastdeploy.model_executor.ops.iluvatar import (
text_image_gather_scatter,
text_image_index_out,
)
else:
raise ImportError("Unsupported platform, only support CUDA and XPU")

@@ -31,6 +31,7 @@
scatter_axis,
)
from fastdeploy.model_executor.utils import set_weight_attrs
from fastdeploy.platforms import current_platform


class ScatterOp(PyLayer):
@@ -172,7 +173,7 @@ def __init__(
self.spatial_dim,
input_is_parallel=True,
has_bias=True,
fuse_matmul_bias=True,
fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
)
if self.tensor_parallel_degree > 1
else nn.Linear(self.spatial_dim, self.spatial_dim)