[Iluvatar GPU] Adapt VL model

wuyujiji · wuyujiji · commit c72ee5e818a4 · 2025-09-29T11:36:42.000+08:00
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
@@ -535,6 +535,10 @@ def find_end_files(directory, end_str):
                 "gpu_ops/token_penalty_multi_scores.cu",
                 "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
                 "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
+                "gpu_ops/text_image_index_out.cu",
+                "gpu_ops/text_image_gather_scatter.cu",
+                "gpu_ops/extract_text_token_output.cu",
+                "gpu_ops/set_data_ipc.cu",
                 "iluvatar_ops/moe_dispatch.cu",
                 "iluvatar_ops/moe_reduce.cu",
                 "iluvatar_ops/paged_attn.cu",
diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
 Invaild: 0.000
 Latency: 17332.728 s
 ```
+
+# Run ERNIE-4.5-VL-28B-A3B-Paddle model on iluvatar machine
+
+## Machine Preparation
+First, running the ERNIE-4.5-VL-28B-A3B-Paddle model requires `TP=2`, so you need to prepare a machine with the following configuration:
+
+| CPU | Memory | Card | Hard Disk|
+| :---: | :---: | :---: | :---: |
+| x86 | 1TB| 2xBI150| 1TB|
+
+## Image Preparation
+Pull the Docker image
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## Container Preparation
+### Start Container
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+/home/paddle contains the model files, *.whl packages, and scripts.
+
+### Install paddle
+
+```bash
+pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+```
+For the latest Paddle version for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
+
+### Install FastDeploy
+```bash
+pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
+```
+
+## Prepare the inference demo script
+
+The scripts are listed below:
+
+`run_demo_vl.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 run_demo_vl.py
+```
+
+`run_demo_vl.py`:
+
+```python
+import io
+import requests
+from PIL import Image
+
+from fastdeploy.entrypoints.llm import LLM
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+
+
+PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
+tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
+            {"type":"text", "text":"图中的文物属于哪个年代"}
+        ]
+     }
+]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+images, videos = [], []
+for message in messages:
+    content = message["content"]
+    if not isinstance(content, list):
+        continue
+    for part in content:
+        if part["type"] == "image_url":
+            url = part["image_url"]["url"]
+            image_bytes = requests.get(url).content
+            img = Image.open(io.BytesIO(image_bytes))
+            images.append(img)
+        elif part["type"] == "video_url":
+            url = part["video_url"]["url"]
+            video_bytes = requests.get(url).content
+            videos.append({
+                "video": video_bytes,
+                "max_frames": 30
+            })
+
+sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
+llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
+outputs = llm.generate(prompts={
+    "prompt": prompt,
+    "multimodal_data": {
+        "image": images,
+        "video": videos
+    }
+}, sampling_params=sampling_params)
+# Print the results
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs.text
+    reasoning_text = output.outputs.reasoning_content
+    print(f"generated_text={generated_text}")
+```
+
+## Run demo
+
+```bash
+./run_demo_vl.sh
+```
+
+The following logs will be printed:
+
+```
+[2025-09-23 10:13:10,844] [    INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,844] [    INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
+[2025-09-23 10:13:10,845] [    INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,845] [    INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
+unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+  warnings.warn(
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
+`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.                                                                                                            warnings.warn(
+INFO     2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
+Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00,  1.41s/it]
+Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.65it/s]
+INFO     2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
+prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
+generated_text=
+图中的文物是**北齐释迦牟尼佛像**，属于**北齐（公元550年－577年）**的文物。
+
+这件佛像具有典型的北齐风格，佛像结跏趺坐于莲花座上，身披通肩袈裟，面部圆润，神态安详，体现了北齐佛教艺术的独特魅力。
+```
diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md
@@ -409,3 +409,148 @@ Accuracy: 0.962
 Invaild: 0.000
 Latency: 17332.728 s
 ```
+
+# 如何在天数机器上运行ERNIE-4.5-VL-28B-A3B-Paddle模型
+
+## 准备机器
+首先，运行ERNIE-4.5-VL-28B-A3B-Paddle模型需要`TP=2`，所以您需要准备以下配置的机器：
+
+| CPU | Memory | Card | Hard Disk|
+| :---: | :---: | :---: | :---: |
+| x86 | 1TB| 2xBI150| 1TB|
+
+## 准备镜像
+拉取镜像：
+
+```bash
+docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+```
+
+## 准备容器
+### 启动容器
+
+```bash
+docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker exec -it paddle_infer bash
+```
+
+/home/paddle 为模型文件、whl包、脚本所在目录。
+
+### 安装paddle
+
+```bash
+pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
+pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+```
+获取Paddle的最新安装版本： [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
+
+### 安装FastDeploy
+```bash
+pip3 install fastdeploy_iluvatar_gpu==2.3.0.dev0 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/ --extra-index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simplels
+```
+
+## 准备推理demo脚本
+
+脚本列表如下所示:
+
+`run_demo_vl.sh`:
+
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 run_demo_vl.py
+```
+
+`run_demo_vl.py`:
+
+```python
+import io
+import requests
+from PIL import Image
+
+from fastdeploy.entrypoints.llm import LLM
+from fastdeploy.engine.sampling_params import SamplingParams
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+
+
+PATH = "/home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle"
+tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type":"image_url", "image_url": {"url":"https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
+            {"type":"text", "text":"图中的文物属于哪个年代"}
+        ]
+     }
+]
+prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+images, videos = [], []
+for message in messages:
+    content = message["content"]
+    if not isinstance(content, list):
+        continue
+    for part in content:
+        if part["type"] == "image_url":
+            url = part["image_url"]["url"]
+            image_bytes = requests.get(url).content
+            img = Image.open(io.BytesIO(image_bytes))
+            images.append(img)
+        elif part["type"] == "video_url":
+            url = part["video_url"]["url"]
+            video_bytes = requests.get(url).content
+            videos.append({
+                "video": video_bytes,
+                "max_frames": 30
+            })
+
+sampling_params = SamplingParams(temperature=0.1, max_tokens=6400)
+llm = LLM(model=PATH, tensor_parallel_size=2, max_model_len=32768, block_size=16, quantization="wint8", limit_mm_per_prompt={"image": 100}, reasoning_parser="ernie-45-vl")
+outputs = llm.generate(prompts={
+    "prompt": prompt,
+    "multimodal_data": {
+        "image": images,
+        "video": videos
+    }
+}, sampling_params=sampling_params)
+# 输出结果
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs.text
+    reasoning_text = output.outputs.reasoning_content
+    print(f"generated_text={generated_text}")
+```
+
+## 运行demo
+
+```bash
+./run_demo_vl.sh
+```
+
+打印如下log:
+
+```
+[2025-09-23 10:13:10,844] [    INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,844] [    INFO] - loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/preprocessor_config.json
+[2025-09-23 10:13:10,845] [    INFO] - Using download source: huggingface
+[2025-09-23 10:13:10,845] [    INFO] - Loading configuration file /home/paddle/ERNIE-4.5-VL-28B-A3B-Paddle/generation_config.json
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:250: UserWarning: using greedy search strategy. However, `temperature` is set to `0.2` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or
+unset `temperature`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.
+  warnings.warn(
+/usr/local/lib/python3.10/site-packages/paddleformers/generation/configuration_utils.py:255: UserWarning: using greedy search strategy. However, `top_p` is set to `0.8` -- this flag is only used in sample-based generation modes. You should set `decode_strategy="greedy_search" ` or unset
+`top_p`. This was detected when initializing the generation config instance, which means the corresponding file may hold incorrect parameterization and should be fixed.                                                                                                            warnings.warn(
+INFO     2025-09-23 10:13:11,969 3880245 engine.py[line:136] Waiting worker processes ready...
+Loading Weights: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [02:21<00:00,  1.41s/it]
+Loading Layers: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:15<00:00,  6.65it/s]
+INFO     2025-09-23 10:15:53,672 3880245 engine.py[line:173] Worker processes are launched with 181.2426426410675 seconds.
+prompts: 100%|███████████████████████████████████| 1/1 [01:52<00:00, 112.74s/it, est. speed input: 0.00 toks/s, output: 0.00 toks/s]
+generated_text=
+图中的文物是**北齐释迦牟尼佛像**，属于**北齐（公元550年－577年）**的文物。
+
+这件佛像具有典型的北齐风格，佛像结跏趺坐于莲花座上，身披通肩袈裟，面部圆润，神态安详，体现了北齐佛教艺术的独特魅力。
+```
diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
@@ -86,11 +86,19 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int, head_
         self.scale = 1.0 / sqrt(head_dim)
         self.num_layers = fd_config.model_config.num_hidden_layers
         self.dtype = paddle.get_default_dtype()
+        self.enable_mm = fd_config.model_config.enable_mm
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""
-        self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
-        self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
+        if self.enable_mm:
+            # VL: TODO: The first 0 may need to be replaced with batch_id
+            # of max_num_seqs when running multiple batch case later
+            self.rope_cos = forward_meta.rotary_embs[0, 0, 0, :, :, :]
+            self.rope_sin = forward_meta.rotary_embs[0, 1, 0, :, :, :]
+        else:
+            # text
+            self.rope_cos = forward_meta.rotary_embs[0, 0, :, :, :]
+            self.rope_sin = forward_meta.rotary_embs[1, 0, :, :, :]
         self.prefill_info_dict = {}
         self.decode_info_dict = {}
         self.prefill_info_dict["batch_ids"] = paddle.where(forward_meta.seq_lens_encoder)[0]
@@ -115,7 +123,9 @@ def init_attention_metadata(self, forward_meta: ForwardMeta):
             self.prefill_info_dict["cu_seqlens_q"][1:] = forward_meta.seq_lens_encoder[
                 self.prefill_info_dict["batch_ids"], 0
             ]
-            self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(self.prefill_info_dict["cu_seqlens_q"])
+            self.prefill_info_dict["cu_seqlens_q"] = paddle.cumsum(
+                self.prefill_info_dict["cu_seqlens_q"], dtype="int32"
+            )
 
             self.tmp_buffer = paddle.zeros(
                 [self.prefill_num_tokens + self.decode_len, self.hidden_dim], dtype=self.dtype
diff --git a/fastdeploy/model_executor/layers/rotary_embedding.py b/fastdeploy/model_executor/layers/rotary_embedding.py
@@ -411,6 +411,9 @@ def __call__(self, position_ids):
         rot_emb[0] = cos_thw
         rot_emb[1] = sin_thw
 
+        if current_platform.is_iluvatar():
+            rot_emb = paddle.stack([rot_emb, rot_emb], axis=-1).reshape([2, 1, self.max_position, 1, self.rotary_dim])
+
         return rot_emb
 
 
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py b/fastdeploy/model_executor/models/ernie4_5_vl/dfnrope/modeling.py
@@ -35,6 +35,7 @@
 
 from fastdeploy.model_executor.layers.utils import divide, get_tensor
 from fastdeploy.model_executor.utils import set_weight_attrs
+from fastdeploy.platforms import current_platform
 
 from .activation import ACT2FN
 from .configuration import DFNRopeVisionTransformerConfig
@@ -174,7 +175,7 @@ def __init__(
                 mp_group=fleet.get_hybrid_communicate_group().get_model_parallel_group(),
                 weight_attr=None,
                 has_bias=True,
-                fuse_matmul_bias=True,
+                fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
                 gather_output=False,
             )
             self.proj = RowParallelLinear(
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py b/fastdeploy/model_executor/models/ernie4_5_vl/image_op.py
@@ -26,6 +26,11 @@
         text_image_gather_scatter,
         text_image_index_out,
     )
+elif current_platform.is_iluvatar():
+    from fastdeploy.model_executor.ops.iluvatar import (
+        text_image_gather_scatter,
+        text_image_index_out,
+    )
 else:
     raise ImportError("Unsupported platform, only support CUDA and XPU")
 
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py b/fastdeploy/model_executor/models/ernie4_5_vl/modeling_resampler.py
@@ -31,6 +31,7 @@
     scatter_axis,
 )
 from fastdeploy.model_executor.utils import set_weight_attrs
+from fastdeploy.platforms import current_platform
 
 
 class ScatterOp(PyLayer):
@@ -172,7 +173,7 @@ def __init__(
                         self.spatial_dim,
                         input_is_parallel=True,
                         has_bias=True,
-                        fuse_matmul_bias=True,
+                        fuse_matmul_bias=False if current_platform.is_iluvatar() else True,
                     )
                     if self.tensor_parallel_degree > 1
                     else nn.Linear(self.spatial_dim, self.spatial_dim)
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py
diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@`
`31`	`31`	`scatter_axis,`
`32`	`32`	`)`
`33`	`33`	`from fastdeploy.model_executor.utils import set_weight_attrs`
	`34`	`+from fastdeploy.platforms import current_platform`
`34`	`35`
`35`	`36`
`36`	`37`	`class ScatterOp(PyLayer):`
`@@ -172,7 +173,7 @@ def __init__(`
`172`	`173`	`self.spatial_dim,`
`173`	`174`	`input_is_parallel=True,`
`174`	`175`	`has_bias=True,`
`175`		`- fuse_matmul_bias=True,`
	`176`	`+ fuse_matmul_bias=False if current_platform.is_iluvatar() else True,`
`176`	`177`	`)`
`177`	`178`	`if self.tensor_parallel_degree > 1`
`178`	`179`	`else nn.Linear(self.spatial_dim, self.spatial_dim)`