diff --git a/.github/workflows/deploy_docs.yml b/.github/workflows/deploy_docs.yml
index a95e48a2fe..209027caa4 100644
--- a/.github/workflows/deploy_docs.yml
+++ b/.github/workflows/deploy_docs.yml
@@ -2,7 +2,7 @@ name: Develop Docs
on:
push:
branches: # set which branch's updates will update the site
- - release/3.2
+ - release/3.3
permissions:
contents: write
jobs:
@@ -27,5 +27,5 @@ jobs:
- run: pip install mike mkdocs-material jieba mkdocs-git-revision-date-localized-plugin mkdocs-git-committers-plugin-2 mkdocs-git-authors-plugin mkdocs-static-i18n mkdocs-minify-plugin
- run: |
git fetch origin gh-pages --depth=1
- mike deploy --push --update-aliases 3.2 latest
+ mike deploy --push --update-aliases 3.3 latest
mike set-default --push latest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d522ad3863..f480361043 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -55,7 +55,7 @@ repos:
- id: isort
args:
- --profile=black
- exclude: ^deploy/ultra-infer/python/ultra_infer/
+ files: ^paddlex/
# check license
- repo: local
diff --git a/.precommit/check_imports.py b/.precommit/check_imports.py
index 2e736cb201..613f4f94d9 100644
--- a/.precommit/check_imports.py
+++ b/.precommit/check_imports.py
@@ -24,13 +24,13 @@
from stdlib_list import stdlib_list
sys.path.append(str(pathlib.Path(__file__).parent.parent))
-from setup import DEP_SPECS, REQUIRED_DEPS
+from setup import REQUIRED_DEPS
# NOTE: We do not use `importlib.metadata.packages_distributions` here because
# 1. It is supported only in Python 3.10+.
# 2. It requires the packages to be installed, but we are doing a static check.
MOD_TO_DEP = {
- "aistudio_sdk": "aistudio_sdk",
+ "aistudio_sdk": "aistudio-sdk",
"aiohttp": "aiohttp",
"baidubce": "bce-python-sdk",
"bs4": "beautifulsoup4",
@@ -43,9 +43,10 @@
"fastapi": "fastapi",
"filelock": "filelock",
"filetype": "filetype",
+ "flash_attn": "flash-attn",
"ftfy": "ftfy",
"GPUtil": "GPUtil",
- "huggingface_hub": "huggingface_hub",
+ "huggingface_hub": "huggingface-hub",
"imagesize": "imagesize",
"jinja2": "Jinja2",
"joblib": "joblib",
@@ -61,6 +62,7 @@
"cv2": "opencv-contrib-python",
"openpyxl": "openpyxl",
"packaging": "packaging",
+ "paddle2onnx": "paddle2onnx",
"pandas": "pandas",
"PIL": "pillow",
"premailer": "premailer",
@@ -74,22 +76,28 @@
"regex": "regex",
"requests": "requests",
"ruamel.yaml": "ruamel.yaml",
+ "safetensors": "safetensors",
"skimage": "scikit-image",
"sklearn": "scikit-learn",
+ "sentencepiece": "sentencepiece",
+ "sglang": "sglang",
"shapely": "shapely",
"soundfile": "soundfile",
"starlette": "starlette",
"tiktoken": "tiktoken",
"tokenizers": "tokenizers",
+ "torch": "torch",
"tqdm": "tqdm",
+ "transformers": "transformers",
"typing_extensions": "typing-extensions",
"ujson": "ujson",
"uvicorn": "uvicorn",
+ "uvloop": "uvloop",
+ "vllm": "vllm",
+ "xformers": "xformers",
"yarl": "yarl",
+ "bidi": "python-bidi",
}
-assert (
- set(MOD_TO_DEP.values()) == DEP_SPECS.keys()
-), f"`MOD_TO_DEP` should be updated to match `DEP_SPECS`. Symmetric difference: {set(MOD_TO_DEP.values()) ^ DEP_SPECS.keys()}"
MOD_PATTERN = re.compile(
rf"^(?:{'|'.join([re.escape(mod) for mod in MOD_TO_DEP])})(?=\.|$)"
)
@@ -107,7 +115,11 @@
"paddle3d",
"paddlevideo",
}
-MANUALLY_MANAGED_OPTIONAL_HEAVY_MODS = {"paddle_custom_device", "ultra_infer"}
+MANUALLY_MANAGED_OPTIONAL_HEAVY_MODS = {
+ "paddle_custom_device",
+ "ultra_infer",
+ "fastdeploy",
+}
def check(file_path):
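
For reference, a small standalone sketch (illustrative module names only) of how the `MOD_TO_DEP` table and `MOD_PATTERN` regex above resolve an imported module to the distribution that provides it:

```python
import re

# Illustrative subset of the MOD_TO_DEP table above: imported module -> PyPI distribution.
MOD_TO_DEP = {
    "huggingface_hub": "huggingface-hub",
    "bidi": "python-bidi",
    "cv2": "opencv-contrib-python",
}
# Same construction as MOD_PATTERN in check_imports.py.
MOD_PATTERN = re.compile(
    rf"^(?:{'|'.join([re.escape(mod) for mod in MOD_TO_DEP])})(?=\.|$)"
)

def dist_for(module_name):
    """Return the distribution a top-level import resolves to, or None."""
    m = MOD_PATTERN.match(module_name)
    return MOD_TO_DEP[m.group(0)] if m else None

print(dist_for("huggingface_hub.utils"))  # huggingface-hub
print(dist_for("bidi"))                   # python-bidi
print(dist_for("cv2extra"))               # None (no match unless followed by "." or end of string)
```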
diff --git a/README.md b/README.md
index 681d8620e4..1f13e06436 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,10 @@ PaddleX 3.0 是基于飞桨框架构建的低代码开发工具,它集成了
## 📣 近期更新
+🔥🔥 **2025.10.16, PaddleX v3.3.0 Released**, with the following new capabilities:
+
+- **Added support for inference and deployment of PaddleOCR-VL and PP-OCRv5 multilingual models.**
+
🔥🔥 **2025.8.20, PaddleX v3.2.0 Released**, with the following new capabilities:
- **Deployment Capability Upgrades:**
diff --git a/README_en.md b/README_en.md
index f3f66890b4..4553fc02b8 100644
--- a/README_en.md
+++ b/README_en.md
@@ -37,6 +37,10 @@ PaddleX 3.0 is a low-code development tool for AI models built on the PaddlePadd
## 📣 Recent Updates
+🔥🔥 **2025.10.16, PaddleX v3.3.0 Released**
+
+- **Added support for inference and deployment of PaddleOCR-VL and PP-OCRv5 multilingual models.**
+
🔥🔥 **2025.8.20, PaddleX v3.2.0 Released**
- **Deployment Capability Upgrades:**
diff --git a/api_examples/pipelines/test_pp_ocr_vl.py b/api_examples/pipelines/test_pp_ocr_vl.py
new file mode 100644
index 0000000000..af71e90b6b
--- /dev/null
+++ b/api_examples/pipelines/test_pp_ocr_vl.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+
+output = pipeline.predict(
+ "/paddle/project/PaddleX/demo_paper.png",
+ use_doc_orientation_classify=False,
+ use_doc_unwarping=False,
+)
+
+for res in output:
+ res.print()
+ res.save_to_img("./output")
+ res.save_to_json("./output")
+ res.save_to_xlsx("./output")
+ res.save_to_html("./output")
+ res.save_to_markdown("./output", pretty=False)
diff --git a/deploy/hps/README.md b/deploy/hps/README.md
index 755271fa8f..06d0bae8a7 100644
--- a/deploy/hps/README.md
+++ b/deploy/hps/README.md
@@ -10,9 +10,9 @@ comments: true
**Please note that this project depends on the following environment configuration:**
+- **CPU Architecture**: x86-64
- **Operating System**: Linux
-- **Docker Version**: `>= 20.10.0`, used for image building and deployment
-- **CPU Architecture**: x86-64
+- **Docker Engine Version**: `>= 20.10.0`, used for image building and deployment
This document mainly introduces how to set up the high-stability serving environment and package the materials using the scripts provided by this project. The overall process consists of two stages:
@@ -48,7 +48,7 @@ comments: true
To make the build results more reproducible, this step pins the dependencies to exact versions. Switch to the `server_env` directory and run the following script:
```bash
-./script/freeze_requirements.sh
+./scripts/freeze_requirements.sh
```
This script invokes `pip-tools compile` to resolve the dependency source files and ultimately generates a series of `.txt` files (such as `requirements/gpu.txt`, `requirements/cpu.txt`, etc.), which provide the dependency version constraints for [1.3 Image Building](./README.md#13-镜像构建).
@@ -85,7 +85,7 @@ comments: true
For Triton Server, the project uses a pre-compiled version that is downloaded automatically during image building, so no manual download is needed. Taking building the GPU image as an example, run the following command in the `server_env` directory:
```bash
-./scripts/build_deployment_image.sh -k gpu -t latest-gpu
+./scripts/build_deployment_image.sh -k gpu -t latest-gpu
```
The configuration options for building the image include
@@ -118,10 +118,10 @@ comments: true
After successful execution, the command line will output the following message:
```text
- => => exporting to image
- => => exporting layers
- => => writing image sha256:ba3d0b2b079d63ee0239a99043fec7e25f17bf2a7772ec2fc80503c1582b3459
- => => naming to ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-gpu
+ => => exporting to image
+ => => exporting layers
+ => => writing image sha256:ba3d0b2b079d63ee0239a99043fec7e25f17bf2a7772ec2fc80503c1582b3459
+ => => naming to ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-gpu
```
To build both the GPU and CPU images in one batch, run the following command:
@@ -172,7 +172,7 @@ comments: true
-After invocation, the results are stored under the `/output` path in the current directory.
+After invocation, the results are stored under the `output` path in the current directory.
## 3.FAQ
diff --git a/deploy/hps/README_en.md b/deploy/hps/README_en.md
index 24efc3d3b5..cb842341a2 100644
--- a/deploy/hps/README_en.md
+++ b/deploy/hps/README_en.md
@@ -11,9 +11,9 @@ This project provides a high-stability serving solution, consisting of two main
**Note: This project relies on the following environment configurations:**
-- **Operating System**: Linux
-- **Docker Version**: `>= 20.10.0` (Used for image building and deployment)
- **CPU Architecture**: x86-64
+- **Operating System**: Linux
+- **Docker Engine Version**: `>= 20.10.0` (Used for image building and deployment)
This document mainly introduces how to set up a high stability serving environment and package related materials using the scripts provided by this project. The overall process consists of two main stages:
@@ -32,13 +32,13 @@ Image Building Steps:
1. Build a requirement collection image. (Optional)
2. Freeze requirement versions to improve the reproducibility of deployment image building. (Optional)
-3. Build the deployment image based on the frozen requirement information to generate the final deployment image and provide image support for subsequent pipeline execution.
+3. Build the deployment image based on the frozen requirement information to generate the final deployment image and provide image support for subsequent pipeline execution.
**If you do not need to modify requirement-related information, you can skip to [1.3 Building Image](./README_en.md#13-building-image) to build the deployment image using cached requirement information.**
## 1.1 Build the Requirement Collection Image (Optional)
-Navigate to the `server_env` directory and run follow script for building the requirement collection image in this directory.
+Navigate to the `server_env` directory and run the following script in this directory to build the requirement collection image.
```bash
./scripts/prepare_rc_image.sh
@@ -121,10 +121,10 @@ If the basic image cannot be pulled, please refer to the solutions in the [FAQ](
After running successfully, the command line will display the following message:
```text
- => => exporting to image
- => => exporting layers
- => => writing image sha256:ba3d0b2b079d63ee0239a99043fec7e25f17bf2a7772ec2fc80503c1582b3459
- => => naming to ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-gpu
+ => => exporting to image
+ => => exporting layers
+ => => writing image sha256:ba3d0b2b079d63ee0239a99043fec7e25f17bf2a7772ec2fc80503c1582b3459
+ => => naming to ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:latest-gpu
```
To build both GPU and CPU images, run the following command:
@@ -140,7 +140,7 @@ This stage mainly introduces how to package pipeline materials. This function is
- `client`: Responsible for invoking the model services.
- `server`: Deployed using the images built in [1. Image Building](./README_en.md#1-image-building), serving as the runtime environment for model services.
-Before packaging the pipeline materials, you need to switch to the `sdk` directory and run the `scripts/assemble.sh` script in this directory for packaging. For example, to package the general OCR pipeline, run:
+Before packaging the pipeline materials, you need to switch to the `sdk` directory and run the `scripts/assemble.sh` script in this directory for packaging. For example, to package the general OCR pipeline, run:
```bash
./scripts/assemble.sh OCR
@@ -175,7 +175,7 @@ The parameters for the packaging script are described as follows:
-After run successfully, the packaged will be stored in the `/output` directory.
+After running successfully, the packaged materials will be stored in the `output` directory.
## 3. FAQ
@@ -191,4 +191,4 @@ When running the image build scripts, you can use the `-p` parameter to specify
```bash
./scripts/prepare_rc_image.sh -p https://pypi.tuna.tsinghua.edu.cn/simple
-```
\ No newline at end of file
+```
diff --git a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/__init__.py b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/__init__.py
index 8d326bec49..34cc032650 100644
--- a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/__init__.py
+++ b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/__init__.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from importlib import metadata as _metadata
from .request import triton_request
diff --git a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/constants.py b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/constants.py
index 8d3a5d9359..2c1c6322cc 100644
--- a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/constants.py
+++ b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/constants.py
@@ -1,2 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
INPUT_NAME = "input"
OUTPUT_NAME = "output"
diff --git a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/request.py b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/request.py
index 4b63ef3f34..5cdc0d2954 100644
--- a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/request.py
+++ b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/request.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import json
import numpy as np
diff --git a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/utils.py b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/utils.py
index 11871045d6..448e56712b 100644
--- a/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/utils.py
+++ b/deploy/hps/sdk/paddlex-hps-client/src/paddlex_hps_client/utils.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import base64
import mimetypes
import shutil
diff --git a/deploy/hps/sdk/pipelines/3d_bev_detection/client/client.py b/deploy/hps/sdk/pipelines/3d_bev_detection/client/client.py
index b1a1b89bbb..f0145939bc 100755
--- a/deploy/hps/sdk/pipelines/3d_bev_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/3d_bev_detection/client/client.py
@@ -1,12 +1,27 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
-from paddlex_hps_client import triton_request, utils
from tritonclient import grpc as triton_grpc
+from paddlex_hps_client import triton_request, utils
+
def main():
parser = argparse.ArgumentParser()
diff --git a/deploy/hps/sdk/pipelines/3d_bev_detection/server/model_repo/bev-3d-object-detection/1/model.py b/deploy/hps/sdk/pipelines/3d_bev_detection/server/model_repo/bev-3d-object-detection/1/model.py
index 47023c91b4..bd7a91baba 100644
--- a/deploy/hps/sdk/pipelines/3d_bev_detection/server/model_repo/bev-3d-object-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/3d_bev_detection/server/model_repo/bev-3d-object-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
from typing import Any, Dict, List
diff --git a/deploy/hps/sdk/pipelines/OCR/client/client.py b/deploy/hps/sdk/pipelines/OCR/client/client.py
index d5632c7a06..a09e26ab1e 100755
--- a/deploy/hps/sdk/pipelines/OCR/client/client.py
+++ b/deploy/hps/sdk/pipelines/OCR/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py b/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py
index 7e231f0e92..f9215cf979 100644
--- a/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py
+++ b/deploy/hps/sdk/pipelines/OCR/server/model_repo/ocr/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
@@ -95,6 +113,7 @@ def run(self, input, log_id):
text_det_box_thresh=input.textDetBoxThresh,
text_det_unclip_ratio=input.textDetUnclipRatio,
text_rec_score_thresh=input.textRecScoreThresh,
+ return_word_box=input.returnWordBox,
)
)
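
As a hedged illustration of the new `returnWordBox` field from the client side (assumptions: an OCR HPS server reachable at `localhost:8001`, and the Triton model name `ocr` taken from the `model_repo` directory above; the helpers are the same ones used by the bundled clients):

```python
import sys

from tritonclient import grpc as triton_grpc
from paddlex_hps_client import triton_request, utils

client = triton_grpc.InferenceServerClient("localhost:8001")
input_ = {
    "file": utils.prepare_input_file("sample.jpg"),  # hypothetical local image
    "fileType": 1,            # 1 = IMAGE, 0 = PDF (mirrors the server-side check above)
    "returnWordBox": True,    # forwarded to the pipeline as return_word_box
}
output = triton_request(client, "ocr", input_)  # "ocr" mirrors the model_repo directory name
if output["errorCode"] != 0:
    print(output["errorMsg"], file=sys.stderr)
    sys.exit(1)
print(output["result"])
```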
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/client/client.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/client/client.py
index ad8c8d6d27..282f562600 100755
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/client/client.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-chat/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-chat/1/model.py
index 9304676441..58eccc32e8 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-chat/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-chat/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-vector/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-vector/1/model.py
index be6244d8e4..7e59f2c826 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-vector/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-vector/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-visual/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-visual/1/model.py
index b68e2a5111..7b2568a7a1 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-visual/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv3-doc/server/model_repo/chatocr-visual/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/client/client.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/client/client.py
index fc0a102ba4..4fa79ae669 100755
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/client/client.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-chat/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-chat/1/model.py
index b59ec5713c..ebb48a0ecb 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-chat/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-chat/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-mllm/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-mllm/1/model.py
index 5d4112c7b6..764793fb82 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-mllm/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-mllm/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-vector/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-vector/1/model.py
index f1002bb33b..da7ade7049 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-vector/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-vector/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas
diff --git a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-visual/1/model.py b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-visual/1/model.py
index f73632e28b..de0a16bdee 100644
--- a/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-visual/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ChatOCRv4-doc/server/model_repo/chatocr-visual/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/PP-DocTranslation/client/client.py b/deploy/hps/sdk/pipelines/PP-DocTranslation/client/client.py
index 91c3be56f2..c9dd8809dc 100755
--- a/deploy/hps/sdk/pipelines/PP-DocTranslation/client/client.py
+++ b/deploy/hps/sdk/pipelines/PP-DocTranslation/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
from pathlib import Path
diff --git a/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-translate/1/model.py b/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-translate/1/model.py
index d57eee2687..6f25005fd9 100644
--- a/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-translate/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-translate/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas
diff --git a/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-visual/1/model.py b/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-visual/1/model.py
index 531c0598f7..361ce50332 100644
--- a/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-visual/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-DocTranslation/server/model_repo/doctrans-visual/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/client/client.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/client/client.py
index ab86b029f9..d58a79745b 100755
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/client/client.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-add/1/model.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-add/1/model.py
index ad97864951..864126b366 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-add/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-add/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from operator import attrgetter
from paddlex.inference.pipelines.components import IndexData
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-build/1/model.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-build/1/model.py
index 0d43756a0c..4511fe6aa9 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-build/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-build/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import uuid
from operator import attrgetter
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-remove/1/model.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-remove/1/model.py
index 45cd636480..f27c99446f 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-remove/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-index-remove/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex.inference.pipelines.components import IndexData
from paddlex_hps_server import schemas
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-infer/1/model.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-infer/1/model.py
index d284f987d0..b5de05589e 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-infer/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/model_repo/shitu-infer/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex.inference.pipelines.components import IndexData
@@ -16,7 +30,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
image_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(image_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
if input.indexKey is not None:
index_storage = self.context["index_storage"]
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/__init__.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/__init__.py
index e69de29bb2..b64cf01fdc 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/__init__.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/base_model.py b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/base_model.py
index c59bea85d4..cec0c8d555 100644
--- a/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/base_model.py
+++ b/deploy/hps/sdk/pipelines/PP-ShiTuV2/server/shared_mods/common/base_model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel
from paddlex_hps_server.storage import create_storage
diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/client/client.py b/deploy/hps/sdk/pipelines/PP-StructureV3/client/client.py
index 73e04bbabb..4d26692da1 100755
--- a/deploy/hps/sdk/pipelines/PP-StructureV3/client/client.py
+++ b/deploy/hps/sdk/pipelines/PP-StructureV3/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
from pathlib import Path
diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py b/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py
index 43cc81d221..5bb10962da 100644
--- a/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py
+++ b/deploy/hps/sdk/pipelines/PP-StructureV3/server/model_repo/layout-parsing/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
@@ -94,6 +112,7 @@ def run(self, input, log_id):
use_formula_recognition=input.useFormulaRecognition,
use_chart_recognition=input.useChartRecognition,
use_region_detection=input.useRegionDetection,
+ format_block_content=input.formatBlockContent,
layout_threshold=input.layoutThreshold,
layout_nms=input.layoutNms,
layout_unclip_ratio=input.layoutUnclipRatio,
diff --git a/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml b/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml
index 74b421fab3..a93952771a 100644
--- a/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml
+++ b/deploy/hps/sdk/pipelines/PP-StructureV3/server/pipeline_config.yaml
@@ -9,6 +9,7 @@ use_table_recognition: True
use_formula_recognition: True
use_chart_recognition: False
use_region_detection: True
+format_block_content: False
SubModules:
LayoutDetection:
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/client.py b/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/client.py
new file mode 100755
index 0000000000..b74e2db1ce
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/client.py
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import sys
+from pathlib import Path
+
+from paddlex_hps_client import triton_request, utils
+from tritonclient import grpc as triton_grpc
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--file", type=str, required=True)
+ parser.add_argument("--file-type", type=int, choices=[0, 1])
+ parser.add_argument("--no-visualization", action="store_true")
+ parser.add_argument("--url", type=str, default="localhost:8001")
+
+ args = parser.parse_args()
+
+ client = triton_grpc.InferenceServerClient(args.url)
+ input_ = {"file": utils.prepare_input_file(args.file)}
+ if args.file_type is not None:
+ input_["fileType"] = args.file_type
+ if args.no_visualization:
+ input_["visualize"] = False
+ output = triton_request(client, "layout-parsing", input_)
+ if output["errorCode"] != 0:
+ print(f"Error code: {output['errorCode']}", file=sys.stderr)
+ print(f"Error message: {output['errorMsg']}", file=sys.stderr)
+ sys.exit(1)
+ result = output["result"]
+ for i, res in enumerate(result["layoutParsingResults"]):
+ print(res["prunedResult"])
+ md_dir = Path(f"markdown_{i}")
+ md_dir.mkdir(exist_ok=True)
+ (md_dir / "doc.md").write_text(res["markdown"]["text"])
+ for img_path, img in res["markdown"]["images"].items():
+ img_path = md_dir / img_path
+ img_path.parent.mkdir(parents=True, exist_ok=True)
+ utils.save_output_file(img, img_path)
+ print(f"Markdown document saved at {md_dir / 'doc.md'}")
+ for img_name, img in res["outputImages"].items():
+ img_path = f"{img_name}_{i}.jpg"
+ Path(img_path).parent.mkdir(exist_ok=True)
+ utils.save_output_file(img, img_path)
+ print(f"Output image saved at {img_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/requirements.txt b/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/requirements.txt
new file mode 100644
index 0000000000..e505ca1c6b
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/client/requirements.txt
@@ -0,0 +1,3 @@
+# paddlex-hps-client
+protobuf == 3.19.6
+tritonclient [grpc] == 2.15
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py
new file mode 100644
index 0000000000..75c96504fc
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/1/model.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from typing import Any, Dict, Final, List, Tuple
+
+from paddlex_hps_server import (
+ BaseTritonPythonModel,
+ app_common,
+ protocol,
+ schemas,
+ utils,
+)
+from paddlex_hps_server.storage import SupportsGetURL, create_storage
+
+_DEFAULT_MAX_NUM_INPUT_IMGS: Final[int] = 10
+_DEFAULT_MAX_OUTPUT_IMG_SIZE: Final[Tuple[int, int]] = (2000, 2000)
+
+
+class _SequentialExecutor(object):
+ def map(self, fn, *iterables):
+ return map(fn, *iterables)
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exc_type, exc_value, traceback):
+ pass
+
+
+class TritonPythonModel(BaseTritonPythonModel):
+ def initialize(self, args):
+ super().initialize(args)
+ self.context = {}
+ self.context["file_storage"] = None
+ self.context["return_img_urls"] = False
+ self.context["max_num_input_imgs"] = _DEFAULT_MAX_NUM_INPUT_IMGS
+ self.context["max_output_img_size"] = _DEFAULT_MAX_OUTPUT_IMG_SIZE
+ if self.app_config.extra:
+ if "file_storage" in self.app_config.extra:
+ self.context["file_storage"] = create_storage(
+ self.app_config.extra["file_storage"]
+ )
+ if "return_img_urls" in self.app_config.extra:
+ self.context["return_img_urls"] = self.app_config.extra[
+ "return_img_urls"
+ ]
+ if "max_num_input_imgs" in self.app_config.extra:
+ self.context["max_num_input_imgs"] = self.app_config.extra[
+ "max_num_input_imgs"
+ ]
+ if "max_output_img_size" in self.app_config.extra:
+ self.context["max_output_img_size"] = self.app_config.extra[
+ "max_output_img_size"
+ ]
+ if self.context["return_img_urls"]:
+ file_storage = self.context["file_storage"]
+ if not file_storage:
+ raise ValueError(
+ "The file storage must be properly configured when URLs need to be returned."
+ )
+ if not isinstance(file_storage, SupportsGetURL):
+ raise TypeError(f"{type(file_storage)} does not support getting URLs.")
+
+ def get_input_model_type(self):
+ return schemas.paddleocr_vl.InferRequest
+
+ def get_result_model_type(self):
+ return schemas.paddleocr_vl.InferResult
+
+ def run(self, input, log_id):
+ return self.run_batch([input], [log_id])
+
+ def run_batch(self, inputs, log_ids, batch_id):
+ result_or_output_dic = {}
+
+ input_groups = self._group_inputs(inputs)
+
+ max_group_size = max(map(len, input_groups))
+ if max_group_size > 1:
+ executor = ThreadPoolExecutor(max_workers=max_group_size)
+ else:
+ executor = _SequentialExecutor()
+
+ with executor:
+ for input_group in input_groups:
+ input_ids_g = list(map(itemgetter(0), input_group))
+ inputs_g = list(map(itemgetter(1), input_group))
+
+ log_ids_g = [log_ids[i] for i in input_ids_g]
+
+ ret = executor.map(self._preprocess, inputs_g, log_ids_g)
+ ind_img_lsts, ind_data_info_lst, ind_visualize_enabled_lst = [], [], []
+ for i, item in enumerate(ret):
+ if isinstance(item, tuple):
+ assert len(item) == 3, len(item)
+ ind_img_lsts.append(item[0])
+ ind_data_info_lst.append(item[1])
+ ind_visualize_enabled_lst.append(item[2])
+ else:
+ input_id = input_ids_g[i]
+ result_or_output_dic[input_id] = item
+
+ if len(ind_img_lsts):
+ images = [img for item in ind_img_lsts for img in item]
+ preds = list(
+ self.pipeline(
+ images,
+ use_doc_orientation_classify=inputs_g[
+ 0
+ ].useDocOrientationClassify,
+ use_doc_unwarping=inputs_g[0].useDocUnwarping,
+ use_layout_detection=inputs_g[0].useLayoutDetection,
+ use_chart_recognition=inputs_g[0].useChartRecognition,
+ layout_threshold=inputs_g[0].layoutThreshold,
+ layout_nms=inputs_g[0].layoutNms,
+ layout_unclip_ratio=inputs_g[0].layoutUnclipRatio,
+ layout_merge_bboxes_mode=inputs_g[0].layoutMergeBboxesMode,
+ prompt_label=inputs_g[0].promptLabel,
+ format_block_content=inputs_g[0].formatBlockContent,
+ repetition_penalty=inputs_g[0].repetitionPenalty,
+ temperature=inputs_g[0].temperature,
+ top_p=inputs_g[0].topP,
+ min_pixels=inputs_g[0].minPixels,
+ max_pixels=inputs_g[0].maxPixels,
+ )
+ )
+
+ if len(preds) != len(images):
+ raise RuntimeError(
+ f"The number of predictions ({len(preds)}) is not the same as the number of input images ({len(images)})."
+ )
+
+ start_idx = 0
+ ind_preds = []
+ for item in ind_img_lsts:
+ ind_preds.append(preds[start_idx : start_idx + len(item)])
+ start_idx += len(item)
+
+ for i, result in zip(
+ input_ids_g,
+ executor.map(
+ self._postprocess,
+ ind_img_lsts,
+ ind_data_info_lst,
+ ind_visualize_enabled_lst,
+ ind_preds,
+ log_ids_g,
+ inputs_g,
+ ),
+ ):
+ result_or_output_dic[i] = result
+
+ assert len(result_or_output_dic) == len(
+ inputs
+ ), f"Expected {len(inputs)} results or outputs, but got {len(result_or_output_dic)}"
+
+ return [result_or_output_dic[i] for i in range(len(inputs))]
+
+ def _group_inputs(self, inputs):
+ def _hash(input):
+ return hash(
+ (
+ input.useDocOrientationClassify,
+ input.useDocUnwarping,
+ input.useLayoutDetection,
+ input.useChartRecognition,
+ input.layoutThreshold,
+ input.layoutNms,
+ input.layoutUnclipRatio,
+ input.layoutMergeBboxesMode,
+ input.promptLabel,
+ input.formatBlockContent,
+ input.repetitionPenalty,
+ input.temperature,
+ input.topP,
+ input.minPixels,
+ input.maxPixels,
+ )
+ )
+
+ groups = {}
+ for i, inp in enumerate(inputs):
+ group_key = _hash(inp)
+ if group_key not in groups:
+ groups[group_key] = []
+ groups[group_key].append((i, inp))
+
+ return list(groups.values())
+
+ def _preprocess(self, input, log_id):
+ if input.fileType is None:
+ if utils.is_url(input.file):
+ maybe_file_type = utils.infer_file_type(input.file)
+ if maybe_file_type is None or not (
+ maybe_file_type == "PDF" or maybe_file_type == "IMAGE"
+ ):
+ return protocol.create_aistudio_output_without_result(
+ 422,
+ "Unsupported file type",
+ log_id=log_id,
+ )
+ file_type = maybe_file_type
+ else:
+ return protocol.create_aistudio_output_without_result(
+ 422,
+ "File type cannot be determined",
+ log_id=log_id,
+ )
+ else:
+ file_type = "PDF" if input.fileType == 0 else "IMAGE"
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
+
+ file_bytes = utils.get_raw_bytes(input.file)
+ images, data_info = utils.file_to_images(
+ file_bytes,
+ file_type,
+ max_num_imgs=self.context["max_num_input_imgs"],
+ )
+
+ return images, data_info, visualize_enabled
+
+ def _postprocess(self, images, data_info, visualize_enabled, preds, log_id, input):
+ layout_parsing_results: List[Dict[str, Any]] = []
+ for i, (img, item) in enumerate(zip(images, preds)):
+ pruned_res = app_common.prune_result(item.json["res"])
+ # XXX
+ md_data = item._to_markdown(
+ pretty=input.prettifyMarkdown,
+ show_formula_number=input.showFormulaNumber,
+ )
+ md_text = md_data["markdown_texts"]
+ md_imgs = app_common.postprocess_images(
+ md_data["markdown_images"],
+ log_id,
+ filename_template=f"markdown_{i}/{{key}}",
+ file_storage=self.context["file_storage"],
+ return_urls=self.context["return_img_urls"],
+ max_img_size=self.context["max_output_img_size"],
+ )
+ if visualize_enabled:
+ imgs = {
+ "input_img": img,
+ **item.img,
+ }
+ imgs = app_common.postprocess_images(
+ imgs,
+ log_id,
+ filename_template=f"{{key}}_{i}.jpg",
+ file_storage=self.context["file_storage"],
+ return_urls=self.context["return_img_urls"],
+ max_img_size=self.context["max_output_img_size"],
+ )
+ else:
+ imgs = {}
+ layout_parsing_results.append(
+ dict(
+ prunedResult=pruned_res,
+ markdown=dict(
+ text=md_text,
+ images=md_imgs,
+ ),
+ outputImages=(
+ {k: v for k, v in imgs.items() if k != "input_img"}
+ if imgs
+ else None
+ ),
+ inputImage=imgs.get("input_img"),
+ )
+ )
+
+ return schemas.paddleocr_vl.InferResult(
+ layoutParsingResults=layout_parsing_results,
+ dataInfo=data_info,
+ )
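
To make the batching strategy in `run_batch` above easier to follow, here is a standalone sketch of the grouping idea, with hypothetical dictionaries standing in for request objects: indices that share a parameter signature end up in one group and therefore in one pipeline call.

```python
# Hypothetical stand-in for _group_inputs: indices are grouped by a hashable
# signature of the inference parameters, so identical settings share one pipeline call.
def group_inputs(inputs, signature):
    groups = {}
    for i, inp in enumerate(inputs):
        groups.setdefault(signature(inp), []).append((i, inp))
    return list(groups.values())

requests = [
    {"useLayoutDetection": True, "topP": 0.9},
    {"useLayoutDetection": True, "topP": 0.9},
    {"useLayoutDetection": False, "topP": 0.9},
]
groups = group_inputs(requests, lambda r: (r["useLayoutDetection"], r["topP"]))
print([[i for i, _ in g] for g in groups])  # [[0, 1], [2]] -> the first two requests are batched together
```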
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_cpu.pbtxt b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_cpu.pbtxt
new file mode 100644
index 0000000000..c5209123c1
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_cpu.pbtxt
@@ -0,0 +1,23 @@
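+# Triton config for the layout-parsing Python-backend model: single CPU instance,
+# dynamic batching, string-serialized request/response tensors.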
+backend: "python"
+max_batch_size: 8
+input [
+ {
+ name: "input"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "output"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+instance_group [
+ {
+ count: 1
+ kind: KIND_CPU
+ }
+]
+dynamic_batching { }
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_gpu.pbtxt b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_gpu.pbtxt
new file mode 100644
index 0000000000..39769a2182
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/model_repo/layout-parsing/config_gpu.pbtxt
@@ -0,0 +1,24 @@
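+# Triton config for the layout-parsing Python-backend model: single instance
+# pinned to GPU 0, with dynamic batching enabled.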
+backend: "python"
+max_batch_size: 8
+input [
+ {
+ name: "input"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+output [
+ {
+ name: "output"
+ data_type: TYPE_STRING
+ dims: [ -1 ]
+ }
+]
+instance_group [
+ {
+ count: 1
+ kind: KIND_GPU
+ gpus: [ 0 ]
+ }
+]
+dynamic_batching { }
diff --git a/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml
new file mode 100644
index 0000000000..43335a9def
--- /dev/null
+++ b/deploy/hps/sdk/pipelines/PaddleOCR-VL/server/pipeline_config.yaml
@@ -0,0 +1,96 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: False
+use_layout_detection: True
+use_chart_recognition: False
+format_block_content: False
+
+SubModules:
+ LayoutDetection:
+ module_name: layout_detection
+ model_name: PP-DocLayoutV2
+ model_dir: null
+ batch_size: 8
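+    # Per-category detection score thresholds, keyed by PP-DocLayoutV2 class id.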
+ threshold:
+ 0: 0.5 # abstract
+ 1: 0.5 # algorithm
+ 2: 0.5 # aside_text
+ 3: 0.5 # chart
+ 4: 0.5 # content
+      5: 0.4 # display_formula
+ 6: 0.4 # doc_title
+ 7: 0.5 # figure_title
+ 8: 0.5 # footer
+ 9: 0.5 # footer
+ 10: 0.5 # footnote
+ 11: 0.5 # formula_number
+ 12: 0.5 # header
+ 13: 0.5 # header
+ 14: 0.5 # image
+      15: 0.4 # inline_formula
+ 16: 0.5 # number
+ 17: 0.4 # paragraph_title
+ 18: 0.5 # reference
+ 19: 0.5 # reference_content
+ 20: 0.45 # seal
+ 21: 0.5 # table
+ 22: 0.4 # text
+ 23: 0.4 # text
+ 24: 0.5 # vision_footnote
+ layout_nms: True
+ layout_unclip_ratio: [1.0, 1.0]
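+    # Per-category merging mode for overlapping layout boxes, keyed by class id.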
+ layout_merge_bboxes_mode:
+ 0: "union" # abstract
+ 1: "union" # algorithm
+ 2: "union" # aside_text
+ 3: "large" # chart
+ 4: "union" # content
+ 5: "large" # display_formula
+ 6: "large" # doc_title
+ 7: "union" # figure_title
+ 8: "union" # footer
+ 9: "union" # footer
+ 10: "union" # footnote
+ 11: "union" # formula_number
+ 12: "union" # header
+ 13: "union" # header
+ 14: "union" # image
+ 15: "large" # inline_formula
+ 16: "union" # number
+ 17: "large" # paragraph_title
+ 18: "union" # reference
+ 19: "union" # reference_content
+ 20: "union" # seal
+ 21: "union" # table
+ 22: "union" # text
+ 23: "union" # text
+ 24: "union" # vision_footnote
+ VLRecognition:
+ module_name: vl_recognition
+ model_name: PaddleOCR-VL-0.9B
+ model_dir: null
+ batch_size: 2048
+ genai_config:
+ backend: native
+
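+# The document-preprocessing sub-pipeline below is only invoked when
+# use_doc_preprocessor is set to True above (it is disabled by default here).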
+SubPipelines:
+ DocPreprocessor:
+ pipeline_name: doc_preprocessor
+ batch_size: 8
+ use_doc_orientation_classify: True
+ use_doc_unwarping: True
+ SubModules:
+ DocOrientationClassify:
+ module_name: doc_text_orientation
+ model_name: PP-LCNet_x1_0_doc_ori
+ model_dir: null
+ batch_size: 8
+ DocUnwarping:
+ module_name: image_unwarping
+ model_name: UVDoc
+ model_dir: null
diff --git a/deploy/hps/sdk/pipelines/anomaly_detection/client/client.py b/deploy/hps/sdk/pipelines/anomaly_detection/client/client.py
index 429929068e..3f597400ab 100755
--- a/deploy/hps/sdk/pipelines/anomaly_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/anomaly_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/anomaly_detection/server/model_repo/anomaly-detection/1/model.py b/deploy/hps/sdk/pipelines/anomaly_detection/server/model_repo/anomaly-detection/1/model.py
index f00933d540..26bb5f0fdf 100644
--- a/deploy/hps/sdk/pipelines/anomaly_detection/server/model_repo/anomaly-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/anomaly_detection/server/model_repo/anomaly-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -17,7 +31,11 @@ def run(self, input, log_id):
pred = result["pred"][0].tolist()
size = [len(pred), len(pred[0])]
label_map = [item for sublist in pred for item in sublist]
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
if visualize_enabled:
output_image_base64 = utils.base64_encode(
diff --git a/deploy/hps/sdk/pipelines/doc_preprocessor/client/client.py b/deploy/hps/sdk/pipelines/doc_preprocessor/client/client.py
index 5abc3f9835..8300605e7a 100755
--- a/deploy/hps/sdk/pipelines/doc_preprocessor/client/client.py
+++ b/deploy/hps/sdk/pipelines/doc_preprocessor/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/doc_preprocessor/server/model_repo/document-preprocessing/1/model.py b/deploy/hps/sdk/pipelines/doc_preprocessor/server/model_repo/document-preprocessing/1/model.py
index 910abe6f0a..629dd34e1a 100644
--- a/deploy/hps/sdk/pipelines/doc_preprocessor/server/model_repo/document-preprocessing/1/model.py
+++ b/deploy/hps/sdk/pipelines/doc_preprocessor/server/model_repo/document-preprocessing/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -89,7 +103,11 @@ def run(self, input, log_id):
use_doc_unwarping=input.useDocUnwarping,
)
)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
doc_pp_results: List[Dict[str, Any]] = []
for i, (img, item) in enumerate(zip(images, result)):
diff --git a/deploy/hps/sdk/pipelines/doc_understanding/client/client.py b/deploy/hps/sdk/pipelines/doc_understanding/client/client.py
old mode 100644
new mode 100755
index e9c2ccb3f1..f9b32bbe45
--- a/deploy/hps/sdk/pipelines/doc_understanding/client/client.py
+++ b/deploy/hps/sdk/pipelines/doc_understanding/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/doc_understanding/server/model_repo/document-understanding/1/model.py b/deploy/hps/sdk/pipelines/doc_understanding/server/model_repo/document-understanding/1/model.py
index e2e10b48e5..26ccc6dcf9 100644
--- a/deploy/hps/sdk/pipelines/doc_understanding/server/model_repo/document-understanding/1/model.py
+++ b/deploy/hps/sdk/pipelines/doc_understanding/server/model_repo/document-understanding/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import math
import time
from typing import List
diff --git a/deploy/hps/sdk/pipelines/face_recognition/client/client.py b/deploy/hps/sdk/pipelines/face_recognition/client/client.py
index e765054ed7..09b367d45e 100755
--- a/deploy/hps/sdk/pipelines/face_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-add/1/model.py b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-add/1/model.py
index 07341662c0..cab0244752 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-add/1/model.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-add/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from operator import attrgetter
from paddlex.inference.pipelines.components import IndexData
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-build/1/model.py b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-build/1/model.py
index e416065fc1..6d0458907c 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-build/1/model.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-build/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import uuid
from operator import attrgetter
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-remove/1/model.py b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-remove/1/model.py
index 04ccc29121..da6ebb86d8 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-remove/1/model.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-index-remove/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex.inference.pipelines.components import IndexData
from paddlex_hps_server import schemas
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-infer/1/model.py b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-infer/1/model.py
index fc3ebbe21a..81ddcb1255 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-infer/1/model.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/model_repo/face-recognition-infer/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex.inference.pipelines.components import IndexData
@@ -23,7 +37,11 @@ def run(self, input, log_id):
index_data = IndexData.from_bytes(index_data_bytes)
else:
index_data = None
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline(
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/__init__.py b/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/__init__.py
index e69de29bb2..b64cf01fdc 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/__init__.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/base_model.py b/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/base_model.py
index fe41d3cddb..17c80dfd10 100644
--- a/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/base_model.py
+++ b/deploy/hps/sdk/pipelines/face_recognition/server/shared_mods/common/base_model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel
from paddlex_hps_server.storage import create_storage
diff --git a/deploy/hps/sdk/pipelines/formula_recognition/client/client.py b/deploy/hps/sdk/pipelines/formula_recognition/client/client.py
index b01af7ad7c..80857307f0 100755
--- a/deploy/hps/sdk/pipelines/formula_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/formula_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/formula_recognition/server/model_repo/formula-recognition/1/model.py b/deploy/hps/sdk/pipelines/formula_recognition/server/model_repo/formula-recognition/1/model.py
index 3350717687..7af06c405a 100644
--- a/deploy/hps/sdk/pipelines/formula_recognition/server/model_repo/formula-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/formula_recognition/server/model_repo/formula-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/human_keypoint_detection/client/client.py b/deploy/hps/sdk/pipelines/human_keypoint_detection/client/client.py
index c9818f93ba..81fb9ea45a 100755
--- a/deploy/hps/sdk/pipelines/human_keypoint_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/human_keypoint_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/human_keypoint_detection/server/model_repo/human-keypoint-detection/1/model.py b/deploy/hps/sdk/pipelines/human_keypoint_detection/server/model_repo/human-keypoint-detection/1/model.py
index 56847fbb20..d179900368 100644
--- a/deploy/hps/sdk/pipelines/human_keypoint_detection/server/model_repo/human-keypoint-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/human_keypoint_detection/server/model_repo/human-keypoint-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline.predict(
diff --git a/deploy/hps/sdk/pipelines/image_classification/client/client.py b/deploy/hps/sdk/pipelines/image_classification/client/client.py
index 384158c790..6f88d68f43 100755
--- a/deploy/hps/sdk/pipelines/image_classification/client/client.py
+++ b/deploy/hps/sdk/pipelines/image_classification/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/image_classification/server/model_repo/image-classification/1/model.py b/deploy/hps/sdk/pipelines/image_classification/server/model_repo/image-classification/1/model.py
index 9ab12250a8..ded874ee89 100644
--- a/deploy/hps/sdk/pipelines/image_classification/server/model_repo/image-classification/1/model.py
+++ b/deploy/hps/sdk/pipelines/image_classification/server/model_repo/image-classification/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, topk=input.topk))[0]
if "label_names" in result:
diff --git a/deploy/hps/sdk/pipelines/image_multilabel_classification/client/client.py b/deploy/hps/sdk/pipelines/image_multilabel_classification/client/client.py
index a31ba80483..9eb3b8edc4 100755
--- a/deploy/hps/sdk/pipelines/image_multilabel_classification/client/client.py
+++ b/deploy/hps/sdk/pipelines/image_multilabel_classification/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/image_multilabel_classification/server/model_repo/multilabel-image-classification/1/model.py b/deploy/hps/sdk/pipelines/image_multilabel_classification/server/model_repo/multilabel-image-classification/1/model.py
index 981ce2667c..e22d59fe83 100644
--- a/deploy/hps/sdk/pipelines/image_multilabel_classification/server/model_repo/multilabel-image-classification/1/model.py
+++ b/deploy/hps/sdk/pipelines/image_multilabel_classification/server/model_repo/multilabel-image-classification/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, threshold=input.threshold))[0]
diff --git a/deploy/hps/sdk/pipelines/instance_segmentation/client/client.py b/deploy/hps/sdk/pipelines/instance_segmentation/client/client.py
index f9ca703d01..e2af0a488f 100755
--- a/deploy/hps/sdk/pipelines/instance_segmentation/client/client.py
+++ b/deploy/hps/sdk/pipelines/instance_segmentation/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/instance_segmentation/server/model_repo/instance-segmentation/1/model.py b/deploy/hps/sdk/pipelines/instance_segmentation/server/model_repo/instance-segmentation/1/model.py
index 5c1d5f41f9..6fed8fb35f 100644
--- a/deploy/hps/sdk/pipelines/instance_segmentation/server/model_repo/instance-segmentation/1/model.py
+++ b/deploy/hps/sdk/pipelines/instance_segmentation/server/model_repo/instance-segmentation/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
import numpy as np
@@ -20,7 +34,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, threshold=input.threshold))[0]
diff --git a/deploy/hps/sdk/pipelines/layout_parsing/client/client.py b/deploy/hps/sdk/pipelines/layout_parsing/client/client.py
index 316568165c..834cbbf65d 100755
--- a/deploy/hps/sdk/pipelines/layout_parsing/client/client.py
+++ b/deploy/hps/sdk/pipelines/layout_parsing/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/layout_parsing/server/model_repo/layout-parsing/1/model.py b/deploy/hps/sdk/pipelines/layout_parsing/server/model_repo/layout-parsing/1/model.py
index 982dcf5990..b4ba08c961 100644
--- a/deploy/hps/sdk/pipelines/layout_parsing/server/model_repo/layout-parsing/1/model.py
+++ b/deploy/hps/sdk/pipelines/layout_parsing/server/model_repo/layout-parsing/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/multilingual_speech_recognition/client/client.py b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/client/client.py
index 2d013e65f3..6cad0daaa7 100755
--- a/deploy/hps/sdk/pipelines/multilingual_speech_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/multilingual_speech_recognition/server/model_repo/multilingual-speech-recognition/1/model.py b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/server/model_repo/multilingual-speech-recognition/1/model.py
index 28c1bec10e..d65620e460 100644
--- a/deploy/hps/sdk/pipelines/multilingual_speech_recognition/server/model_repo/multilingual-speech-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/multilingual_speech_recognition/server/model_repo/multilingual-speech-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
from typing import Any, Dict, List
diff --git a/deploy/hps/sdk/pipelines/object_detection/client/client.py b/deploy/hps/sdk/pipelines/object_detection/client/client.py
index c754c38ba3..739cb3c431 100755
--- a/deploy/hps/sdk/pipelines/object_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/object_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/object_detection/server/model_repo/object-detection/1/model.py b/deploy/hps/sdk/pipelines/object_detection/server/model_repo/object-detection/1/model.py
index cc7d3c7d5f..a825ca1bc7 100644
--- a/deploy/hps/sdk/pipelines/object_detection/server/model_repo/object-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/object_detection/server/model_repo/object-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -15,7 +29,11 @@ def run(self, input, log_id):
image = utils.image_bytes_to_array(file_bytes)
result = list(self.pipeline.predict(image, threshold=input.threshold))[0]
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
objects: List[Dict[str, Any]] = []
for obj in result["boxes"]:
diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_detection/client/client.py b/deploy/hps/sdk/pipelines/open_vocabulary_detection/client/client.py
index 2a8719dc4a..20bb9da392 100755
--- a/deploy/hps/sdk/pipelines/open_vocabulary_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/open_vocabulary_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_detection/server/model_repo/open-vocabulary-detection/1/model.py b/deploy/hps/sdk/pipelines/open_vocabulary_detection/server/model_repo/open-vocabulary-detection/1/model.py
index d185439ef5..b0aad51ff6 100644
--- a/deploy/hps/sdk/pipelines/open_vocabulary_detection/server/model_repo/open-vocabulary-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/open_vocabulary_detection/server/model_repo/open-vocabulary-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline.predict(
diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/client/client.py b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/client/client.py
index d747bdb78f..9ca03e1ba1 100755
--- a/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/client/client.py
+++ b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/server/model_repo/open-vocabulary-segmentation/1/model.py b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/server/model_repo/open-vocabulary-segmentation/1/model.py
index 004d391180..cfa5c78f44 100644
--- a/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/server/model_repo/open-vocabulary-segmentation/1/model.py
+++ b/deploy/hps/sdk/pipelines/open_vocabulary_segmentation/server/model_repo/open-vocabulary-segmentation/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import numpy as np
import pycocotools.mask as mask_util
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -18,7 +32,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline.predict(
diff --git a/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/client/client.py b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/client/client.py
index c5f987f959..52f4c2c875 100755
--- a/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/server/model_repo/pedestrian-attribute-recognition/1/model.py b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/server/model_repo/pedestrian-attribute-recognition/1/model.py
index ca6ad75719..c6066d3977 100644
--- a/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/server/model_repo/pedestrian-attribute-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/pedestrian_attribute_recognition/server/model_repo/pedestrian-attribute-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline.predict(
diff --git a/deploy/hps/sdk/pipelines/rotated_object_detection/client/client.py b/deploy/hps/sdk/pipelines/rotated_object_detection/client/client.py
index 261bb77110..94bd32a84d 100755
--- a/deploy/hps/sdk/pipelines/rotated_object_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/rotated_object_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/rotated_object_detection/server/model_repo/rotated-object-detection/1/model.py b/deploy/hps/sdk/pipelines/rotated_object_detection/server/model_repo/rotated-object-detection/1/model.py
index fcc273bb75..0ff1b5a39b 100644
--- a/deploy/hps/sdk/pipelines/rotated_object_detection/server/model_repo/rotated-object-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/rotated_object_detection/server/model_repo/rotated-object-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, threshold=input.threshold))[0]
diff --git a/deploy/hps/sdk/pipelines/seal_recognition/client/client.py b/deploy/hps/sdk/pipelines/seal_recognition/client/client.py
index d0c8715fba..528ca64055 100755
--- a/deploy/hps/sdk/pipelines/seal_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/seal_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/seal_recognition/server/model_repo/seal-recognition/1/model.py b/deploy/hps/sdk/pipelines/seal_recognition/server/model_repo/seal-recognition/1/model.py
index f44c76e6b2..4885f6a68c 100644
--- a/deploy/hps/sdk/pipelines/seal_recognition/server/model_repo/seal-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/seal_recognition/server/model_repo/seal-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/semantic_segmentation/client/client.py b/deploy/hps/sdk/pipelines/semantic_segmentation/client/client.py
index f1ff3cfb30..ee63629a17 100755
--- a/deploy/hps/sdk/pipelines/semantic_segmentation/client/client.py
+++ b/deploy/hps/sdk/pipelines/semantic_segmentation/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/semantic_segmentation/server/model_repo/semantic-segmentation/1/model.py b/deploy/hps/sdk/pipelines/semantic_segmentation/server/model_repo/semantic-segmentation/1/model.py
index fb9be9e406..290a150808 100644
--- a/deploy/hps/sdk/pipelines/semantic_segmentation/server/model_repo/semantic-segmentation/1/model.py
+++ b/deploy/hps/sdk/pipelines/semantic_segmentation/server/model_repo/semantic-segmentation/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -11,7 +25,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, target_size=input.targetSize))[0]
diff --git a/deploy/hps/sdk/pipelines/small_object_detection/client/client.py b/deploy/hps/sdk/pipelines/small_object_detection/client/client.py
index 2f2a97a18e..3f97632ed0 100755
--- a/deploy/hps/sdk/pipelines/small_object_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/small_object_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/small_object_detection/server/model_repo/small-object-detection/1/model.py b/deploy/hps/sdk/pipelines/small_object_detection/server/model_repo/small-object-detection/1/model.py
index 1f232a2883..d3117baefc 100644
--- a/deploy/hps/sdk/pipelines/small_object_detection/server/model_repo/small-object-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/small_object_detection/server/model_repo/small-object-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(image, threshold=input.threshold))[0]
diff --git a/deploy/hps/sdk/pipelines/table_recognition/client/client.py b/deploy/hps/sdk/pipelines/table_recognition/client/client.py
index 90760e33ba..5c75e267cd 100755
--- a/deploy/hps/sdk/pipelines/table_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/table_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/table_recognition/server/model_repo/table-recognition/1/model.py b/deploy/hps/sdk/pipelines/table_recognition/server/model_repo/table-recognition/1/model.py
index 11ef57e896..c1624046bb 100644
--- a/deploy/hps/sdk/pipelines/table_recognition/server/model_repo/table-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/table_recognition/server/model_repo/table-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/table_recognition_v2/client/client.py b/deploy/hps/sdk/pipelines/table_recognition_v2/client/client.py
index 90760e33ba..5c75e267cd 100755
--- a/deploy/hps/sdk/pipelines/table_recognition_v2/client/client.py
+++ b/deploy/hps/sdk/pipelines/table_recognition_v2/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/table_recognition_v2/server/model_repo/table-recognition/1/model.py b/deploy/hps/sdk/pipelines/table_recognition_v2/server/model_repo/table-recognition/1/model.py
index 05054447b9..508981080b 100644
--- a/deploy/hps/sdk/pipelines/table_recognition_v2/server/model_repo/table-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/table_recognition_v2/server/model_repo/table-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, Final, List, Tuple
from paddlex_hps_server import (
@@ -74,7 +88,11 @@ def run(self, input, log_id):
)
else:
file_type = "PDF" if input.fileType == 0 else "IMAGE"
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
file_bytes = utils.get_raw_bytes(input.file)
images, data_info = utils.file_to_images(
diff --git a/deploy/hps/sdk/pipelines/ts_anomaly_detection/client/client.py b/deploy/hps/sdk/pipelines/ts_anomaly_detection/client/client.py
index 3cae4c1dec..33179498dd 100755
--- a/deploy/hps/sdk/pipelines/ts_anomaly_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/ts_anomaly_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/ts_anomaly_detection/server/model_repo/time-series-anomaly-detection/1/model.py b/deploy/hps/sdk/pipelines/ts_anomaly_detection/server/model_repo/time-series-anomaly-detection/1/model.py
index 1e866aeeea..992c047e57 100644
--- a/deploy/hps/sdk/pipelines/ts_anomaly_detection/server/model_repo/time-series-anomaly-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/ts_anomaly_detection/server/model_repo/time-series-anomaly-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -11,7 +25,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.csv)
df = utils.csv_bytes_to_data_frame(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(df))[0]
diff --git a/deploy/hps/sdk/pipelines/ts_classification/client/client.py b/deploy/hps/sdk/pipelines/ts_classification/client/client.py
index f63e71d609..ad3def1ccc 100755
--- a/deploy/hps/sdk/pipelines/ts_classification/client/client.py
+++ b/deploy/hps/sdk/pipelines/ts_classification/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/ts_classification/server/model_repo/time-series-classification/1/model.py b/deploy/hps/sdk/pipelines/ts_classification/server/model_repo/time-series-classification/1/model.py
index 397d11263a..67995aae6e 100644
--- a/deploy/hps/sdk/pipelines/ts_classification/server/model_repo/time-series-classification/1/model.py
+++ b/deploy/hps/sdk/pipelines/ts_classification/server/model_repo/time-series-classification/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -11,7 +25,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.csv)
df = utils.csv_bytes_to_data_frame(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(df))[0]
diff --git a/deploy/hps/sdk/pipelines/ts_forecast/client/client.py b/deploy/hps/sdk/pipelines/ts_forecast/client/client.py
index d724a36d49..16a30af691 100755
--- a/deploy/hps/sdk/pipelines/ts_forecast/client/client.py
+++ b/deploy/hps/sdk/pipelines/ts_forecast/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import sys
diff --git a/deploy/hps/sdk/pipelines/ts_forecast/server/model_repo/time-series-forecasting/1/model.py b/deploy/hps/sdk/pipelines/ts_forecast/server/model_repo/time-series-forecasting/1/model.py
index 1f854b4782..f63e3f3271 100644
--- a/deploy/hps/sdk/pipelines/ts_forecast/server/model_repo/time-series-forecasting/1/model.py
+++ b/deploy/hps/sdk/pipelines/ts_forecast/server/model_repo/time-series-forecasting/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -11,7 +25,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.csv)
df = utils.csv_bytes_to_data_frame(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(self.pipeline.predict(df))[0]
diff --git a/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/client/client.py b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/client/client.py
index 158e29eea9..e9fbd78c2c 100755
--- a/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/client/client.py
+++ b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/server/model_repo/vehicle-attribute-recognition/1/model.py b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/server/model_repo/vehicle-attribute-recognition/1/model.py
index 4fdf501c0c..e009c4af29 100644
--- a/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/server/model_repo/vehicle-attribute-recognition/1/model.py
+++ b/deploy/hps/sdk/pipelines/vehicle_attribute_recognition/server/model_repo/vehicle-attribute-recognition/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import Any, Dict, List
from paddlex_hps_server import BaseTritonPythonModel, schemas, utils
@@ -13,7 +27,11 @@ def get_result_model_type(self):
def run(self, input, log_id):
file_bytes = utils.get_raw_bytes(input.image)
image = utils.image_bytes_to_array(file_bytes)
- visualize_enabled = input.visualize if input.visualize is not None else self.app_config.visualize
+ visualize_enabled = (
+ input.visualize
+ if input.visualize is not None
+ else self.app_config.visualize
+ )
result = list(
self.pipeline.predict(
diff --git a/deploy/hps/sdk/pipelines/video_classification/client/client.py b/deploy/hps/sdk/pipelines/video_classification/client/client.py
index a995db1dd3..80eafa779b 100755
--- a/deploy/hps/sdk/pipelines/video_classification/client/client.py
+++ b/deploy/hps/sdk/pipelines/video_classification/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/video_classification/server/model_repo/video-classification/1/model.py b/deploy/hps/sdk/pipelines/video_classification/server/model_repo/video-classification/1/model.py
index 36640c7f90..4a3070ffa5 100644
--- a/deploy/hps/sdk/pipelines/video_classification/server/model_repo/video-classification/1/model.py
+++ b/deploy/hps/sdk/pipelines/video_classification/server/model_repo/video-classification/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
from typing import Any, Dict, List
diff --git a/deploy/hps/sdk/pipelines/video_detection/client/client.py b/deploy/hps/sdk/pipelines/video_detection/client/client.py
index 51abdc7b19..03fa42f330 100755
--- a/deploy/hps/sdk/pipelines/video_detection/client/client.py
+++ b/deploy/hps/sdk/pipelines/video_detection/client/client.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import pprint
import sys
diff --git a/deploy/hps/sdk/pipelines/video_detection/server/model_repo/video-detection/1/model.py b/deploy/hps/sdk/pipelines/video_detection/server/model_repo/video-detection/1/model.py
index b0555537e5..d045f21ce1 100644
--- a/deploy/hps/sdk/pipelines/video_detection/server/model_repo/video-detection/1/model.py
+++ b/deploy/hps/sdk/pipelines/video_detection/server/model_repo/video-detection/1/model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
from typing import Any, Dict, List
diff --git a/deploy/hps/sdk/scripts/assemble.py b/deploy/hps/sdk/scripts/assemble.py
index 359e7bdb33..bf40dab9d8 100755
--- a/deploy/hps/sdk/scripts/assemble.py
+++ b/deploy/hps/sdk/scripts/assemble.py
@@ -1,5 +1,19 @@
#!/usr/bin/env python
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import argparse
import json
import pathlib
diff --git a/deploy/hps/sdk/scripts/assemble.sh b/deploy/hps/sdk/scripts/assemble.sh
index 405bb0e24e..6c4257443c 100755
--- a/deploy/hps/sdk/scripts/assemble.sh
+++ b/deploy/hps/sdk/scripts/assemble.sh
@@ -8,5 +8,5 @@ docker run \
-v "$(pwd)":/workspace \
-w /workspace \
--rm \
- python:3.10@sha256:6ff000548a4fa34c1be02624836e75e212d4ead8227b4d4381c3ae998933a922 \
+ python:3.10 \
/bin/bash scripts/_assemble.sh "$@"
diff --git a/deploy/hps/sdk/versions.json b/deploy/hps/sdk/versions.json
index 3e95174106..08820eeef5 100644
--- a/deploy/hps/sdk/versions.json
+++ b/deploy/hps/sdk/versions.json
@@ -10,16 +10,17 @@
"image_multilabel_classification": "0.1.0",
"instance_segmentation": "0.1.0",
"layout_parsing": "0.3.0",
- "PP-StructureV3": "0.3.0",
+ "PP-StructureV3": "0.3.1",
"multilingual_speech_recognition": "0.2.0",
"object_detection": "0.1.0",
- "OCR": "0.2.1",
+ "OCR": "0.2.2",
"open_vocabulary_detection": "0.1.0",
"open_vocabulary_segmentation": "0.1.0",
"pedestrian_attribute_recognition": "0.1.0",
"PP-ChatOCRv3-doc": "0.3.1",
"PP-ChatOCRv4-doc": "0.4.1",
"PP-DocTranslation": "0.1.1",
+ "PaddleOCR-VL": "0.1.0",
"PP-ShiTuV2": "0.1.0",
"rotated_object_detection": "0.1.0",
"seal_recognition": "0.2.1",
diff --git a/deploy/hps/server_env/Dockerfile b/deploy/hps/server_env/Dockerfile
index e7db00228d..1038a12d7f 100644
--- a/deploy/hps/server_env/Dockerfile
+++ b/deploy/hps/server_env/Dockerfile
@@ -33,9 +33,7 @@ ENV DEBIAN_FRONTEND=noninteractive
RUN mkdir /paddlex
-RUN apt-get update \
- && apt-get install -y --no-install-recommends software-properties-common \
- && add-apt-repository -y ppa:deadsnakes/ppa \
+RUN echo 'deb http://archive.ubuntu.com/ubuntu jammy main universe' > /etc/apt/sources.list.d/jammy-temp.list \
&& apt-get update \
&& apt-get install -y --no-install-recommends python3.10 python3.10-venv \
&& python3.10 -m venv /paddlex/py310 \
@@ -101,7 +99,8 @@ RUN --mount=type=bind,source=deploy/hps/server_env/requirements/${DEVICE_TYPE}.t
python -m pip install --requirement /tmp/requirements.txt --requirement /tmp/hpi_requirements.txt \
&& if [ "${ENV_TYPE}" = 'dev' ]; then \
python -m pip install --requirement /tmp/dev_requirements.txt; \
- fi
+ fi \
+ && python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
RUN --mount=type=bind,source=.,target=/tmp/PaddleX,rw \
python -m pip install --no-deps /tmp/PaddleX
diff --git a/deploy/hps/server_env/cpu_version.txt b/deploy/hps/server_env/cpu_version.txt
index 0f82685331..6678432209 100644
--- a/deploy/hps/server_env/cpu_version.txt
+++ b/deploy/hps/server_env/cpu_version.txt
@@ -1 +1 @@
-0.3.7
+0.3.8
diff --git a/deploy/hps/server_env/gpu_version.txt b/deploy/hps/server_env/gpu_version.txt
index 6678432209..940ac09aa6 100644
--- a/deploy/hps/server_env/gpu_version.txt
+++ b/deploy/hps/server_env/gpu_version.txt
@@ -1 +1 @@
-0.3.8
+0.3.9
diff --git a/deploy/hps/server_env/paddlex-hps-server/pyproject.toml b/deploy/hps/server_env/paddlex-hps-server/pyproject.toml
index f9709bf90b..422e3e19b6 100644
--- a/deploy/hps/server_env/paddlex-hps-server/pyproject.toml
+++ b/deploy/hps/server_env/paddlex-hps-server/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "paddlex-hps-server"
-version = "0.2.2"
+version = "0.3.0"
# `paddlex` is not included here
dependencies = [
"colorlog >= 6.9",
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/__init__.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/__init__.py
index e5741669c7..d6fa861a55 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/__init__.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/__init__.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from importlib import metadata as _metadata
from .base_model import BaseTritonPythonModel
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/app_common.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/app_common.py
index 649475fadd..14f699c3f1 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/app_common.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/app_common.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
from typing import Dict, Optional, Tuple, Union
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py
index a7735f9323..09dc40df23 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/base_model.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import json
import time
import uuid
@@ -66,14 +80,18 @@ def initialize(self, args):
logging.info("%s initialized successfully", self.id)
def execute(self, requests):
- responses = []
-
- for request in requests:
- log_id = protocol.generate_log_id()
- tokens = logging.set_context_vars(self.id, log_id)
-
- start_time = time.perf_counter()
- try:
+ batch_id = self._generate_batch_id()
+ tokens = logging.set_context_vars(self.id, batch_id)
+ logging.info("Received batch of size %s", len(requests))
+ start_time = time.perf_counter()
+
+ try:
+ inputs = {}
+ outputs = {}
+ log_ids = []
+ for i, request in enumerate(requests):
+ log_id = protocol.generate_log_id()
+ log_ids.append(log_id)
input_ = pb_utils.get_input_tensor_by_name(
request, constants.INPUT_NAME
)
@@ -81,39 +99,59 @@ def execute(self, requests):
input_model_type = self.get_input_model_type()
try:
input_ = protocol.parse_triton_input(input_, input_model_type)
+ inputs[i] = input_
except ValidationError as e:
- output = protocol.create_aistudio_output_without_result(422, str(e))
- else:
- try:
- result_or_output = self.run(input_, log_id)
- except Exception as e:
- logging.error("Unhandled exception", exc_info=e)
- output = protocol.create_aistudio_output_without_result(
- 500, "Internal server error", log_id=log_id
+ output = protocol.create_aistudio_output_without_result(
+ 422, str(e), log_id=log_id
+ )
+ outputs[i] = output
+
+ if inputs:
+ try:
+ result_or_output_lst = self.run_batch(
+ inputs.values(), [log_ids[i] for i in inputs.keys()], batch_id
+ )
+ except Exception as e:
+ logging.error("Unhandled exception", exc_info=e)
+ for i in inputs.keys():
+ outputs[i] = protocol.create_aistudio_output_without_result(
+ 500, "Internal server error", log_id=log_ids[i]
)
- else:
- result_model_type = self.get_result_model_type()
- if isinstance(result_or_output, result_model_type):
- output = protocol.create_aistudio_output_with_result(
- result_or_output, log_id=log_id
+ else:
+ result_model_type = self.get_result_model_type()
+                    for i, item in zip(inputs.keys(), result_or_output_lst):
+ if isinstance(item, result_model_type):
+ outputs[i] = protocol.create_aistudio_output_with_result(
+ item,
+ log_id=log_ids[i],
)
else:
- output = result_or_output
+ outputs[i] = item
+
+ assert len(outputs) == len(
+ requests
+ ), f"The number of outputs ({len(outputs)}) does not match the number of requests ({len(requests)})"
+
+ responses = []
+ for i in range(len(requests)):
+ output = outputs[i]
output = protocol.create_triton_output(output)
output = pb_utils.Tensor(constants.OUTPUT_NAME, output)
response = pb_utils.InferenceResponse(output_tensors=[output])
- except Exception as e:
- logging.error("Unhandled exception", exc_info=e)
- response = pb_utils.InferenceResponse(
+ responses.append(response)
+ except Exception as e:
+ logging.error("Unhandled exception", exc_info=e)
+ responses = [
+ pb_utils.InferenceResponse(
output_tensors=[],
error=pb_utils.TritonError("An error occurred during execution"),
)
- finally:
- end_time = time.perf_counter()
- logging.info("Time taken: %.3f ms", (end_time - start_time) * 1000)
- logging.reset_context_vars(*tokens)
-
- responses.append(response)
+ for _ in requests
+ ]
+ finally:
+ end_time = time.perf_counter()
+ logging.info("Time taken: %.3f ms", (end_time - start_time) * 1000)
+ logging.reset_context_vars(*tokens)
return responses
@@ -126,6 +164,17 @@ def get_result_model_type(self):
def run(self, input, log_id):
raise NotImplementedError
+ def run_batch(self, inputs, log_ids, batch_id):
+ if len(inputs) != len(log_ids):
+ raise ValueError(
+ "The number of `inputs` does not match the number of `log_ids`"
+ )
+ outputs = []
+ for inp, log_id in zip(inputs, log_ids):
+ out = self.run(inp, log_id)
+ outputs.append(out)
+ return outputs
+
def _create_pipeline(self, config, use_hpip):
if self._device_id is not None:
device = constr_device(self._device_type, [self._device_id])
@@ -141,3 +190,6 @@ def _create_pipeline(self, config, use_hpip):
def _generate_model_id(self):
return uuid.uuid4().hex
+
+ def _generate_batch_id(self):
+ return uuid.uuid4().hex
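
Note: the `run_batch` hook introduced above defaults to looping over `run`, so existing single-request models keep their current behavior. A model that benefits from true batched inference can override it. The sketch below is illustrative only and not part of this patch; `_decode_input` and `_build_result` are hypothetical helpers.

```python
# Sketch: overriding the new `run_batch` hook so the pipeline sees the whole
# batch in a single predict() call. The helper methods are assumptions.
from paddlex_hps_server import BaseTritonPythonModel


class BatchedModel(BaseTritonPythonModel):
    def run_batch(self, inputs, log_ids, batch_id):
        # Decode every validated request up front.
        decoded = [self._decode_input(inp) for inp in inputs]
        # One batched pipeline call instead of one call per request.
        results = list(self.pipeline.predict(decoded))
        # Return one result per input, preserving order.
        return [
            self._build_result(res, log_id)
            for res, log_id in zip(results, log_ids)
        ]
```
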
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/config.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/config.py
index b4f59b4fa8..97a2731bb2 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/config.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/config.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex.inference.serving.infra.config import (
SERVING_CONFIG_KEY,
AppConfig,
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/constants.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/constants.py
index 8d3a5d9359..2c1c6322cc 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/constants.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/constants.py
@@ -1,2 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
INPUT_NAME = "input"
OUTPUT_NAME = "output"
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py
index 8d5e378961..aec8b1a821 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/env.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import os
PIPELINE_CONFIG_PATH = os.getenv("PADDLEX_HPS_PIPELINE_CONFIG_PATH", "")
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/lazy_mods.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/lazy_mods.py
index b06ecc6951..3c017e8645 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/lazy_mods.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/lazy_mods.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import importlib
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/logging.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/logging.py
index 9af024bd16..028369dd86 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/logging.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/logging.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
import logging
import sys
from contextvars import ContextVar
@@ -15,7 +29,7 @@
}
model_id_var = ContextVar("model_id", default="*")
-log_id_var = ContextVar("log_id", default="*")
+batch_id_var = ContextVar("batch_id", default="*")
_logger = logging.getLogger("paddlex-hps-server")
@@ -23,7 +37,7 @@ def _log_with_context(func):
def _wrapper(msg, *args, **kwargs):
extra = kwargs.get("extra", {})
extra["model_id"] = model_id_var.get()
- extra["log_id"] = log_id_var.get()
+ extra["batch_id"] = batch_id_var.get()
kwargs["extra"] = extra
return func(msg, *args, **kwargs)
@@ -34,7 +48,7 @@ def set_up_logger():
if env.LOGGING_LEVEL:
_logger.setLevel(env.LOGGING_LEVEL)
format = colorlog.ColoredFormatter(
- "%(log_color)s[%(levelname)8s] [%(asctime)-15s] [%(model_id)s] [%(log_id)s] - %(message)s",
+ "%(log_color)s[%(levelname)8s] [%(asctime)-15s] [%(model_id)s] [%(batch_id)s] - %(message)s",
log_colors={key: conf["color"] for key, conf in _LOGGING_CONFIG.items()},
)
handler = logging.StreamHandler(sys.stderr)
@@ -43,13 +57,13 @@ def set_up_logger():
_logger.propagate = False
-def set_context_vars(model_id, log_id):
- return model_id_var.set(model_id), log_id_var.set(log_id)
+def set_context_vars(model_id, batch_id):
+ return model_id_var.set(model_id), batch_id_var.set(batch_id)
-def reset_context_vars(model_id_token, log_id_token):
+def reset_context_vars(model_id_token, batch_id_token):
model_id_var.reset(model_id_token)
- log_id_var.reset(log_id_token)
+ batch_id_var.reset(batch_id_token)
debug = _log_with_context(_logger.debug)
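
With the context variable renamed from `log_id` to `batch_id`, log records emitted during `execute` are now tagged with the batch identifier rather than a per-request one. A minimal usage sketch, mirroring the pattern in `BaseTritonPythonModel.execute` above (the model and batch IDs are example values):

```python
# Sketch: bracketing a unit of work with the batch-scoped context variables.
from paddlex_hps_server import logging

logging.set_up_logger()
tokens = logging.set_context_vars("object-detection", "0a1b2c3d")  # model_id, batch_id
try:
    logging.info("Received batch of size %s", 4)
    # -> [    INFO] [<timestamp>] [object-detection] [0a1b2c3d] - Received batch of size 4
finally:
    logging.reset_context_vars(*tokens)
```
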
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/protocol.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/protocol.py
index 8f0e491667..b90230c705 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/protocol.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/protocol.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from typing import TYPE_CHECKING, Optional
import numpy as np
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/schemas.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/schemas.py
index 0e86672377..5ff53cdfcd 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/schemas.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/schemas.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from importlib import import_module
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/storage.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/storage.py
index 2cf38911dd..6314634c00 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/storage.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/storage.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex.inference.serving.infra.storage import (
BOS,
BOSConfig,
diff --git a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/utils.py b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/utils.py
index ab590029ee..771f46a5bb 100644
--- a/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/utils.py
+++ b/deploy/hps/server_env/paddlex-hps-server/src/paddlex_hps_server/utils.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
from paddlex.inference.serving.infra.utils import (
base64_encode,
csv_bytes_to_data_frame,
diff --git a/deploy/hps/server_env/requirements/cpu.txt b/deploy/hps/server_env/requirements/cpu.txt
index c5b4c69d48..bbbeaab8e4 100644
--- a/deploy/hps/server_env/requirements/cpu.txt
+++ b/deploy/hps/server_env/requirements/cpu.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --extra=base --extra=serving --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu.txt --strip-extras ../../../setup.py paddlex-hps-server/pyproject.toml requirements/app.in requirements/cpu.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --extra=base --extra=serving --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu.txt --pip-args=None --strip-extras ../../../setup.py paddlex-hps-server/pyproject.toml requirements/app.in requirements/cpu.in
#
aiohappyeyeballs==2.4.6
# via aiohttp
@@ -13,6 +13,8 @@ aiohttp==3.11.12
# paddlex (../../../setup.py)
aiosignal==1.3.2
# via aiohttp
+aistudio-sdk==0.3.5
+ # via paddlex (../../../setup.py)
annotated-types==0.7.0
# via pydantic
anyio==4.8.0
@@ -27,7 +29,9 @@ async-timeout==4.0.3
attrs==25.1.0
# via aiohttp
bce-python-sdk==0.9.29
- # via paddlex (../../../setup.py)
+ # via
+ # aistudio-sdk
+ # paddlex (../../../setup.py)
beautifulsoup4==4.13.4
# via paddlex (../../../setup.py)
cachetools==5.5.2
@@ -46,7 +50,9 @@ charset-normalizer==3.4.1
chinese-calendar==1.10.0
# via paddlex (../../../setup.py)
click==8.1.8
- # via uvicorn
+ # via
+ # aistudio-sdk
+ # uvicorn
colorlog==6.9.0
# via
# paddlex (../../../setup.py)
@@ -78,6 +84,7 @@ fastapi==0.115.8
filelock==3.17.0
# via
# huggingface-hub
+ # modelscope
# paddlex (../../../setup.py)
filetype==1.2.0
# via paddlex (../../../setup.py)
@@ -172,6 +179,8 @@ matplotlib==3.5.2
# via
# paddlex (../../../setup.py)
# pycocotools
+modelscope==1.29.0
+ # via paddlex (../../../setup.py)
more-itertools==10.6.0
# via cssutils
multidict==6.1.0
@@ -243,7 +252,9 @@ pillow==9.5.0
premailer==3.10.0
# via paddlex (../../../setup.py)
prettytable==3.14.0
- # via paddlex (../../../setup.py)
+ # via
+ # aistudio-sdk
+ # paddlex (../../../setup.py)
propcache==0.3.0
# via
# aiohttp
@@ -278,6 +289,8 @@ pyparsing==3.2.1
# via matplotlib
pypdfium2==4.30.1
# via paddlex (../../../setup.py)
+python-bidi==0.6.6
+ # via paddlex (../../../setup.py)
python-dateutil==2.9.0.post0
# via
# matplotlib
@@ -297,10 +310,12 @@ regex==2024.11.6
# tiktoken
requests==2.32.3
# via
+ # aistudio-sdk
# huggingface-hub
# langchain
# langchain-community
# langsmith
+ # modelscope
# paddlex (../../../setup.py)
# premailer
# requests-toolbelt
@@ -311,6 +326,8 @@ ruamel-yaml==0.18.10
# via paddlex (../../../setup.py)
ruamel-yaml-clib==0.2.12
# via ruamel-yaml
+safetensors==0.6.2
+ # via paddlex (../../../setup.py)
scikit-image==0.24.0
# via paddlex (../../../setup.py)
scikit-learn==1.6.1
@@ -319,6 +336,8 @@ scipy==1.15.2
# via
# scikit-image
# scikit-learn
+sentencepiece==0.2.1
+ # via paddlex (../../../setup.py)
shapely==2.0.7
# via paddlex (../../../setup.py)
six==1.17.0
@@ -358,7 +377,9 @@ tokenizers==0.19.1
# via paddlex (../../../setup.py)
tqdm==4.67.1
# via
+ # aistudio-sdk
# huggingface-hub
+ # modelscope
# openai
# paddlex (../../../setup.py)
typing-extensions==4.12.2
@@ -383,7 +404,9 @@ typing-inspect==0.9.0
ujson==5.10.0
# via paddlex (../../../setup.py)
urllib3==2.3.0
- # via requests
+ # via
+ # modelscope
+ # requests
uvicorn==0.34.0
# via paddlex (../../../setup.py)
wcwidth==0.2.13
@@ -394,3 +417,7 @@ yarl==1.18.3
# via
# aiohttp
# paddlex (../../../setup.py)
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==80.9.0
+ # via modelscope
diff --git a/deploy/hps/server_env/requirements/cpu_dev.txt b/deploy/hps/server_env/requirements/cpu_dev.txt
index ad1a87d716..7c6e251f59 100644
--- a/deploy/hps/server_env/requirements/cpu_dev.txt
+++ b/deploy/hps/server_env/requirements/cpu_dev.txt
@@ -2,5 +2,5 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --constraint=requirements/cpu.txt --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu_dev.txt --strip-extras requirements/cpu_dev.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --constraint=requirements/cpu.txt --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu_dev.txt --pip-args=None --strip-extras requirements/cpu_dev.in
#
diff --git a/deploy/hps/server_env/requirements/cpu_hpi.txt b/deploy/hps/server_env/requirements/cpu_hpi.txt
index 839b28b45f..61dd0c4bc3 100644
--- a/deploy/hps/server_env/requirements/cpu_hpi.txt
+++ b/deploy/hps/server_env/requirements/cpu_hpi.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --constraint=requirements/cpu.txt --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu_hpi.txt --strip-extras requirements/cpu_hpi.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --constraint=requirements/cpu.txt --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/cpu_hpi.txt --pip-args=None --strip-extras requirements/cpu_hpi.in
#
certifi==2025.1.31
# via
diff --git a/deploy/hps/server_env/requirements/gpu.txt b/deploy/hps/server_env/requirements/gpu.txt
index a706e121d5..3484d72f48 100644
--- a/deploy/hps/server_env/requirements/gpu.txt
+++ b/deploy/hps/server_env/requirements/gpu.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --extra=base --extra=serving --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu.txt --strip-extras ../../../setup.py paddlex-hps-server/pyproject.toml requirements/app.in requirements/gpu.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --extra=base --extra=serving --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu.txt --pip-args=None --strip-extras ../../../setup.py paddlex-hps-server/pyproject.toml requirements/app.in requirements/gpu.in
#
aiohappyeyeballs==2.4.6
# via aiohttp
@@ -13,6 +13,8 @@ aiohttp==3.11.12
# paddlex (../../../setup.py)
aiosignal==1.3.2
# via aiohttp
+aistudio-sdk==0.3.5
+ # via paddlex (../../../setup.py)
annotated-types==0.7.0
# via pydantic
anyio==4.8.0
@@ -27,7 +29,9 @@ async-timeout==4.0.3
attrs==25.1.0
# via aiohttp
bce-python-sdk==0.9.42
- # via paddlex (../../../setup.py)
+ # via
+ # aistudio-sdk
+ # paddlex (../../../setup.py)
beautifulsoup4==4.13.4
# via paddlex (../../../setup.py)
cachetools==5.5.2
@@ -46,7 +50,9 @@ charset-normalizer==3.4.1
chinese-calendar==1.10.0
# via paddlex (../../../setup.py)
click==8.2.1
- # via uvicorn
+ # via
+ # aistudio-sdk
+ # uvicorn
colorlog==6.9.0
# via
# paddlex (../../../setup.py)
@@ -78,6 +84,7 @@ fastapi==0.116.1
filelock==3.17.0
# via
# huggingface-hub
+ # modelscope
# paddlex (../../../setup.py)
filetype==1.2.0
# via paddlex (../../../setup.py)
@@ -172,6 +179,8 @@ matplotlib==3.5.2
# via
# paddlex (../../../setup.py)
# pycocotools
+modelscope==1.29.0
+ # via paddlex (../../../setup.py)
more-itertools==10.6.0
# via cssutils
multidict==6.1.0
@@ -243,7 +252,9 @@ pillow==9.5.0
premailer==3.10.0
# via paddlex (../../../setup.py)
prettytable==3.14.0
- # via paddlex (../../../setup.py)
+ # via
+ # aistudio-sdk
+ # paddlex (../../../setup.py)
propcache==0.3.0
# via
# aiohttp
@@ -278,6 +289,8 @@ pyparsing==3.2.1
# via matplotlib
pypdfium2==4.30.1
# via paddlex (../../../setup.py)
+python-bidi==0.6.6
+ # via paddlex (../../../setup.py)
python-dateutil==2.9.0.post0
# via
# matplotlib
@@ -297,10 +310,12 @@ regex==2024.11.6
# tiktoken
requests==2.32.3
# via
+ # aistudio-sdk
# huggingface-hub
# langchain
# langchain-community
# langsmith
+ # modelscope
# paddlex (../../../setup.py)
# premailer
# requests-toolbelt
@@ -311,6 +326,8 @@ ruamel-yaml==0.18.10
# via paddlex (../../../setup.py)
ruamel-yaml-clib==0.2.12
# via ruamel-yaml
+safetensors==0.6.2
+ # via paddlex (../../../setup.py)
scikit-image==0.24.0
# via paddlex (../../../setup.py)
scikit-learn==1.6.1
@@ -319,6 +336,8 @@ scipy==1.15.2
# via
# scikit-image
# scikit-learn
+sentencepiece==0.2.1
+ # via paddlex (../../../setup.py)
shapely==2.0.7
# via paddlex (../../../setup.py)
six==1.17.0
@@ -358,7 +377,9 @@ tokenizers==0.19.1
# via paddlex (../../../setup.py)
tqdm==4.67.1
# via
+ # aistudio-sdk
# huggingface-hub
+ # modelscope
# openai
# paddlex (../../../setup.py)
typing-extensions==4.12.2
@@ -384,7 +405,9 @@ typing-inspect==0.9.0
ujson==5.10.0
# via paddlex (../../../setup.py)
urllib3==2.3.0
- # via requests
+ # via
+ # modelscope
+ # requests
uvicorn==0.35.0
# via paddlex (../../../setup.py)
wcwidth==0.2.13
@@ -395,3 +418,7 @@ yarl==1.18.3
# via
# aiohttp
# paddlex (../../../setup.py)
+
+# The following packages are considered to be unsafe in a requirements file:
+setuptools==80.9.0
+ # via modelscope
diff --git a/deploy/hps/server_env/requirements/gpu_dev.txt b/deploy/hps/server_env/requirements/gpu_dev.txt
index 0e7e5715f8..8794c5c46c 100644
--- a/deploy/hps/server_env/requirements/gpu_dev.txt
+++ b/deploy/hps/server_env/requirements/gpu_dev.txt
@@ -2,5 +2,5 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --constraint=requirements/gpu.txt --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu_dev.txt --strip-extras requirements/gpu_dev.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --constraint=requirements/gpu.txt --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu_dev.txt --pip-args=None --strip-extras requirements/gpu_dev.in
#
diff --git a/deploy/hps/server_env/requirements/gpu_hpi.txt b/deploy/hps/server_env/requirements/gpu_hpi.txt
index 00bd86d6a1..de1a847d4e 100644
--- a/deploy/hps/server_env/requirements/gpu_hpi.txt
+++ b/deploy/hps/server_env/requirements/gpu_hpi.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
-# pip-compile --allow-unsafe --constraint=requirements/gpu.txt --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu_hpi.txt --strip-extras requirements/gpu_hpi.in
+# pip-compile --allow-unsafe --cert=None --client-cert=None --constraint=requirements/gpu.txt --index-url=None --no-emit-index-url --no-emit-trusted-host --output-file=requirements/gpu_hpi.txt --pip-args=None --strip-extras requirements/gpu_hpi.in
#
certifi==2025.1.31
# via
diff --git a/deploy/hps/server_env/scripts/remove_images.sh b/deploy/hps/server_env/scripts/remove_images.sh
index fafc29a8ec..2926504e3d 100755
--- a/deploy/hps/server_env/scripts/remove_images.sh
+++ b/deploy/hps/server_env/scripts/remove_images.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-paddlex_version="$(cat ../../paddlex/.version)"
+paddlex_version="$(cat ../../../paddlex/.version)"
for device_type in 'gpu' 'cpu'; do
version="$(cat "${device_type}_version.txt")"
diff --git a/deploy/hps/server_env/scripts/tag_and_push_images.sh b/deploy/hps/server_env/scripts/tag_and_push_images.sh
index 50bdb95bbc..e7b114565d 100755
--- a/deploy/hps/server_env/scripts/tag_and_push_images.sh
+++ b/deploy/hps/server_env/scripts/tag_and_push_images.sh
@@ -1,6 +1,6 @@
#!/usr/bin/env bash
-paddlex_version="$(cat ../../paddlex/.version)"
+paddlex_version="$(cat ../../../paddlex/.version)"
for device_type in 'gpu' 'cpu'; do
version="$(cat "${device_type}_version.txt")"
diff --git a/docs/CHANGELOG.en.md b/docs/CHANGELOG.en.md
index cd57244f11..d50b4049b6 100644
--- a/docs/CHANGELOG.en.md
+++ b/docs/CHANGELOG.en.md
@@ -6,6 +6,10 @@ comments: true
## Latest Version Information
+### PaddleX v3.3.0 (October 16, 2025)
+
+- **Added support for inference and deployment of PaddleOCR-VL and PP-OCRv5 multilingual models.**
+
### PaddleX v3.2.0(8.20/2025)
- **Deployment Capability Upgrades:**
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index ae4698e126..8fac9c0fb5 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -6,6 +6,10 @@ comments: true
## 最新版本信息
+### PaddleX v3.3.0(10.16/2025)
+
+- **支持PaddleOCR-VL、PP-OCRv5多语种模型的推理部署能力。**
+
### PaddleX v3.2.0(8.20/2025)
- **部署能力升级:**
diff --git a/docs/installation/installation.en.md b/docs/installation/installation.en.md
index b861147bb0..a4fa26e9ed 100644
--- a/docs/installation/installation.en.md
+++ b/docs/installation/installation.en.md
@@ -4,6 +4,9 @@ comments: true
# PaddleX Local Installation Tutorial
> ❗Before installing PaddleX, please ensure you have a basic Python environment (Note: Currently supports Python 3.8 to Python 3.12, with more Python versions being adapted).
+
+> ❗In most cases, you need to first install the PaddlePaddle framework by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md) before proceeding with PaddleX's installation steps. [4 PaddleX's Dependency on PaddlePaddle Framework](#4-paddlexs-dependency-on-paddlepaddle-framework) lists scenarios where installing the PaddlePaddle framework is not required.
+
## 1. Quick Installation
Welcome to PaddleX, Baidu's low-code development tool for AI. Before we dive into the local installation process, please clarify your development needs and choose the appropriate installation mode.
@@ -12,16 +15,14 @@ PaddleX offers two installation modes: Wheel Package Installation and
### 1.1 Wheel Package Installation Mode
If your use case for PaddleX involves model inference and integration, we recommend the more convenient and lightweight Wheel package installation mode.
-After installing PaddlePaddle (refer to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md)), you can quickly install the PaddleX Wheel package by executing the following commands:
-
-> ❗ Note: Please ensure that PaddlePaddle is successfully installed before proceeding to the next step.
+You can quickly install the PaddleX Wheel package by executing the following commands:
```bash
# Only install the required dependencies (optional dependencies can be installed later as needed)
pip install paddlex
```
-You can install the optional dependencies as needed using the following method (For more details, please refer to [2.3 Selective Installation of Dependencies](#23-selective-installation-of-dependencies)):
+You can install the optional dependencies as needed using the following method (For more details, please refer to [3 Selective Installation of Dependencies](#3-selective-installation-of-dependencies)):
Install all dependencies required for PaddleX "basic features":
@@ -121,7 +122,7 @@ The model training related plugins supported by PaddleX are listed below. Please
-If the plugin you need to install is `PaddleXXX`, after installing PaddlePaddle (refer to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md)), you can quickly install the corresponding PaddleX plugin by executing the following commands:
+If the plugin you need to install is `PaddleXXX`, you can quickly install the corresponding PaddleX plugin by executing the following commands:
```bash
git clone https://github.com/PaddlePaddle/PaddleX.git
@@ -139,7 +140,7 @@ When installing PaddleX on Linux, we strongly recommend using the official Pa
When using the official Docker image, PaddlePaddle, PaddleX (including the wheel package and all plugins), and the corresponding CUDA environment are already pre-installed. You can simply obtain the Docker image and start the container to begin using it. Please note that the official Docker image of PaddleX is different from the official Docker image of the PaddlePaddle framework, as the latter does not come with PaddleX pre-installed.
-When using custom installation methods, you need to first install the PaddlePaddle framework, then obtain the PaddleX source code, and finally choose the PaddleX installation mode.
+When using custom installation methods, you need to first install the PaddlePaddle framework (except for [a few cases](#4-paddlexs-dependency-on-paddlepaddle-framework)), then obtain the PaddleX source code, and finally choose the PaddleX installation mode.
### 2.1 Get PaddleX based on Docker
Using the PaddleX official Docker image, create a container called 'paddlex' and map the current working directory to the '/paddle' directory inside the container by following the command.
@@ -147,13 +148,13 @@ If your Docker version >= 19.03, please use:
```bash
# For CPU
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-cpu /bin/bash
# gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows)
-docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
# gpu,requires GPU driver version ≥545.23.06(Linux) or ≥545.84(Windows)
-docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
+docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
```
* If your Docker version <= 19.03 and >= 17.06, please use:
@@ -161,14 +162,14 @@ docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=hos
Click Here
# For CPU
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-cpu /bin/bash
# For GPU
# gpu,requires GPU driver version ≥450.80.02 (Linux) or ≥452.39 (Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
# gpu,requires GPU driver version ≥545.23.06(Linux) or ≥545.84(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
@@ -177,7 +178,6 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -i
* If you want to delve deeper into the principles or usage of Docker, please refer to the [Docker Official Website](https://www.docker.com/) or the [Docker Official Tutorial](https://docs.docker.com/get-started/).
### 2.2 Custom Installation of PaddleX
-Before installation, please ensure you have completed the local installation of PaddlePaddle by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md).
#### 2.2.1 Obtain PaddleX Source Code
Next, use the following command to obtain the latest PaddleX source code from GitHub:
@@ -248,7 +248,7 @@ All packages are installed.
For PaddleX installation on more hardware environments, please refer to the [PaddleX Multi-hardware Usage Guide](../other_devices_support/multi_devices_use_guide.en.md)
-### 2.3 Selective Installation of Dependencies
+## 3 Selective Installation of Dependencies
PaddleX offers a wide range of features, and different features require different dependencies. The features in PaddleX that can be used without installing plugins are categorized as "basic features." The official PaddleX Docker images have all dependencies required for these basic features preinstalled. Similarly, using the installation method introduced earlier—`pip install "...[base]"`—will install all dependencies needed for the basic features.
@@ -279,8 +279,17 @@ PaddleX currently provides the following dependency groups:
| `ts` | Basic features of time series pipelines. |
| `video` | Basic features of video pipelines. |
| `trans` | Basic features of translation pipelines. |
+| `genai-client` | The generative AI client feature. Installing this group is equivalent to installing the PaddleX generative AI client plugin; the plugin can also be installed via the PaddleX CLI. |
+| `genai-sglang-server` | The SGLang server feature. Installing this group is equivalent to installing the PaddleX SGLang server plugin; the plugin can also be installed via the PaddleX CLI. |
+| `genai-vllm-server` | The vLLM server feature. Installing this group is equivalent to installing the PaddleX vLLM server plugin; the plugin can also be installed via the PaddleX CLI. |
| `serving` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. |
-| `plugins` | All plugin-provided features that support installation via dependency groups. |
-| `all` | All basic features of PaddleX, as well as all plugin-provided features installable via dependency groups. |
+| `paddle2onnx` | The Paddle2ONNX feature. Installing this group is equivalent to installing the PaddleX Paddle2ONNX plugin; the plugin can also be installed via the PaddleX CLI. |
Each pipeline belongs to exactly one dependency group. You can refer to the tutorial of each pipeline to find out which dependency group it belongs to. For modules, you can access the related basic features by installing any dependency group that includes the module.
+
+## 4 PaddleX's Dependency on PaddlePaddle Framework
+
+The vast majority of PaddleX's functionalities rely on the PaddlePaddle framework. Therefore, in most cases, you need to install the PaddlePaddle framework before using PaddleX by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md). However, for the following scenarios, you can use the corresponding features without installing the PaddlePaddle framework:
+
+- Using the capabilities provided by PaddleX's `genai-vllm-server` or `genai-sglang-server` plugins to deploy model inference services.
+- Using PaddleX's `genai-client` plugin to call generative AI inference services.
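For quick reference, the dependency-group names in the table above plug straight into the quoted-extras install pattern the tutorial already uses; a minimal sketch, with the group names taken from the table and everything else standard pip:

```bash
# Serving backend for generative AI models (pick one backend group)
pip install "paddlex[genai-vllm-server]"

# Client-side group only, for calling an already-deployed generative AI service
pip install "paddlex[genai-client]"

# Paddle2ONNX conversion support
pip install "paddlex[paddle2onnx]"
```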
diff --git a/docs/installation/installation.md b/docs/installation/installation.md
index 73e8db0449..2c3a2dac4d 100644
--- a/docs/installation/installation.md
+++ b/docs/installation/installation.md
@@ -5,6 +5,8 @@ comments: true
# PaddleX本地安装教程
> ❗安装 PaddleX 前请先确保您有基础的 Python 运行环境(注:当前支持Python 3.8 ~ Python 3.12下运行)。
+> ❗在大多数情况下,您需要先参考 [飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md) 安装飞桨框架,再执行 PaddleX 的安装步骤。[4 PaddleX 对飞桨框架的依赖](#4-paddlex-对飞桨框架的依赖) 中列举了不需要安装飞桨框架的情形。
+
## 1. 快速安装
欢迎您使用飞桨低代码开发工具PaddleX,在我们正式开始本地安装之前,请首先明确您的开发需求,并根据您的需求选择合适的安装模式。
PaddleX为您提供了两种安装模式:Wheel包安装和插件安装,下面分别对其应用场景进行介绍:
@@ -14,16 +16,14 @@ PaddleX为您提供了两种安装模式:Wheel包安装和插件安
快速安装轻量级的Wheel包之后,您即可基于PaddleX支持的所有模型进行推理,并能直接集成进您的项目中。
-参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)安装飞桨后,您可直接执行如下指令快速安装PaddleX的Wheel包:
-
-> ❗ 注:请务必保证 PaddlePaddle 安装成功,安装成功后,方可进行下一步。
+您可直接执行如下指令快速安装PaddleX的Wheel包:
```bash
# 仅安装必须依赖(可以在之后按需安装可选依赖)
pip install paddlex
```
-通过如下方式可以安装所需的可选依赖(更多说明请参考 [2.3 选择性安装依赖](#23-选择性安装依赖)):
+通过如下方式可以安装所需的可选依赖(更多说明请参考 [3 选择性安装依赖](#3-选择性安装依赖)):
安装 PaddleX “基础功能”需要的全部依赖:
@@ -124,7 +124,7 @@ PaddleX支持的模型训练相关插件如下,请您根据开发需求,确
-若您需要安装的插件为`PaddleXXX`,在参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)安装飞桨后,您可以直接执行如下指令快速安装PaddleX的对应插件:
+若您需要安装的插件为`PaddleXXX`,可以直接执行如下指令快速安装PaddleX的对应插件:
```bash
git clone https://github.com/PaddlePaddle/PaddleX.git
@@ -144,7 +144,7 @@ paddlex --install PaddleXXX # 例如PaddleOCR
当您使用官方 Docker 镜像安装时,其中已经内置了 PaddlePaddle、PaddleX(包括wheel包和所有插件),并配置好了相应的CUDA环境,您获取 Docker 镜像并启动容器即可开始使用。请注意,PaddleX 官方 Docker 镜像与飞桨框架官方 Docker 镜像不同,后者并没有预装 PaddleX。
-当您使用自定义方式安装时,需要先安装飞桨 PaddlePaddle 框架,随后获取 PaddleX 源码,最后选择PaddleX的安装模式。
+当您使用自定义方式安装时,需要先安装飞桨 PaddlePaddle 框架(除 [少数情形](#4-paddlex-对飞桨框架的依赖) 外),随后获取 PaddleX 源码,最后选择PaddleX的安装模式。
> ❗ 无需关注物理机上的 CUDA 版本,只需关注显卡驱动程序版本。
@@ -155,14 +155,14 @@ paddlex --install PaddleXXX # 例如PaddleOCR
```bash
# 对于 CPU 用户
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-cpu /bin/bash
# 对于 GPU 用户
# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows)
-docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
# GPU 版本,需显卡驱动程序版本 ≥545.23.06(Linux)或 ≥545.84(Windows)
-docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
+docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
```
@@ -171,14 +171,14 @@ docker run --gpus all --name paddlex -v $PWD:/paddle --shm-size=8g --network=hos
点击展开
# 对于 CPU 用户
-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-cpu /bin/bash
+docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-cpu /bin/bash
# 对于 GPU 用户
# GPU 版本,需显卡驱动程序版本 ≥450.80.02(Linux)或 ≥452.39(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda11.8-cudnn8.9-trt8.6 /bin/bash
# GPU 版本,需显卡驱动程序版本 ≥545.23.06(Linux)或 ≥545.84(Windows)
-nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.1.2-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
+nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -it ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/paddlex:paddlex3.2.0-paddlepaddle3.0.0-gpu-cuda12.6-cudnn9.5-trt10.5 /bin/bash
@@ -187,7 +187,6 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -i
* 若您想更深入了解 Docker 的原理或使用方式,请参考 [Docker官方网站](https://www.docker.com/) 或 [Docker官方教程](https://docs.docker.com/get-started/)。
### 2.2 自定义方式安装PaddleX
-在安装之前,请确保您已经参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)完成飞桨的本地安装。
#### 2.2.1 获取 PaddleX 源码
接下来,请使用以下命令从 GitHub 获取 PaddleX 最新源码:
@@ -251,7 +250,7 @@ All packages are installed.
```
更多硬件环境的PaddleX安装请参考[PaddleX多硬件使用指南](../other_devices_support/multi_devices_use_guide.md)
-### 2.3 选择性安装依赖
+## 3 选择性安装依赖
PaddleX 的功能丰富,而不同的功能需要的依赖也不尽相同。将 PaddleX 中不需要安装插件即可使用的功能归类为“基础功能”。PaddleX 官方 Docker 镜像预置了基础功能所需的全部依赖;使用上文介绍的 `pip install "...[base]"` 的安装方式也将安装基础功能需要的所有依赖。如果您只专注于 PaddleX 的某一项功能,且希望保持安装的依赖的体积尽可能小,可以通过指定“依赖组”的方式,选择性地安装依赖:
@@ -280,9 +279,17 @@ PaddleX 目前提供如下依赖组:
| `ts` | 时序产线的基础功能。 |
| `video` | 视频产线的基础功能。 |
| `trans` | 翻译产线的基础功能。 |
+| `genai-client` | 生成式 AI 客户端功能。安装此依赖组等效于安装 PaddleX 生成式 AI 客户端插件;也可以通过 PaddleX CLI 安装生成式 AI 客户端插件。 |
+| `genai-sglang-server` | SGLang 服务器功能。安装此依赖组等效于安装 PaddleX SGLang 服务器插件;也可以通过 PaddleX CLI 安装 SGLang 服务器插件。 |
+| `genai-vllm-server` | vLLM 服务器功能。安装此依赖组等效于安装 PaddleX vLLM 服务器插件;也可以通过 PaddleX CLI 安装 vLLM 服务器插件。 |
| `serving` | 服务化部署功能。安装此依赖组等效于安装 PaddleX 服务化部署插件;也可以通过 PaddleX CLI 安装服务化部署插件。 |
-| `plugins` | 所有支持通过指定依赖组安装的插件提供的功能。 |
-| `all` | PaddleX 的所有基础功能,以及所有支持通过指定依赖组安装的插件提供的功能。 |
-
+| `paddle2onnx` | Paddle2ONNX 功能。安装此依赖组等效于安装 PaddleX Paddle2ONNX 插件;也可以通过 PaddleX CLI 安装 Paddle2ONNX 插件。 |
每一条产线属于且仅属于一个依赖组;在各产线的使用文档中可以了解产线属于哪一依赖组。对于单功能模块,安装任意包含该模块的产线对应的依赖组后即可使用相关的基础功能。
+
+## 4 PaddleX 对飞桨框架的依赖
+
+PaddleX 的绝大部分功能依赖飞桨框架,因此,在大多数情况下,您需要在使用 PaddleX 前参考 [飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md) 安装飞桨框架。不过,对于以下几种情形,不必安装飞桨框架也可以使用相应的功能:
+
+- 使用 PaddleX `genai-vllm-server` 或 `genai-sglang-server` 插件提供的能力部署模型推理服务。
+- 使用 PaddleX `genai-client` 插件调用生成式 AI 推理服务。
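The table above also notes that each of these plugins can be installed through the PaddleX CLI. The `paddlex --install` form below follows the pattern the tutorial uses for training plugins and the serving plugin; the names of the new generative AI plugins are an assumption mirroring the dependency-group names, so treat those two lines as illustrative only:

```bash
# Documented CLI pattern for the serving and Paddle2ONNX plugins
paddlex --install serving
paddlex --install paddle2onnx

# Assumed plugin names for the new generative AI groups (unverified)
paddlex --install genai-client
paddlex --install genai-vllm-server
```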
diff --git a/docs/installation/paddlepaddle_install.en.md b/docs/installation/paddlepaddle_install.en.md
index 7eee4adfc1..0fa4d81869 100644
--- a/docs/installation/paddlepaddle_install.en.md
+++ b/docs/installation/paddlepaddle_install.en.md
@@ -43,13 +43,8 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8G --network=host -
* Note: For more official PaddlePaddle Docker images, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/en/install/docker/linux-docker.html)
-To use [Paddle Inference TensorRT Subgraph Engine](https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#gpu), install TensorRT by executing the following instructions in the 'paddlex' container that has just been started
-
-```bash
-python -m pip install /usr/local/TensorRT-*/python/tensorrt-*-cp310-none-linux_x86_64.whl
-```
-
## Installing PaddlePaddle via pip
+
If you choose to install via pip, please refer to the following commands to install PaddlePaddle in your current environment using pip:
```bash
@@ -100,21 +95,29 @@ python -m pip install https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-Ta
## Install the TensorRT Subgraph Engine
-If you want to use the [Paddle Inference TensorRT Subgraph Engine](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/paddle_v3_features/paddle_trt_en.html), after installing Paddle, you need to refer to the [TensorRT Documentation](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html) to install the corresponding version of TensorRT:
+If you want to use the [Paddle Inference TensorRT Subgraph Engine](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/paddle_v3_features/paddle_trt_en.html):
-- For PaddlePaddle with CUDA 11.8, the compatible TensorRT version is 8.x (where x >= 6). PaddleX has completed compatibility tests of Paddle-TensorRT on TensorRT 8.6.1.6, so it is **strongly recommended to install TensorRT 8.6.1.6**.
+1. If you are using the official PaddlePaddle 3.0 Docker image, you need to run the following command in the started container to install TensorRT.
-Below is an example of installing TensorRT 8.6.1.6 using the "Tar File Installation" method in a CUDA 11.8 environment:
+ ```bash
+ python -m pip install /usr/local/TensorRT-*/python/tensorrt-*-cp310-none-linux_x86_64.whl
+ ```
-```bash
-# Download TensorRT tar file
-wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
-# Extract TensorRT tar file
-tar xvf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
-# Install TensorRT wheel package
-python -m pip install TensorRT-8.6.1.6/python/tensorrt-8.6.1-cp310-none-linux_x86_64.whl
-# Add the absolute path of TensorRT's `lib` directory to LD_LIBRARY_PATH
-export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:TensorRT-8.6.1.6/lib"
-```
+2. If you are using an official Docker image for PaddlePaddle 3.1 or later, or if you installed PaddlePaddle with pip, you need to refer to the [TensorRT Documentation](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html) to install the corresponding version of TensorRT:
+
+ - For PaddlePaddle with CUDA 11.8, the compatible TensorRT version is 8.x (where x >= 6). PaddleX has completed compatibility tests of Paddle-TensorRT on TensorRT 8.6.1.6, so it is **strongly recommended to install TensorRT 8.6.1.6**.
+
+ Below is an example of installing TensorRT 8.6.1.6 using the "Tar File Installation" method in a CUDA 11.8 environment:
+
+ ```bash
+ # Download TensorRT tar file
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
+ # Extract TensorRT tar file
+ tar xvf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
+ # Install TensorRT wheel package
+ python -m pip install TensorRT-8.6.1.6/python/tensorrt-8.6.1-cp310-none-linux_x86_64.whl
+ # Add the absolute path of TensorRT's `lib` directory to LD_LIBRARY_PATH
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:TensorRT-8.6.1.6/lib"
+ ```
> ❗ Note: If you encounter any issues during the installation process, feel free to [submit an issue](https://github.com/PaddlePaddle/Paddle/issues) in the Paddle repository.
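Whichever route you take, a quick sanity check confirms that the TensorRT Python binding is importable and on the expected 8.6.x series; this is a minimal sketch rather than part of the tutorial, and the `grep` line only applies to tar-file installs that rely on `LD_LIBRARY_PATH`:

```bash
# Check the installed TensorRT Python binding and its version
python -m pip show tensorrt
python -c "import tensorrt; print(tensorrt.__version__)"

# For tar-file installs, confirm the lib directory is on the loader path
echo "$LD_LIBRARY_PATH" | tr ':' '\n' | grep -i tensorrt
```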
diff --git a/docs/installation/paddlepaddle_install.md b/docs/installation/paddlepaddle_install.md
index 6256a26857..53c2c2a9f2 100644
--- a/docs/installation/paddlepaddle_install.md
+++ b/docs/installation/paddlepaddle_install.md
@@ -44,13 +44,8 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8G --network=host -i
* 注:更多飞桨官方 docker 镜像请参考[飞桨官网](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html)。
-在刚刚启动的 `paddlex` 容器中执行下面指令安装 TensorRT,即可使用 [Paddle Inference TensorRT 子图引擎](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/paddle_v3_features/paddle_trt_cn.html):
-
-```bash
-python -m pip install /usr/local/TensorRT-*/python/tensorrt-*-cp310-none-linux_x86_64.whl
-```
-
## 基于 pip 安装飞桨
+
若您通过 pip 安装,请参考下述命令,用 pip 在当前环境中安装飞桨 PaddlePaddle:
```bash
@@ -103,21 +98,29 @@ python -m https://paddle-qa.bj.bcebos.com/paddle-pipeline/Develop-TagBuild-Train
## 安装 TensorRT 子图引擎
-如果想要使用 [Paddle Inference TensorRT 子图引擎](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/paddle_v3_features/paddle_trt_cn.html),在安装paddle后需参考 [TensorRT 文档](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html) 安装相应版本的 TensorRT:
+如果想要使用 [Paddle Inference TensorRT 子图引擎](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/paddle_v3_features/paddle_trt_cn.html):
-- 对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6)。PaddleX 已在 TensorRT 8.6.1.6 上完成了 Paddle-TensorRT 的兼容性测试,因此**强烈建议安装 TensorRT 8.6.1.6**。
+1. 若您使用的是 PaddlePaddle 3.0 的官方镜像,需在启动的容器中执行下面指令安装 TensorRT:
-下面是在 CUDA 11.8 环境下使用 "Tar File Installation" 方式安装 TensoRT 8.6.1.6 的例子:
+ ```bash
+ python -m pip install /usr/local/TensorRT-*/python/tensorrt-*-cp310-none-linux_x86_64.whl
+ ```
-```bash
-# 下载 TensorRT tar 文件
-wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
-# 解压 TensorRT tar 文件
-tar xvf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
-# 安装 TensorRT wheel 包
-python -m pip install TensorRT-8.6.1.6/python/tensorrt-8.6.1-cp310-none-linux_x86_64.whl
-# 添加 TensorRT 的 `lib` 目录的绝对路径到 LD_LIBRARY_PATH 中
-export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:TensorRT-8.6.1.6/lib"
-```
+2. 若您使用的是 PaddlePaddle 3.1 及以上版本的官方镜像或使用 pip 安装的 PaddlePaddle,需参考 [TensorRT 文档](https://docs.nvidia.com/deeplearning/tensorrt/archives/index.html) 安装相应版本的 TensorRT:
+
+ - 对于 CUDA 11.8 版本的飞桨,兼容的 TensorRT 版本为 8.x(x>=6)。PaddleX 已在 TensorRT 8.6.1.6 上完成了 Paddle-TensorRT 的兼容性测试,因此**强烈建议安装 TensorRT 8.6.1.6**。
+
+ 下面是在 CUDA 11.8 环境下使用 "Tar File Installation" 方式安装 TensorRT 8.6.1.6 的例子:
+
+ ```bash
+ # 下载 TensorRT tar 文件
+ wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
+ # 解压 TensorRT tar 文件
+ tar xvf TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz
+ # 安装 TensorRT wheel 包
+ python -m pip install TensorRT-8.6.1.6/python/tensorrt-8.6.1-cp310-none-linux_x86_64.whl
+ # 添加 TensorRT 的 `lib` 目录的绝对路径到 LD_LIBRARY_PATH 中
+ export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:TensorRT-8.6.1.6/lib"
+ ```
> ❗ 注:如果在安装的过程中,出现任何问题,欢迎在Paddle仓库中[提Issue](https://github.com/PaddlePaddle/Paddle/issues)。
diff --git a/docs/module_usage/instructions/distributed_training.en.md b/docs/module_usage/instructions/distributed_training.en.md
index 850d3c6418..7bd3ba41e4 100644
--- a/docs/module_usage/instructions/distributed_training.en.md
+++ b/docs/module_usage/instructions/distributed_training.en.md
@@ -23,4 +23,4 @@ python main.py -c paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yam
- The IP addresses of different machines should be separated by commas and can be checked using `ifconfig` or `ipconfig`.
- Passwordless SSH should be set up between different machines, and they should be able to ping each other directly; otherwise, communication cannot be completed.
-- The code, data, and execution commands or scripts must be consistent across all machines, and the training command or script must be run on all machines. Finally, the first device of the first machine in the `Train.dist_ips` list will be trainer0, and so on.
\ No newline at end of file
+- The code, data, and execution commands or scripts must be consistent across all machines, and the training command or script must be run on all machines. Finally, the first device of the first machine in the `Train.dist_ips` list will be trainer0, and so on.
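To make the last point concrete, the launch below uses the configuration file referenced above with two placeholder IPs; it assumes the usual `-o` override syntax from the PaddleX training docs, and the identical command must be executed on every listed machine:

```bash
# Run this same command on every machine in Train.dist_ips; the first device of
# the first machine in the list becomes trainer0 (IPs are placeholders).
python main.py -c paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yaml \
    -o Global.mode=train \
    -o Train.dist_ips="192.168.0.1,192.168.0.2"
```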
diff --git a/docs/module_usage/instructions/distributed_training.md b/docs/module_usage/instructions/distributed_training.md
index ca09d55044..5cf29a0038 100644
--- a/docs/module_usage/instructions/distributed_training.md
+++ b/docs/module_usage/instructions/distributed_training.md
@@ -23,4 +23,4 @@ python main.py -c paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yam
- 不同机器的ip信息需要用逗号隔开,可以通过 `ifconfig` 或者 `ipconfig` 查看。
- 不同机器之间需要做免密设置,且可以直接ping通,否则无法完成通信。
-- 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终 `Train.dist_ips` 中的第一台机器的第一块设备是trainer0,以此类推。
\ No newline at end of file
+- 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终 `Train.dist_ips` 中的第一台机器的第一块设备是trainer0,以此类推。
diff --git a/docs/module_usage/instructions/model_python_API.en.md b/docs/module_usage/instructions/model_python_API.en.md
index 3d026b8479..aaebaac0d1 100644
--- a/docs/module_usage/instructions/model_python_API.en.md
+++ b/docs/module_usage/instructions/model_python_API.en.md
@@ -32,12 +32,13 @@ In short, just three steps:
* `create_model`: Instantiate the prediction model object;
* Parameters:
* `model_name`: `str` type, model name, such as "PP-LCNet_x1_0", "/path/to/PP-LCNet_x1_0_infer/";
- * `model_dir`: `str` type, local path to directory of inference model files ,such as "/path/to/PP-LCNet_x1_0_infer/", default to `None`, means that use the official model specified by `model_name`;
+ * `model_dir`: `str | None` type, local path to the directory of inference model files, such as "/path/to/PP-LCNet_x1_0_infer/"; defaults to `None`, which means that the official model specified by `model_name` is used, or that no local model is used;
* `batch_size`: `int` type, default to `1`;
* `device`: `str` type, used to set the inference device, such as "cpu", "gpu:2" for GPU settings. By default, using 0 id GPU if available, otherwise CPU;
* `pp_option`: `PaddlePredictorOption` type, used to change inference settings (e.g. the operating mode). Please refer to [4-Inference Configuration](#4-inference-configuration) for more details;
- * `use_hpip`:`bool` type, whether to enable the high-performance inference plugin;
- * `hpi_config`:`dict | None` type, high-performance inference configuration;
+ * `use_hpip`: `bool` type, whether to enable the high-performance inference plugin;
+ * `hpi_config`: `dict | None` type, high-performance inference configuration;
+ * `genai_config`: `dict | None` type, generative AI configuration;
* _`inference hyperparameters`_: used to set common inference hyperparameters. Please refer to specific model description document for details.
### 2. Perform Inference Prediction by Calling the `predict()` Method of the Prediction Model Object
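Put together, the parameter list above corresponds to a call shaped like the sketch below; the input image path is a placeholder, and `genai_config` is left as `None` because its keys depend on the generative AI backend in use:

```python
from paddlex import create_model

# Minimal sketch of the documented create_model signature.
model = create_model(
    model_name="PP-LCNet_x1_0",
    model_dir=None,      # None: use the official model for this model_name
    batch_size=1,
    device="gpu:0",      # or "cpu"
    use_hpip=False,
    genai_config=None,   # dict | None; backend-specific generative AI settings
)

for res in model.predict("test.png"):  # placeholder input path
    res.print()
```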
diff --git a/docs/module_usage/instructions/model_python_API.md b/docs/module_usage/instructions/model_python_API.md
index 2ec8352a87..47c923c6e8 100644
--- a/docs/module_usage/instructions/model_python_API.md
+++ b/docs/module_usage/instructions/model_python_API.md
@@ -33,12 +33,13 @@ for res in output:
* `create_model`:实例化预测模型对象;
* 参数:
* `model_name`:`str` 类型,模型名,如“PP-LCNet_x1_0”;
- * `model_dir`:`str` 类型,本地 inference 模型文件目录路径,如“/path/to/PP-LCNet_x1_0_infer/”,默认为 `None`,表示使用`model_name`指定的官方推理模型;
+ * `model_dir`:`str | None` 类型,本地 inference 模型文件目录路径,如“/path/to/PP-LCNet_x1_0_infer/”,默认为 `None`,表示使用`model_name`指定的官方推理模型或不使用本地模型;
* `batch_size`:`int` 类型,默认为 `1`;
* `device`:`str` 类型,用于设置模型推理设备,如为GPU设置则可以指定卡号,如“cpu”、“gpu:2”,默认情况下,如GPU可用,则使用GPU 0,否则使用CPU;
* `pp_option`:`PaddlePredictorOption` 类型,用于改变运行模式等配置项,关于推理配置的详细说明,请参考下文[4-推理配置](#4-推理配置);
* `use_hpip`:`bool` 类型,是否启用高性能推理插件;
* `hpi_config`:`dict | None` 类型,高性能推理配置;
+ * `genai_config`:`dict | None` 类型,生成式 AI 配置;
* _`推理超参数`_:支持常见推理超参数的修改,具体参数说明详见具体模型文档;
### 2. 调用预测模型对象的`predict()`方法进行推理预测
diff --git a/docs/module_usage/tutorials/ocr_modules/text_recognition.en.md b/docs/module_usage/tutorials/ocr_modules/text_recognition.en.md
index d18692a9a6..1405dbc5dc 100644
--- a/docs/module_usage/tutorials/ocr_modules/text_recognition.en.md
+++ b/docs/module_usage/tutorials/ocr_modules/text_recognition.en.md
@@ -310,6 +310,51 @@ el_PP-OCRv5_mobile_rec_infer.tar">Inference Model/Inference Model/Pretrained Model
+
+81.27 |
+- |
+- |
+7.6 |
+Ultra-lightweight Arabic character recognition model trained based on the PP-OCRv5 recognition model, supports Arabic letters and number recognition |
+
+
+| cyrillic_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+80.27 |
+- |
+- |
+7.7 |
+Ultra-lightweight Cyrillic character recognition model trained based on the PP-OCRv5 recognition model, supports Cyrillic letters and number recognition |
+
+
+| devanagari_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+84.96 |
+- |
+- |
+7.5 |
+Ultra-lightweight Devanagari script recognition model trained based on the PP-OCRv5 recognition model, supports Hindi, Sanskrit and other Devanagari letters, as well as number recognition |
+
+
+| te_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+87.65 |
+- |
+- |
+7.5 |
+Ultra-lightweight Telugu script recognition model trained based on the PP-OCRv5 recognition model, supports Telugu script and number recognition |
+
+
+| ta_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+94.2 |
+- |
+- |
+7.5 |
+Ultra-lightweight Tamil script recognition model trained based on the PP-OCRv5 recognition model, supports Tamil script and number recognition |
+
+
| korean_PP-OCRv3_mobile_rec |
Inference Model/Training Model |
60.21 |
diff --git a/docs/module_usage/tutorials/ocr_modules/text_recognition.md b/docs/module_usage/tutorials/ocr_modules/text_recognition.md
index 0a4b51f9db..a77cb0b8ba 100644
--- a/docs/module_usage/tutorials/ocr_modules/text_recognition.md
+++ b/docs/module_usage/tutorials/ocr_modules/text_recognition.md
@@ -319,6 +319,56 @@ el_PP-OCRv5_mobile_rec_infer.tar">推理模型/推理模型/训练模型
+81.27 |
+- |
+- |
+7.6 |
+基于PP-OCRv5识别模型训练得到的超轻量阿拉伯字母识别模型,支持阿拉伯字母、数字识别 |
+
+
+| cyrillic_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+80.27 |
+- |
+- |
+7.7 |
+基于PP-OCRv5识别模型训练得到的超轻量斯拉夫字母识别模型,支持斯拉夫字母、数字识别 |
+
+
+| devanagari_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+84.96 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量天城文识别模型,支持印地文、梵文等字母以及数字识别 |
+
+
+| te_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+87.65 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量泰卢固文识别模型,支持泰卢固文、数字识别 |
+
+
+| ta_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+94.2 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量泰米尔文识别模型,支持泰米尔文、数字识别 |
+
+
| korean_PP-OCRv3_mobile_rec |
推理模型/训练模型 |
diff --git a/docs/pipeline_deploy/high_performance_inference.en.md b/docs/pipeline_deploy/high_performance_inference.en.md
index 81a36174cc..74b32b8710 100644
--- a/docs/pipeline_deploy/high_performance_inference.en.md
+++ b/docs/pipeline_deploy/high_performance_inference.en.md
@@ -126,7 +126,7 @@ pip list | grep nvidia-cuda
pip list | grep nvidia-cudnn
```
-If you wish to use the Paddle Inference TensorRT subgraph engine, you will need to install TensorRT additionally. Please refer to the related instructions in the [PaddlePaddle Local Installation Tutorial](../installation/paddlepaddle_install.en.md). Note that because the underlying inference library of the high-performance inference plugin also integrates TensorRT, it is recommended to install the same version of TensorRT to avoid version conflicts. Currently, the TensorRT version integrated into the CUDA 11.8 high-performance inference plugin's underlying inference library is 8.6.1.6. If you are using the official PaddlePaddle image, you do not need to worry about version conflicts.
+If you wish to use the Paddle Inference TensorRT subgraph engine, you will need to install TensorRT additionally. Please refer to the related instructions in the [PaddlePaddle Local Installation Tutorial](../installation/paddlepaddle_install.en.md). Note that because the underlying inference library of the high-performance inference plugin also integrates TensorRT, it is recommended to install the same version of TensorRT to avoid version conflicts. Currently, the TensorRT version integrated into the CUDA 11.8 high-performance inference plugin's underlying inference library is 8.6.1.6. If you are using the official PaddlePaddle 3.0 Docker image, you do not need to worry about version conflicts.
After confirming that the correct versions of CUDA, cuDNN, and TensorRT (optional) are installed, run:
diff --git a/docs/pipeline_deploy/high_performance_inference.md b/docs/pipeline_deploy/high_performance_inference.md
index 199ef59679..3e46ea19b3 100644
--- a/docs/pipeline_deploy/high_performance_inference.md
+++ b/docs/pipeline_deploy/high_performance_inference.md
@@ -124,7 +124,7 @@ pip list | grep nvidia-cuda
pip list | grep nvidia-cudnn
```
-如果希望使用 Paddle Inference TensorRT 子图引擎,需额外安装 TensorRT。请参考 [飞桨PaddlePaddle本地安装教程](../installation/paddlepaddle_install.md) 中的相关说明。需要注意的是,由于高性能推理插件的底层推理库也集成了 TensorRT,建议安装相同版本的 TensorRT 以避免版本冲突。目前,CUDA 11.8 的高性能推理插件底层推理库集成的 TensorRT 版本为 8.6.1.6。如果使用的是飞桨框架官方镜像,则无需关心版本冲突问题。
+如果希望使用 Paddle Inference TensorRT 子图引擎,需额外安装 TensorRT。请参考 [飞桨PaddlePaddle本地安装教程](../installation/paddlepaddle_install.md) 中的相关说明。需要注意的是,由于高性能推理插件的底层推理库也集成了 TensorRT,建议安装相同版本的 TensorRT 以避免版本冲突。目前,CUDA 11.8 的高性能推理插件底层推理库集成的 TensorRT 版本为 8.6.1.6。如果使用的是飞桨框架 PaddlePaddle 3.0 的官方镜像,则无需关心版本冲突问题。
确认安装了正确版本的 CUDA、cuDNN、以及 TensorRT (可选)后,执行:
diff --git a/docs/pipeline_deploy/serving.en.md b/docs/pipeline_deploy/serving.en.md
index 08e600b123..3038a92872 100644
--- a/docs/pipeline_deploy/serving.en.md
+++ b/docs/pipeline_deploy/serving.en.md
@@ -128,131 +128,135 @@ Find the high-stability serving SDK corresponding to the pipeline in the table b
| PP-ChatOCR-doc v3 |
-paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz |
+paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz |
| General image classification |
-paddlex_hps_image_classification_sdk.tar.gz |
+paddlex_hps_image_classification_sdk.tar.gz |
| General object detection |
-paddlex_hps_object_detection_sdk.tar.gz |
+paddlex_hps_object_detection_sdk.tar.gz |
| General instance segmentation |
-paddlex_hps_instance_segmentation_sdk.tar.gz |
+paddlex_hps_instance_segmentation_sdk.tar.gz |
| General semantic segmentation |
-paddlex_hps_semantic_segmentation_sdk.tar.gz |
+paddlex_hps_semantic_segmentation_sdk.tar.gz |
| Image multi-label classification |
-paddlex_hps_image_multilabel_classification_sdk.tar.gz |
+paddlex_hps_image_multilabel_classification_sdk.tar.gz |
| General image recognition |
-paddlex_hps_PP-ShiTuV2_sdk.tar.gz |
+paddlex_hps_PP-ShiTuV2_sdk.tar.gz |
| Pedestrian attribute recognition |
-paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz |
+paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz |
| Vehicle attribute recognition |
-paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz |
+paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz |
| Face recognition |
-paddlex_hps_face_recognition_sdk.tar.gz |
+paddlex_hps_face_recognition_sdk.tar.gz |
| Small object detection |
-paddlex_hps_small_object_detection_sdk.tar.gz |
+paddlex_hps_small_object_detection_sdk.tar.gz |
| Image anomaly detection |
-paddlex_hps_anomaly_detection_sdk.tar.gz |
+paddlex_hps_anomaly_detection_sdk.tar.gz |
| Human keypoint detection |
-paddlex_hps_human_keypoint_detection_sdk.tar.gz |
+paddlex_hps_human_keypoint_detection_sdk.tar.gz |
| Open vocabulary detection |
-paddlex_hps_open_vocabulary_detection_sdk.tar.gz |
+paddlex_hps_open_vocabulary_detection_sdk.tar.gz |
| Open vocabulary segmentation |
-paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz |
+paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz |
| Rotated object detection |
-paddlex_hps_rotated_object_detection_sdk.tar.gz |
+paddlex_hps_rotated_object_detection_sdk.tar.gz |
| 3D multi-modal fusion detection |
-paddlex_hps_3d_bev_detection_sdk.tar.gz |
+paddlex_hps_3d_bev_detection_sdk.tar.gz |
| General OCR |
-paddlex_hps_OCR_sdk.tar.gz |
+paddlex_hps_OCR_sdk.tar.gz |
| General table recognition |
-paddlex_hps_table_recognition_sdk.tar.gz |
+paddlex_hps_table_recognition_sdk.tar.gz |
| General table recognition v2 |
-paddlex_hps_table_recognition_v2_sdk.tar.gz |
+paddlex_hps_table_recognition_v2_sdk.tar.gz |
| General layout parsing |
-paddlex_hps_layout_parsing_sdk.tar.gz |
+paddlex_hps_layout_parsing_sdk.tar.gz |
| PP-StructureV3 |
-paddlex_hps_PP-StructureV3_sdk.tar.gz |
+paddlex_hps_PP-StructureV3_sdk.tar.gz |
| Formula recognition |
-paddlex_hps_formula_recognition_sdk.tar.gz |
+paddlex_hps_formula_recognition_sdk.tar.gz |
| Seal text recognition |
-paddlex_hps_seal_recognition_sdk.tar.gz |
+paddlex_hps_seal_recognition_sdk.tar.gz |
| Document image preprocessing |
-paddlex_hps_doc_preprocessor_sdk.tar.gz |
+paddlex_hps_doc_preprocessor_sdk.tar.gz |
| Time series forecasting |
-paddlex_hps_ts_forecast_sdk.tar.gz |
+paddlex_hps_ts_forecast_sdk.tar.gz |
| Time series anomaly detection |
-paddlex_hps_ts_anomaly_detection_sdk.tar.gz |
+paddlex_hps_ts_anomaly_detection_sdk.tar.gz |
| Time series classification |
-paddlex_hps_ts_classification_sdk.tar.gz |
+paddlex_hps_ts_classification_sdk.tar.gz |
| Multilingual speech recognition |
-paddlex_hps_multilingual_speech_recognition_sdk.tar.gz |
+paddlex_hps_multilingual_speech_recognition_sdk.tar.gz |
| General video classification |
-paddlex_hps_video_classification_sdk.tar.gz |
+paddlex_hps_video_classification_sdk.tar.gz |
| General video detection |
-paddlex_hps_video_detection_sdk.tar.gz |
+paddlex_hps_video_detection_sdk.tar.gz |
| Document understanding |
-paddlex_hps_doc_understanding_sdk.tar.gz |
+paddlex_hps_doc_understanding_sdk.tar.gz |
+
+
+| PaddleOCR-VL |
+paddlex_hps_PaddleOCR-VL_sdk.tar.gz |
@@ -310,13 +314,13 @@ First, pull the Docker image as needed:
- Image supporting deployment with NVIDIA GPU (the machine must have NVIDIA drivers that support CUDA 11.8 installed):
```bash
- docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.2-gpu
+ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu
```
- CPU-only Image:
```bash
- docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.2-cpu
+ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-cpu
```
If you need to build the image on your own, please refer to [the `hps` project documentation](https://github.com/PaddlePaddle/PaddleX/blob/develop/deploy/hps/README_en.md#1-image-building)
diff --git a/docs/pipeline_deploy/serving.md b/docs/pipeline_deploy/serving.md
index 88cf1d311d..9515cfb919 100644
--- a/docs/pipeline_deploy/serving.md
+++ b/docs/pipeline_deploy/serving.md
@@ -128,131 +128,135 @@ paddlex --serve --pipeline image_classification --use_hpip
| 文档场景信息抽取 v3 |
-paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz |
+paddlex_hps_PP-ChatOCRv3-doc_sdk.tar.gz |
| 通用图像分类 |
-paddlex_hps_image_classification_sdk.tar.gz |
+paddlex_hps_image_classification_sdk.tar.gz |
| 通用目标检测 |
-paddlex_hps_object_detection_sdk.tar.gz |
+paddlex_hps_object_detection_sdk.tar.gz |
| 通用实例分割 |
-paddlex_hps_instance_segmentation_sdk.tar.gz |
+paddlex_hps_instance_segmentation_sdk.tar.gz |
| 通用语义分割 |
-paddlex_hps_semantic_segmentation_sdk.tar.gz |
+paddlex_hps_semantic_segmentation_sdk.tar.gz |
| 通用图像多标签分类 |
-paddlex_hps_image_multilabel_classification_sdk.tar.gz |
+paddlex_hps_image_multilabel_classification_sdk.tar.gz |
| 通用图像识别 |
-paddlex_hps_PP-ShiTuV2_sdk.tar.gz |
+paddlex_hps_PP-ShiTuV2_sdk.tar.gz |
| 行人属性识别 |
-paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz |
+paddlex_hps_pedestrian_attribute_recognition_sdk.tar.gz |
| 车辆属性识别 |
-paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz |
+paddlex_hps_vehicle_attribute_recognition_sdk.tar.gz |
| 人脸识别 |
-paddlex_hps_face_recognition_sdk.tar.gz |
+paddlex_hps_face_recognition_sdk.tar.gz |
| 小目标检测 |
-paddlex_hps_small_object_detection_sdk.tar.gz |
+paddlex_hps_small_object_detection_sdk.tar.gz |
| 图像异常检测 |
-paddlex_hps_anomaly_detection_sdk.tar.gz |
+paddlex_hps_anomaly_detection_sdk.tar.gz |
| 人体关键点检测 |
-paddlex_hps_human_keypoint_detection_sdk.tar.gz |
+paddlex_hps_human_keypoint_detection_sdk.tar.gz |
| 开放词汇检测 |
-paddlex_hps_open_vocabulary_detection_sdk.tar.gz |
+paddlex_hps_open_vocabulary_detection_sdk.tar.gz |
| 开放词汇分割 |
-paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz |
+paddlex_hps_open_vocabulary_segmentation_sdk.tar.gz |
| 旋转目标检测 |
-paddlex_hps_rotated_object_detection_sdk.tar.gz |
+paddlex_hps_rotated_object_detection_sdk.tar.gz |
| 3D 多模态融合检测 |
-paddlex_hps_3d_bev_detection_sdk.tar.gz |
+paddlex_hps_3d_bev_detection_sdk.tar.gz |
| 通用 OCR |
-paddlex_hps_OCR_sdk.tar.gz |
+paddlex_hps_OCR_sdk.tar.gz |
| 通用表格识别 |
-paddlex_hps_table_recognition_sdk.tar.gz |
+paddlex_hps_table_recognition_sdk.tar.gz |
| 通用表格识别 v2 |
-paddlex_hps_table_recognition_v2_sdk.tar.gz |
+paddlex_hps_table_recognition_v2_sdk.tar.gz |
| 通用版面解析 |
-paddlex_hps_layout_parsing_sdk.tar.gz |
+paddlex_hps_layout_parsing_sdk.tar.gz |
| 通用版面解析 v3 |
-paddlex_hps_PP-StructureV3_sdk.tar.gz |
+paddlex_hps_PP-StructureV3_sdk.tar.gz |
| 公式识别 |
-paddlex_hps_formula_recognition_sdk.tar.gz |
+paddlex_hps_formula_recognition_sdk.tar.gz |
| 印章文本识别 |
-paddlex_hps_seal_recognition_sdk.tar.gz |
+paddlex_hps_seal_recognition_sdk.tar.gz |
| 文档图像预处理 |
-paddlex_hps_doc_preprocessor_sdk.tar.gz |
+paddlex_hps_doc_preprocessor_sdk.tar.gz |
| 时序预测 |
-paddlex_hps_ts_forecast_sdk.tar.gz |
+paddlex_hps_ts_forecast_sdk.tar.gz |
| 时序异常检测 |
-paddlex_hps_ts_anomaly_detection_sdk.tar.gz |
+paddlex_hps_ts_anomaly_detection_sdk.tar.gz |
| 时序分类 |
-paddlex_hps_ts_classification_sdk.tar.gz |
+paddlex_hps_ts_classification_sdk.tar.gz |
| 多语种语音识别 |
-paddlex_hps_multilingual_speech_recognition_sdk.tar.gz |
+paddlex_hps_multilingual_speech_recognition_sdk.tar.gz |
| 通用视频分类 |
-paddlex_hps_video_classification_sdk.tar.gz |
+paddlex_hps_video_classification_sdk.tar.gz |
| 通用视频检测 |
-paddlex_hps_video_detection_sdk.tar.gz |
+paddlex_hps_video_detection_sdk.tar.gz |
| 文档理解 |
-paddlex_hps_doc_understanding_sdk.tar.gz |
+paddlex_hps_doc_understanding_sdk.tar.gz |
+
+
+| PaddleOCR-VL |
+paddlex_hps_PaddleOCR-VL_sdk.tar.gz |
@@ -310,13 +314,13 @@ paddlex --serve --pipeline image_classification --use_hpip
- 支持使用 NVIDIA GPU 部署的镜像(机器上需要安装有支持 CUDA 11.8 的 NVIDIA 驱动):
```bash
- docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.2-gpu
+ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-gpu
```
- CPU-only 镜像:
```bash
- docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.2-cpu
+ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlex/hps:paddlex3.3-cpu
```
如需自定义构建镜像可参考 [`hps` 项目文档](https://github.com/PaddlePaddle/PaddleX/blob/develop/deploy/hps/README.md#1-镜像构建)。
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.en.md
index 3264f481d3..72974ff53f 100644
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.en.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.en.md
@@ -471,6 +471,51 @@ el_PP-OCRv5_mobile_rec_infer.tar">Inference Model/Inference Model/Pretrained Model
+81.27 |
+- |
+- |
+7.6 |
+Ultra-lightweight Arabic character recognition model trained based on the PP-OCRv5 recognition model, supports Arabic letters and number recognition |
+
+
+| cyrillic_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+80.27 |
+- |
+- |
+7.7 |
+Ultra-lightweight Cyrillic character recognition model trained based on the PP-OCRv5 recognition model, supports Cyrillic letters and number recognition |
+
+
+| devanagari_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+84.96 |
+- |
+- |
+7.5 |
+Ultra-lightweight Devanagari script recognition model trained based on the PP-OCRv5 recognition model, supports Hindi, Sanskrit and other Devanagari letters, as well as number recognition |
+
+
+| te_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+87.65 |
+- |
+- |
+7.5 |
+Ultra-lightweight Telugu script recognition model trained based on the PP-OCRv5 recognition model, supports Telugu script and number recognition |
+
+
+| ta_PP-OCRv5_mobile_rec |
+Inference Model/Pretrained Model |
+94.2 |
+- |
+- |
+7.5 |
+Ultra-lightweight Tamil script recognition model trained based on the PP-OCRv5 recognition model, supports Tamil script and number recognition |
+
+
| korean_PP-OCRv3_mobile_rec |
Inference Model/Training Model |
60.21 |
@@ -2439,6 +2484,12 @@ To remove the page limit, please add the following configuration to the pipeline
No |
+returnWordBox |
+boolean | null |
+Please refer to the description of the return_word_box parameter of the pipeline object's predict method. |
+No |
+
+
visualize |
boolean | null |
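The new `returnWordBox` field maps onto the `return_word_box` argument of the pipeline object's `predict` method; a minimal sketch of the Python-side equivalent, with a placeholder input path:

```python
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="OCR")

# return_word_box mirrors the serving API's returnWordBox field.
for res in pipeline.predict("doc_page.png", return_word_box=True):
    res.print()
    res.save_to_json(save_path="./output/")
```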
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.md b/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.md
index e834074777..056c9ddcff 100644
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/OCR.md
@@ -448,6 +448,56 @@ el_PP-OCRv5_mobile_rec_infer.tar">推理模型/推理模型/训练模型 |
+81.27 |
+- |
+- |
+7.6 |
+基于PP-OCRv5识别模型训练得到的超轻量阿拉伯字母识别模型,支持阿拉伯字母、数字识别 |
+
+
+| cyrillic_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+80.27 |
+- |
+- |
+7.7 |
+基于PP-OCRv5识别模型训练得到的超轻量斯拉夫字母识别模型,支持斯拉夫字母、数字识别 |
+
+
+| devanagari_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+84.96 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量天城文识别模型,支持印地文、梵文等字母以及数字识别 |
+
+
+| te_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+87.65 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量泰卢固文识别模型,支持泰卢固文、数字识别 |
+
+
+| ta_PP-OCRv5_mobile_rec |
+推理模型/训练模型 |
+94.2 |
+- |
+- |
+7.5 |
+基于PP-OCRv5识别模型训练得到的超轻量泰米尔文识别模型,支持泰米尔文、数字识别 |
+
+
| korean_PP-OCRv3_mobile_rec |
推理模型/训练模型 |
@@ -2415,6 +2465,12 @@ for res in output:
否 |
+returnWordBox |
+boolean | null |
+请参阅产线对象中 predict 方法的 return_word_box 参数相关说明。 |
+否 |
+
+
visualize |
boolean | null |
是否返回可视化结果图以及处理过程中的中间图像等。
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
index 73ad164049..2ff08a0beb 100644
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
@@ -1352,35 +1352,25 @@ After running, the result will be printed to the terminal, as follows:
👉Click to Expand
-{'res': {'input_path': 'pp_structure_v3_demo.png', 'model_settings': {'use_doc_preprocessor': False, 'use_general_ocr': True, 'use_seal_recognition': True, 'use_table_recognition': True, 'use_formula_recognition': True}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 2, 'label': 'text', 'score': 0.9853514432907104, 'coordinate': [770.9531, 776.6814, 1122.6057, 1058.7322]}, {'cls_id': 1, 'label': 'image', 'score': 0.9848673939704895, 'coordinate': [775.7434, 202.27979, 1502.8113, 686.02136]}, {'cls_id': 2, 'label': 'text', 'score': 0.983731746673584, 'coordinate': [1152.3197, 1113.3275, 1503.3029, 1346.586]}, {'cls_id': 2, 'label': 'text', 'score': 0.9832221865653992, 'coordinate': [1152.5602, 801.431, 1503.8436, 986.3563]}, {'cls_id': 2, 'label': 'text', 'score': 0.9829439520835876, 'coordinate': [9.549545, 849.5713, 359.1173, 1058.7488]}, {'cls_id': 2, 'label': 'text', 'score': 0.9811657667160034, 'coordinate': [389.58298, 1137.2659, 740.66235, 1346.7488]}, {'cls_id': 2, 'label': 'text', 'score': 0.9775941371917725, 'coordinate': [9.1302185, 201.85, 359.0409, 339.05692]}, {'cls_id': 2, 'label': 'text', 'score': 0.9750366806983948, 'coordinate': [389.71454, 752.96924, 740.544, 889.92456]}, {'cls_id': 2, 'label': 'text', 'score': 0.9738152027130127, 'coordinate': [389.94565, 298.55988, 740.5585, 435.5124]}, {'cls_id': 2, 'label': 'text', 'score': 0.9737328290939331, 'coordinate': [771.50256, 1065.4697, 1122.2582, 1178.7324]}, {'cls_id': 2, 'label': 'text', 'score': 0.9728517532348633, 'coordinate': [1152.5154, 993.3312, 1503.2349, 1106.327]}, {'cls_id': 2, 'label': 'text', 'score': 0.9725610017776489, 'coordinate': [9.372787, 1185.823, 359.31738, 1298.7227]}, {'cls_id': 2, 'label': 'text', 'score': 0.9724331498146057, 'coordinate': [389.62848, 610.7389, 740.83234, 746.2377]}, {'cls_id': 2, 'label': 'text', 'score': 0.9720287322998047, 'coordinate': [389.29898, 897.0936, 741.41516, 1034.6616]}, {'cls_id': 2, 'label': 'text', 'score': 0.9713053703308105, 'coordinate': [10.323685, 1065.4663, 359.6786, 1178.8872]}, {'cls_id': 2, 'label': 'text', 'score': 0.9689728021621704, 'coordinate': [9.336395, 537.6609, 359.2901, 652.1881]}, {'cls_id': 2, 'label': 'text', 'score': 0.9684857130050659, 'coordinate': [10.7608185, 345.95068, 358.93616, 434.64087]}, {'cls_id': 2, 'label': 'text', 'score': 0.9681928753852844, 'coordinate': [9.674866, 658.89075, 359.56528, 770.4319]}, {'cls_id': 2, 'label': 'text', 'score': 0.9634978175163269, 'coordinate': [770.9464, 1281.1785, 1122.6522, 1346.7156]}, {'cls_id': 2, 'label': 'text', 'score': 0.96304851770401, 'coordinate': [390.0113, 201.28055, 740.1684, 291.53073]}, {'cls_id': 2, 'label': 'text', 'score': 0.962053120136261, 'coordinate': [391.21393, 1040.952, 740.5046, 1130.32]}, {'cls_id': 2, 'label': 'text', 'score': 0.9565253853797913, 'coordinate': [10.113251, 777.1482, 359.439, 842.437]}, {'cls_id': 2, 'label': 'text', 'score': 0.9497362375259399, 'coordinate': [390.31357, 537.86285, 740.47595, 603.9285]}, {'cls_id': 2, 'label': 'text', 'score': 0.9371236562728882, 'coordinate': [10.2034, 1305.9753, 359.5958, 1346.7295]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9338151216506958, 'coordinate': [791.6062, 1200.8479, 1103.3257, 1259.9324]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9326773285865784, 'coordinate': [408.0737, 457.37024, 718.9509, 516.63464]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9274250864982605, 'coordinate': [29.448685, 456.6762, 340.99194, 515.6999]}, {'cls_id': 2, 
'label': 'text', 'score': 0.8742568492889404, 'coordinate': [1154.7095, 777.3624, 1330.3086, 794.5853]}, {'cls_id': 2, 'label': 'text', 'score': 0.8442489504814148, 'coordinate': [586.49316, 160.15454, 927.468, 179.64203]}, {'cls_id': 11, 'label': 'doc_title', 'score': 0.8332607746124268, 'coordinate': [133.80017, 37.41908, 1380.8601, 124.1429]}, {'cls_id': 6, 'label': 'figure_title', 'score': 0.6770150661468506, 'coordinate': [812.1718, 705.1199, 1484.6973, 747.1692]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 133, 35],
+{'res': {'input_path': 'pp_structure_v3_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_seal_recognition': False, 'use_table_recognition': True, 'use_formula_recognition': True, 'use_chart_recognition': False, 'use_region_detection': True}, 'parsing_res_list': [{'block_label': 'doc_title', 'block_content': '助力双方交往搭建友谊桥梁', 'block_bbox': [133, 36, 1379, 123], 'block_id': 0, 'block_order': 1}, {'block_label': 'text', 'block_content': '本报记者沈小晓任彦黄培昭', 'block_bbox': [584, 159, 927, 179], 'block_id': 1, 'block_order': 2}, {'block_label': 'image', 'block_content': '', 'block_bbox': [774, 201, 1502, 685], 'block_id': 2, 'block_order': None}, {'block_label': 'figure_title', 'block_content': '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。中国驻厄立特里亚大使馆供图', 'block_bbox': [808, 704, 1484, 747], 'block_id': 3, 'block_order': None}, {'block_label': 'text', 'block_content': '身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前危立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院")举办“喜迎新年"中国歌舞比赛的场景。\n', 'block_bbox': [9, 201, 358, 338], 'block_id': 4, 'block_order': 3}, {'block_label': 'text', 'block_content': '中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。\n', 'block_bbox': [9, 345, 358, 435], 'block_id': 5, 'block_order': 4}, {'block_label': 'paragraph_title', 'block_content': '“学好中文,我们的未来不是梦”\n', 'block_bbox': [28, 456, 339, 514], 'block_id': 6, 'block_order': 5}, {'block_label': 'text', 'block_content': '“鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……"厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。', 'block_bbox': [9, 536, 358, 651], 'block_id': 7, 'block_order': 6}, {'block_label': 'text', 'block_content': '这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。', 'block_bbox': [9, 658, 359, 770], 'block_id': 8, 'block_order': 7}, {'block_label': 'text', 'block_content': '“这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”尤斯拉告诉记者。', 'block_bbox': [10, 776, 359, 842], 'block_id': 9, 'block_order': 8}, {'block_label': 'text', 'block_content': '尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届“汉语桥"世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛,获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。“中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。', 'block_bbox': [9, 848, 358, 1057], 'block_id': 10, 'block_order': 9}, {'block_label': 'text', 'block_content': '“姐姐,你想去中国吗?”“非常想!我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。\n', 'block_bbox': [9, 1064, 358, 1177], 'block_id': 11, 'block_order': 10}, {'block_label': 'text', 'block_content': '露娅对记者说:“这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”', 'block_bbox': [8, 1184, 358, 1297], 'block_id': 12, 'block_order': 11}, {'block_label': 'text', 'block_content': '据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和', 'block_bbox': [10, 1304, 358, 1346], 'block_id': 13, 'block_order': 12}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。', 'block_bbox': [388, 200, 740, 290], 'block_id': 14, 'block_order': 13}, {'block_label': 'text', 'block_content': '黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为危特孔院提供全新的办学场地。\n', 'block_bbox': [389, 297, 740, 435], 'block_id': 15, 'block_order': 14}, {'block_label': 'paragraph_title', 'block_content': '“在中国学习的经历让我看到更广阔的世界”', 'block_bbox': [409, 456, 718, 515], 'block_id': 16, 'block_order': 15}, {'block_label': 'text', 'block_content': '多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。', 'block_bbox': [389, 537, 740, 
603], 'block_id': 17, 'block_order': 16}, {'block_label': 'text', 'block_content': '在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。\n', 'block_bbox': [389, 609, 740, 745], 'block_id': 18, 'block_order': 17}, {'block_label': 'text', 'block_content': '谈起在中国求学的经历,约翰娜记忆犹新:“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n', 'block_bbox': [389, 752, 740, 889], 'block_id': 19, 'block_order': 18}, {'block_label': 'text', 'block_content': '正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:“这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”', 'block_bbox': [389, 896, 740, 1033], 'block_id': 20, 'block_order': 19}, {'block_label': 'text', 'block_content': '穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”\n', 'block_bbox': [389, 1040, 740, 1129], 'block_id': 21, 'block_order': 20}, {'block_label': 'text', 'block_content': '23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现干分优秀,在2024年厄立特里亚赛区的“汉语桥”比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和纯粹。我也喜欢中国的服饰,希望未来能去中国学习,把中国不同民族元素融入服装设计中,创作出更多精美作品,也把厄特文化分享给更多的中国朋友。”\n', 'block_bbox': [389, 1136, 740, 1345], 'block_id': 22, 'block_order': 21}, {'block_label': 'text', 'block_content': '“不管远近都是客人,请不用客气;相约好了在一起,我们欢迎你……”在一场中厄青年联谊活动上,四川路桥中方员工同当地大学生合唱《北京欢迎你》。厄立特里亚技术学院计算机科学与工程专业学生鲁夫塔·谢拉是其中一名演唱者,她很早便在孔院学习中文,一直在为去中国留学作准备。“这句歌词是我们两国人民友谊的生动写照。无论是投身于厄立特里亚基础设施建设的中企员工,还是在中国留学的厄立特里亚学子,两国人民携手努力,必将推动两国关系不断向前发展。”鲁夫塔说。\n', 'block_bbox': [769, 776, 1121, 1058], 'block_id': 23, 'block_order': 22}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育委员会主任助理萨马瑞表示:“每年我们都会组织学生到中国访问学习,自前有超过5000名厄立特里亚学生在中国留学。学习中国的教育经验,有助于提升厄立特里亚的教育水平。”', 'block_bbox': [770, 1064, 1121, 1177], 'block_id': 24, 'block_order': 23}, {'block_label': 'paragraph_title', 'block_content': '“共同向世界展示非洲和亚洲的灿烂文明”', 'block_bbox': [790, 1200, 1102, 1259], 'block_id': 25, 'block_order': 24}, {'block_label': 'text', 'block_content': '从阿斯马拉出发,沿着蜿蜒曲折的盘山公路一路向东寻找丝路印迹。驱车两个小时,记者来到位于厄立特里亚港口城市马萨', 'block_bbox': [770, 1280, 1122, 1346], 'block_id': 26, 'block_order': 25}, {'block_label': 'text', 'block_content': '瓦的北红海省博物馆。', 'block_bbox': [1154, 776, 1331, 794], 'block_id': 27, 'block_order': 26}, {'block_label': 'text', 'block_content': '博物馆二层陈列着一个发掘自阿杜利斯古城的中国古代陶制酒器,罐身上写着“万”“和”“禅”“山”等汉字。“这件文物证明,很早以前我们就通过海上丝绸之路进行贸易往来与文化交流。这也是厄立特里亚与中国友好交往历史的有力证明。”北红海省博物馆研究与文献部负责人伊萨亚斯·特斯法兹吉说。\n', 'block_bbox': [1152, 800, 1502, 986], 'block_id': 28, 'block_order': 27}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆考古学和人类学研究员菲尔蒙·特韦尔德十分喜爱中国文化。他表示:“学习彼此的语言和文化,将帮助厄中两国人民更好地理解彼此,助力双方交往,搭建友谊桥梁。”\n', 'block_bbox': [1152, 992, 1502, 1106], 'block_id': 29, 'block_order': 28}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆馆长塔吉丁·努重达姆·优素福曾多次访问中国,对中华文明的传承与创新、现代化博物馆的建设与发展印象深刻。“中国博物馆不仅有许多保存完好的文物,还充分运用先进科技手段进行展示,帮助人们更好理解中华文明。”塔吉丁说,“危立特里亚与中国都拥有悠久的文明,始终相互理解、相互尊重。我希望未来与中国同行加强合作,共同向世界展示非洲和亚洲的灿烂文明。”\n', 'block_bbox': [1151, 1112, 1502, 1346], 'block_id': 30, 'block_order': 29}], 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 1, 'label': 'image', 'score': 0.9864752888679504, 'coordinate': [774.821, 201.05176, 1502.1008, 685.7733]}, {'cls_id': 2, 'label': 'text', 'score': 0.9859225749969482, 'coordinate': [769.8655, 776.2444, 1121.5986, 1058.4167]}, {'cls_id': 2, 'label': 'text', 'score': 0.9857110381126404, 'coordinate': [1151.98, 1112.5356, 1502.7852, 1346.3569]}, {'cls_id': 2, 'label': 'text', 'score': 0.9847239255905151, 'coordinate': [389.0322, 1136.3547, 740.2322, 1345.928]}, {'cls_id': 2, 'label': 'text', 'score': 
0.9842492938041687, 'coordinate': [1152.1504, 800.1625, 1502.1265, 986.1522]}, {'cls_id': 2, 'label': 'text', 'score': 0.9840831160545349, 'coordinate': [9.158066, 848.8696, 358.5725, 1057.832]}, {'cls_id': 2, 'label': 'text', 'score': 0.9802583456039429, 'coordinate': [9.335953, 201.10046, 358.31543, 338.78876]}, {'cls_id': 2, 'label': 'text', 'score': 0.9801402688026428, 'coordinate': [389.1556, 297.4113, 740.07556, 435.41647]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793564081192017, 'coordinate': [389.18976, 752.0959, 740.0832, 889.88043]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793409109115601, 'coordinate': [389.02496, 896.34143, 740.7431, 1033.9465]}, {'cls_id': 2, 'label': 'text', 'score': 0.9776486754417419, 'coordinate': [8.950775, 1184.7842, 358.75067, 1297.8755]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773538708686829, 'coordinate': [770.7178, 1064.5714, 1121.2249, 1177.9928]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773064255714417, 'coordinate': [389.38086, 609.7071, 740.0553, 745.3206]}, {'cls_id': 2, 'label': 'text', 'score': 0.9765821099281311, 'coordinate': [1152.0115, 992.296, 1502.4929, 1106.1166]}, {'cls_id': 2, 'label': 'text', 'score': 0.9761461019515991, 'coordinate': [9.46727, 536.993, 358.2047, 651.32025]}, {'cls_id': 2, 'label': 'text', 'score': 0.975399911403656, 'coordinate': [9.353531, 1064.3059, 358.45312, 1177.8347]}, {'cls_id': 2, 'label': 'text', 'score': 0.9730532169342041, 'coordinate': [9.932312, 345.36237, 358.03476, 435.1646]}, {'cls_id': 2, 'label': 'text', 'score': 0.9722575545310974, 'coordinate': [388.91736, 200.93637, 740.00793, 290.80692]}, {'cls_id': 2, 'label': 'text', 'score': 0.9710634350776672, 'coordinate': [389.39496, 1040.3186, 740.0091, 1129.7168]}, {'cls_id': 2, 'label': 'text', 'score': 0.9696939587593079, 'coordinate': [9.6145935, 658.1123, 359.06088, 770.0288]}, {'cls_id': 2, 'label': 'text', 'score': 0.9664148092269897, 'coordinate': [770.235, 1280.4562, 1122.0927, 1346.4742]}, {'cls_id': 2, 'label': 'text', 'score': 0.9597565531730652, 'coordinate': [389.66678, 537.5609, 740.06274, 603.17725]}, {'cls_id': 2, 'label': 'text', 'score': 0.9594324827194214, 'coordinate': [10.162949, 776.86414, 359.08307, 842.1771]}, {'cls_id': 2, 'label': 'text', 'score': 0.9484634399414062, 'coordinate': [10.402863, 1304.7743, 358.9441, 1346.3749]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9476125240325928, 'coordinate': [28.159409, 456.7627, 339.5631, 514.9665]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9427680969238281, 'coordinate': [790.6992, 1200.3663, 1102.3799, 1259.1647]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9424256682395935, 'coordinate': [409.02832, 456.6831, 718.8154, 515.5757]}, {'cls_id': 10, 'label': 'doc_title', 'score': 0.9376171827316284, 'coordinate': [133.77905, 36.884415, 1379.6667, 123.46867]}, {'cls_id': 2, 'label': 'text', 'score': 0.9020254015922546, 'coordinate': [584.9165, 159.1416, 927.22876, 179.01605]}, {'cls_id': 2, 'label': 'text', 'score': 0.895164430141449, 'coordinate': [1154.3364, 776.74646, 1331.8564, 794.2301]}, {'cls_id': 6, 'label': 'figure_title', 'score': 0.7892374992370605, 'coordinate': [808.9641, 704.2555, 1484.0623, 747.2296]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 129, 42],
...,
- [ 133, 131]],
+ [ 129, 140]],
...,
- [[1154, 1323],
+ [[1156, 1330],
...,
- [1152, 1355]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.6, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者', '沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前厄立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称"厄特孔院")举办"喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建"一带一路"框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年峻工,建成后将为厄', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '“鲜花曾告诉我你怎样走过,大地知道你', '多年来,厄立特里亚广大赴华留学生和', '心中的每一个角落…"厄立特里亚阿斯马拉', '培训人员积极投身国家建设,成为助力该国', '大学综合楼二层,一阵优美的歌声在走廊里回', '发展的人才和厄中友好的见证者和推动者。', '响。循着熟悉的旋律轻轻推开一间教室的门,', '在厄立特里亚全国妇女联盟工作的约翰', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '这是厄特孔院阿斯马拉大学教学点的一', '中华女子学院攻读硕士学位,研究方向是女', '节中文歌曲课。为了让学生们更好地理解歌', '性领导力与社会发展。其间,她实地走访中国', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '字翻译和解释歌词。随着伴奏声响起,学生们', '手资料。', '中国驻厄立特里亚大使馆供图', '边唱边随着节拍摇动身体,现场气氛热烈。', '谈起在中国求学的经历,约翰娜记忆犹', '“这是中文歌曲初级班,共有32人。学', '新:"中国的发展在当今世界是独一无二的。', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '沿着中国特色社会主义道路坚定前行,中国', '好了在一起我们欢迎你"在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。”尤斯拉告诉记者。', '创造了发展奇迹,这一切都离不开中国共产党', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '的领导。中国的发展经验值得许多国家学习', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万""和""禅"“山"等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '借鉴。”', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届"汉语桥"世界中学生', '正在西南大学学习的厄立特里亚博士生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '文,一直在为去中国留学作准备。"这句歌词', '与中国友好交往历史的有力证明。"北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '年前,在北京师范大学获得硕士学位后,穆卢', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '盖塔在社交媒体上写下这样一段话:"这是我', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。“中国', '人生的重要一步,自此我拥有了一双坚固的', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '鞋子,赋予我穿越荆棘的力量。”', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。"她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。"鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?""非常想!我想', '育等领域的发展,中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。"尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:"每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。"', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。', '问学习,目前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '里达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:"这些年来,怀着对中文', '现十分优秀,在2024年厄立特里亚赛区的', '“共同向世界展示非', '印象深刻。“中国博物馆不仅有许多保存完好', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥"比赛中获得一等奖。莉迪亚说:"学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。"塔吉丁说,"厄', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!"', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜓曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”'], 'rec_scores': array([0.99943757, ..., 0.98181838]), 'rec_polys': array([[[ 133, 35],
+ [1156, 1351]]], dtype=int16), 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前危立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称“厄特孔院")举办“喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建“一带一路”框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年竣工,建成后将为危', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '“鲜花曾告诉我你怎样走过,大地知道你', '多年来,厄立特里亚广大赴华留学生和', '心中的每一个角落……"厄立特里亚阿斯马拉', '培训人员积极投身国家建设,成为助力该国', '大学综合楼二层,一阵优美的歌声在走廊里回', '发展的人才和厄中友好的见证者和推动者。', '响。循着熟悉的旋律轻轻推开一间教室的门,', '在厄立特里亚全国妇女联盟工作的约翰', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '这是厄特孔院阿斯马拉大学教学点的一', '中华女子学院攻读硕士学位,研究方向是女', '节中文歌曲课。为了让学生们更好地理解歌', '性领导力与社会发展。其间,她实地走访中国', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '字翻译和解释歌词。随着伴奏声响起,学生们', '手资料。', '中国驻厄立特里亚大使馆供图', '边唱边随着节拍摇动身体,现场气氛热烈。', '谈起在中国求学的经历,约翰娜记忆犹', '“这是中文歌曲初级班,共有32人。学', '新:“中国的发展在当今世界是独一无二的。', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '沿着中国特色社会主义道路坚定前行,中国', '好了在一起,我们欢迎你……”在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。”尤斯拉告诉记者。', '创造了发展奇迹,这一切都离不开中国共产党', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '的领导。中国的发展经验值得许多国家学习', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万”“和”“禅”“山”等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '借鉴。”', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届“汉语桥"世界中学生', '正在西南大学学习的厄立特里亚博士生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '文,一直在为去中国留学作准备。“这句歌词', '与中国友好交往历史的有力证明。”北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '年前,在北京师范大学获得硕士学位后,穆卢', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '盖塔在社交媒体上写下这样一段话:“这是我', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。“中国', '人生的重要一步,自此我拥有了一双坚固的', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '鞋子,赋予我穿越荆棘的力量。”', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。”她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。”鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?”“非常想!我想', '育等领域的发展,“中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。”尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:“每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。”', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。”', '问学习,自前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '重达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:“这些年来,怀着对中文', '现干分优秀,在2024年厄立特里亚赛区的', '印象深刻。“中国博物馆不仅有许多保存完好', '“共同向世界展示非', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥”比赛中获得一等奖。莉迪亚说:“学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。”塔吉丁说,“危', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!”', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜒曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”'], 'rec_scores': array([0.99113536, ..., 0.95110023]), 'rec_polys': array([[[ 129, 42],
...,
- [ 133, 131]],
+ [ 129, 140]],
...,
- [[1154, 1323],
+ [[1156, 1330],
...,
- [1152, 1355]]], dtype=int16), 'rec_boxes': array([[ 133, ..., 131],
+ [1156, 1351]]], dtype=int16), 'rec_boxes': array([[ 129, ..., 140],
...,
- [1152, ..., 1359]], dtype=int16)}, 'text_paragraphs_ocr_res': {'rec_polys': array([[[ 133, 35],
- ...,
- [ 133, 131]],
-
- ...,
-
- [[1154, 1323],
- ...,
- [1152, 1355]]], dtype=int16), 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者', '沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前厄立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称"厄特孔院")举办"喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建"一带一路"框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年峻工,建成后将为厄', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '“鲜花曾告诉我你怎样走过,大地知道你', '多年来,厄立特里亚广大赴华留学生和', '心中的每一个角落…"厄立特里亚阿斯马拉', '培训人员积极投身国家建设,成为助力该国', '大学综合楼二层,一阵优美的歌声在走廊里回', '发展的人才和厄中友好的见证者和推动者。', '响。循着熟悉的旋律轻轻推开一间教室的门,', '在厄立特里亚全国妇女联盟工作的约翰', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '这是厄特孔院阿斯马拉大学教学点的一', '中华女子学院攻读硕士学位,研究方向是女', '节中文歌曲课。为了让学生们更好地理解歌', '性领导力与社会发展。其间,她实地走访中国', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '字翻译和解释歌词。随着伴奏声响起,学生们', '手资料。', '中国驻厄立特里亚大使馆供图', '边唱边随着节拍摇动身体,现场气氛热烈。', '谈起在中国求学的经历,约翰娜记忆犹', '“这是中文歌曲初级班,共有32人。学', '新:"中国的发展在当今世界是独一无二的。', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '沿着中国特色社会主义道路坚定前行,中国', '好了在一起我们欢迎你"在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。”尤斯拉告诉记者。', '创造了发展奇迹,这一切都离不开中国共产党', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '的领导。中国的发展经验值得许多国家学习', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万""和""禅"“山"等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '借鉴。”', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届"汉语桥"世界中学生', '正在西南大学学习的厄立特里亚博士生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '文,一直在为去中国留学作准备。"这句歌词', '与中国友好交往历史的有力证明。"北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '年前,在北京师范大学获得硕士学位后,穆卢', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '盖塔在社交媒体上写下这样一段话:"这是我', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。“中国', '人生的重要一步,自此我拥有了一双坚固的', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '鞋子,赋予我穿越荆棘的力量。”', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。"她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。"鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?""非常想!我想', '育等领域的发展,中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。"尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:"每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。"', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。', '问学习,目前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '里达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:"这些年来,怀着对中文', '现十分优秀,在2024年厄立特里亚赛区的', '“共同向世界展示非', '印象深刻。“中国博物馆不仅有许多保存完好', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥"比赛中获得一等奖。莉迪亚说:"学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。"塔吉丁说,"厄', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!"', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜓曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”'], 'rec_scores': array([0.99943757, ..., 0.98181838]), 'rec_boxes': array([[ 133, ..., 131],
- ...,
- [1152, ..., 1359]], dtype=int16)}}}
+ [1156, ..., 1351]], dtype=int16)}}}
@@ -1582,7 +1572,7 @@ In the above Python script, the following steps are executed:
|
- bool:
True or False;
-- None: If set to
None, the default value initialized in the pipeline will be used, initialized as True;
+- None: If set to
None, the default value initialized in the pipeline will be used, initialized as False;
|
None |
@@ -2004,11 +1994,14 @@ In the above Python script, the following steps are executed:
- `use_seal_recognition`: `(bool)` Controls whether to enable the seal recognition sub-line.
- `use_table_recognition`: `(bool)` Controls whether to enable the table recognition sub-line.
- `use_formula_recognition`: `(bool)` Controls whether to enable the formula recognition sub-line.
+  - `format_block_content`: `(bool)` Controls whether to format the `block_content` as Markdown.
- `parsing_res_list`: `(List[Dict])` A list of parsing results, where each element is a dictionary. The order of the list is the reading order after parsing.
- - `layout_bbox`: `(np.ndarray)` The bounding box of the layout area.
- - `{label}`: `(str)` The key is the label of the layout area, such as `text`, `table`, etc., and the content is the content within the layout area.
- - `layout`: `(str)` The layout type, such as `double`, `single`, etc.
+ - `block_bbox`: `(np.ndarray)` The bounding box of the layout area.
+ - `block_label`: `(str)` The label of the layout area, such as `text`, `table`, etc.
+ - `block_content`: `(str)` The content within the layout area.
+ - `block_id`: `(int)` The index of the layout area, used to display the layout sorting result.
+ - `block_order`: `(int)` The order of the layout area, used to display the reading order of the layout. For non-ordered parts, the default value is `None`.
- `overall_ocr_res`: `(Dict[str, Union[List[str], List[float], numpy.ndarray]])` A dictionary of global OCR results
- `input_path`: `(Union[str, None])` The image path accepted by the image OCR sub-line. When the input is a `numpy.ndarray`, it is saved as `None`.
@@ -2305,6 +2298,12 @@ To remove the page limit, please add the following configuration to the pipeline
No |
+formatBlockContent |
+boolean | null |
+Please refer to the description of the format_block_content parameter of the pipeline object's predict method. |
+No |
+
+
layoutThreshold |
number | object | null |
Please refer to the description of the layout_threshold parameter of the pipeline object's predict method. |
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
index 27c51bb067..8195f203fc 100644
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
@@ -1316,25 +1316,25 @@ paddlex --pipeline PP-StructureV3 \
👉点击展开
-{'res': {'input_path': 'pp_structure_v3_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_general_ocr': True, 'use_seal_recognition': True, 'use_table_recognition': True, 'use_formula_recognition': True}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 2, 'label': 'text', 'score': 0.9853514432907104, 'coordinate': [770.9531, 776.6814, 1122.6057, 1058.7322]}, {'cls_id': 1, 'label': 'image', 'score': 0.9848673939704895, 'coordinate': [775.7434, 202.27979, 1502.8113, 686.02136]}, {'cls_id': 2, 'label': 'text', 'score': 0.983731746673584, 'coordinate': [1152.3197, 1113.3275, 1503.3029, 1346.586]}, {'cls_id': 2, 'label': 'text', 'score': 0.9832221865653992, 'coordinate': [1152.5602, 801.431, 1503.8436, 986.3563]}, {'cls_id': 2, 'label': 'text', 'score': 0.9829439520835876, 'coordinate': [9.549545, 849.5713, 359.1173, 1058.7488]}, {'cls_id': 2, 'label': 'text', 'score': 0.9811657667160034, 'coordinate': [389.58298, 1137.2659, 740.66235, 1346.7488]}, {'cls_id': 2, 'label': 'text', 'score': 0.9775941371917725, 'coordinate': [9.1302185, 201.85, 359.0409, 339.05692]}, {'cls_id': 2, 'label': 'text', 'score': 0.9750366806983948, 'coordinate': [389.71454, 752.96924, 740.544, 889.92456]}, {'cls_id': 2, 'label': 'text', 'score': 0.9738152027130127, 'coordinate': [389.94565, 298.55988, 740.5585, 435.5124]}, {'cls_id': 2, 'label': 'text', 'score': 0.9737328290939331, 'coordinate': [771.50256, 1065.4697, 1122.2582, 1178.7324]}, {'cls_id': 2, 'label': 'text', 'score': 0.9728517532348633, 'coordinate': [1152.5154, 993.3312, 1503.2349, 1106.327]}, {'cls_id': 2, 'label': 'text', 'score': 0.9725610017776489, 'coordinate': [9.372787, 1185.823, 359.31738, 1298.7227]}, {'cls_id': 2, 'label': 'text', 'score': 0.9724331498146057, 'coordinate': [389.62848, 610.7389, 740.83234, 746.2377]}, {'cls_id': 2, 'label': 'text', 'score': 0.9720287322998047, 'coordinate': [389.29898, 897.0936, 741.41516, 1034.6616]}, {'cls_id': 2, 'label': 'text', 'score': 0.9713053703308105, 'coordinate': [10.323685, 1065.4663, 359.6786, 1178.8872]}, {'cls_id': 2, 'label': 'text', 'score': 0.9689728021621704, 'coordinate': [9.336395, 537.6609, 359.2901, 652.1881]}, {'cls_id': 2, 'label': 'text', 'score': 0.9684857130050659, 'coordinate': [10.7608185, 345.95068, 358.93616, 434.64087]}, {'cls_id': 2, 'label': 'text', 'score': 0.9681928753852844, 'coordinate': [9.674866, 658.89075, 359.56528, 770.4319]}, {'cls_id': 2, 'label': 'text', 'score': 0.9634978175163269, 'coordinate': [770.9464, 1281.1785, 1122.6522, 1346.7156]}, {'cls_id': 2, 'label': 'text', 'score': 0.96304851770401, 'coordinate': [390.0113, 201.28055, 740.1684, 291.53073]}, {'cls_id': 2, 'label': 'text', 'score': 0.962053120136261, 'coordinate': [391.21393, 1040.952, 740.5046, 1130.32]}, {'cls_id': 2, 'label': 'text', 'score': 0.9565253853797913, 'coordinate': [10.113251, 777.1482, 359.439, 842.437]}, {'cls_id': 2, 'label': 'text', 'score': 0.9497362375259399, 'coordinate': [390.31357, 537.86285, 740.47595, 603.9285]}, {'cls_id': 2, 'label': 'text', 'score': 0.9371236562728882, 'coordinate': [10.2034, 1305.9753, 359.5958, 1346.7295]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9338151216506958, 'coordinate': [791.6062, 1200.8479, 1103.3257, 1259.9324]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9326773285865784, 'coordinate': [408.0737, 457.37024, 718.9509, 516.63464]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9274250864982605, 'coordinate': [29.448685, 456.6762, 340.99194, 
515.6999]}, {'cls_id': 2, 'label': 'text', 'score': 0.8742568492889404, 'coordinate': [1154.7095, 777.3624, 1330.3086, 794.5853]}, {'cls_id': 2, 'label': 'text', 'score': 0.8442489504814148, 'coordinate': [586.49316, 160.15454, 927.468, 179.64203]}, {'cls_id': 11, 'label': 'doc_title', 'score': 0.8332607746124268, 'coordinate': [133.80017, 37.41908, 1380.8601, 124.1429]}, {'cls_id': 6, 'label': 'figure_title', 'score': 0.6770150661468506, 'coordinate': [812.1718, 705.1199, 1484.6973, 747.1692]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[133, 35],
+{'res': {'input_path': 'pp_structure_v3_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_seal_recognition': False, 'use_table_recognition': True, 'use_formula_recognition': True, 'use_chart_recognition': False, 'use_region_detection': True}, 'parsing_res_list': [{'block_label': 'doc_title', 'block_content': '助力双方交往搭建友谊桥梁', 'block_bbox': [133, 36, 1379, 123], 'block_id': 0, 'block_order': 1}, {'block_label': 'text', 'block_content': '本报记者沈小晓任彦黄培昭', 'block_bbox': [584, 159, 927, 179], 'block_id': 1, 'block_order': 2}, {'block_label': 'image', 'block_content': '', 'block_bbox': [774, 201, 1502, 685], 'block_id': 2, 'block_order': None}, {'block_label': 'figure_title', 'block_content': '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。中国驻厄立特里亚大使馆供图', 'block_bbox': [808, 704, 1484, 747], 'block_id': 3, 'block_order': None}, {'block_label': 'text', 'block_content': '身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前危立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院")举办“喜迎新年"中国歌舞比赛的场景。\n', 'block_bbox': [9, 201, 358, 338], 'block_id': 4, 'block_order': 3}, {'block_label': 'text', 'block_content': '中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。\n', 'block_bbox': [9, 345, 358, 435], 'block_id': 5, 'block_order': 4}, {'block_label': 'paragraph_title', 'block_content': '“学好中文,我们的未来不是梦”\n', 'block_bbox': [28, 456, 339, 514], 'block_id': 6, 'block_order': 5}, {'block_label': 'text', 'block_content': '“鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……"厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。', 'block_bbox': [9, 536, 358, 651], 'block_id': 7, 'block_order': 6}, {'block_label': 'text', 'block_content': '这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。', 'block_bbox': [9, 658, 359, 770], 'block_id': 8, 'block_order': 7}, {'block_label': 'text', 'block_content': '“这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”尤斯拉告诉记者。', 'block_bbox': [10, 776, 359, 842], 'block_id': 9, 'block_order': 8}, {'block_label': 'text', 'block_content': '尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届“汉语桥"世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛,获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。“中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。', 'block_bbox': [9, 848, 358, 1057], 'block_id': 10, 'block_order': 9}, {'block_label': 'text', 'block_content': '“姐姐,你想去中国吗?”“非常想!我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。\n', 'block_bbox': [9, 1064, 358, 1177], 'block_id': 11, 'block_order': 10}, {'block_label': 'text', 'block_content': '露娅对记者说:“这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”', 'block_bbox': [8, 1184, 358, 1297], 'block_id': 12, 'block_order': 11}, {'block_label': 'text', 'block_content': '据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和', 'block_bbox': [10, 1304, 358, 1346], 'block_id': 13, 'block_order': 12}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。', 'block_bbox': [388, 200, 740, 290], 'block_id': 14, 'block_order': 13}, {'block_label': 'text', 'block_content': '黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为危特孔院提供全新的办学场地。\n', 'block_bbox': [389, 297, 740, 435], 'block_id': 15, 'block_order': 14}, {'block_label': 'paragraph_title', 'block_content': '“在中国学习的经历让我看到更广阔的世界”', 'block_bbox': [409, 456, 718, 515], 'block_id': 16, 'block_order': 15}, {'block_label': 'text', 'block_content': '多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。', 'block_bbox': [389, 537, 740, 
603], 'block_id': 17, 'block_order': 16}, {'block_label': 'text', 'block_content': '在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。\n', 'block_bbox': [389, 609, 740, 745], 'block_id': 18, 'block_order': 17}, {'block_label': 'text', 'block_content': '谈起在中国求学的经历,约翰娜记忆犹新:“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n', 'block_bbox': [389, 752, 740, 889], 'block_id': 19, 'block_order': 18}, {'block_label': 'text', 'block_content': '正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:“这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”', 'block_bbox': [389, 896, 740, 1033], 'block_id': 20, 'block_order': 19}, {'block_label': 'text', 'block_content': '穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”\n', 'block_bbox': [389, 1040, 740, 1129], 'block_id': 21, 'block_order': 20}, {'block_label': 'text', 'block_content': '23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现干分优秀,在2024年厄立特里亚赛区的“汉语桥”比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和纯粹。我也喜欢中国的服饰,希望未来能去中国学习,把中国不同民族元素融入服装设计中,创作出更多精美作品,也把厄特文化分享给更多的中国朋友。”\n', 'block_bbox': [389, 1136, 740, 1345], 'block_id': 22, 'block_order': 21}, {'block_label': 'text', 'block_content': '“不管远近都是客人,请不用客气;相约好了在一起,我们欢迎你……”在一场中厄青年联谊活动上,四川路桥中方员工同当地大学生合唱《北京欢迎你》。厄立特里亚技术学院计算机科学与工程专业学生鲁夫塔·谢拉是其中一名演唱者,她很早便在孔院学习中文,一直在为去中国留学作准备。“这句歌词是我们两国人民友谊的生动写照。无论是投身于厄立特里亚基础设施建设的中企员工,还是在中国留学的厄立特里亚学子,两国人民携手努力,必将推动两国关系不断向前发展。”鲁夫塔说。\n', 'block_bbox': [769, 776, 1121, 1058], 'block_id': 23, 'block_order': 22}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育委员会主任助理萨马瑞表示:“每年我们都会组织学生到中国访问学习,自前有超过5000名厄立特里亚学生在中国留学。学习中国的教育经验,有助于提升厄立特里亚的教育水平。”', 'block_bbox': [770, 1064, 1121, 1177], 'block_id': 24, 'block_order': 23}, {'block_label': 'paragraph_title', 'block_content': '“共同向世界展示非洲和亚洲的灿烂文明”', 'block_bbox': [790, 1200, 1102, 1259], 'block_id': 25, 'block_order': 24}, {'block_label': 'text', 'block_content': '从阿斯马拉出发,沿着蜿蜒曲折的盘山公路一路向东寻找丝路印迹。驱车两个小时,记者来到位于厄立特里亚港口城市马萨', 'block_bbox': [770, 1280, 1122, 1346], 'block_id': 26, 'block_order': 25}, {'block_label': 'text', 'block_content': '瓦的北红海省博物馆。', 'block_bbox': [1154, 776, 1331, 794], 'block_id': 27, 'block_order': 26}, {'block_label': 'text', 'block_content': '博物馆二层陈列着一个发掘自阿杜利斯古城的中国古代陶制酒器,罐身上写着“万”“和”“禅”“山”等汉字。“这件文物证明,很早以前我们就通过海上丝绸之路进行贸易往来与文化交流。这也是厄立特里亚与中国友好交往历史的有力证明。”北红海省博物馆研究与文献部负责人伊萨亚斯·特斯法兹吉说。\n', 'block_bbox': [1152, 800, 1502, 986], 'block_id': 28, 'block_order': 27}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆考古学和人类学研究员菲尔蒙·特韦尔德十分喜爱中国文化。他表示:“学习彼此的语言和文化,将帮助厄中两国人民更好地理解彼此,助力双方交往,搭建友谊桥梁。”\n', 'block_bbox': [1152, 992, 1502, 1106], 'block_id': 29, 'block_order': 28}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆馆长塔吉丁·努重达姆·优素福曾多次访问中国,对中华文明的传承与创新、现代化博物馆的建设与发展印象深刻。“中国博物馆不仅有许多保存完好的文物,还充分运用先进科技手段进行展示,帮助人们更好理解中华文明。”塔吉丁说,“危立特里亚与中国都拥有悠久的文明,始终相互理解、相互尊重。我希望未来与中国同行加强合作,共同向世界展示非洲和亚洲的灿烂文明。”\n', 'block_bbox': [1151, 1112, 1502, 1346], 'block_id': 30, 'block_order': 29}], 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 1, 'label': 'image', 'score': 0.9864752888679504, 'coordinate': [774.821, 201.05176, 1502.1008, 685.7733]}, {'cls_id': 2, 'label': 'text', 'score': 0.9859225749969482, 'coordinate': [769.8655, 776.2444, 1121.5986, 1058.4167]}, {'cls_id': 2, 'label': 'text', 'score': 0.9857110381126404, 'coordinate': [1151.98, 1112.5356, 1502.7852, 1346.3569]}, {'cls_id': 2, 'label': 'text', 'score': 0.9847239255905151, 'coordinate': [389.0322, 1136.3547, 740.2322, 1345.928]}, {'cls_id': 2, 'label': 'text', 'score': 
0.9842492938041687, 'coordinate': [1152.1504, 800.1625, 1502.1265, 986.1522]}, {'cls_id': 2, 'label': 'text', 'score': 0.9840831160545349, 'coordinate': [9.158066, 848.8696, 358.5725, 1057.832]}, {'cls_id': 2, 'label': 'text', 'score': 0.9802583456039429, 'coordinate': [9.335953, 201.10046, 358.31543, 338.78876]}, {'cls_id': 2, 'label': 'text', 'score': 0.9801402688026428, 'coordinate': [389.1556, 297.4113, 740.07556, 435.41647]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793564081192017, 'coordinate': [389.18976, 752.0959, 740.0832, 889.88043]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793409109115601, 'coordinate': [389.02496, 896.34143, 740.7431, 1033.9465]}, {'cls_id': 2, 'label': 'text', 'score': 0.9776486754417419, 'coordinate': [8.950775, 1184.7842, 358.75067, 1297.8755]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773538708686829, 'coordinate': [770.7178, 1064.5714, 1121.2249, 1177.9928]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773064255714417, 'coordinate': [389.38086, 609.7071, 740.0553, 745.3206]}, {'cls_id': 2, 'label': 'text', 'score': 0.9765821099281311, 'coordinate': [1152.0115, 992.296, 1502.4929, 1106.1166]}, {'cls_id': 2, 'label': 'text', 'score': 0.9761461019515991, 'coordinate': [9.46727, 536.993, 358.2047, 651.32025]}, {'cls_id': 2, 'label': 'text', 'score': 0.975399911403656, 'coordinate': [9.353531, 1064.3059, 358.45312, 1177.8347]}, {'cls_id': 2, 'label': 'text', 'score': 0.9730532169342041, 'coordinate': [9.932312, 345.36237, 358.03476, 435.1646]}, {'cls_id': 2, 'label': 'text', 'score': 0.9722575545310974, 'coordinate': [388.91736, 200.93637, 740.00793, 290.80692]}, {'cls_id': 2, 'label': 'text', 'score': 0.9710634350776672, 'coordinate': [389.39496, 1040.3186, 740.0091, 1129.7168]}, {'cls_id': 2, 'label': 'text', 'score': 0.9696939587593079, 'coordinate': [9.6145935, 658.1123, 359.06088, 770.0288]}, {'cls_id': 2, 'label': 'text', 'score': 0.9664148092269897, 'coordinate': [770.235, 1280.4562, 1122.0927, 1346.4742]}, {'cls_id': 2, 'label': 'text', 'score': 0.9597565531730652, 'coordinate': [389.66678, 537.5609, 740.06274, 603.17725]}, {'cls_id': 2, 'label': 'text', 'score': 0.9594324827194214, 'coordinate': [10.162949, 776.86414, 359.08307, 842.1771]}, {'cls_id': 2, 'label': 'text', 'score': 0.9484634399414062, 'coordinate': [10.402863, 1304.7743, 358.9441, 1346.3749]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9476125240325928, 'coordinate': [28.159409, 456.7627, 339.5631, 514.9665]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9427680969238281, 'coordinate': [790.6992, 1200.3663, 1102.3799, 1259.1647]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9424256682395935, 'coordinate': [409.02832, 456.6831, 718.8154, 515.5757]}, {'cls_id': 10, 'label': 'doc_title', 'score': 0.9376171827316284, 'coordinate': [133.77905, 36.884415, 1379.6667, 123.46867]}, {'cls_id': 2, 'label': 'text', 'score': 0.9020254015922546, 'coordinate': [584.9165, 159.1416, 927.22876, 179.01605]}, {'cls_id': 2, 'label': 'text', 'score': 0.895164430141449, 'coordinate': [1154.3364, 776.74646, 1331.8564, 794.2301]}, {'cls_id': 6, 'label': 'figure_title', 'score': 0.7892374992370605, 'coordinate': [808.9641, 704.2555, 1484.0623, 747.2296]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 129, 42],
...,
- [133, 131]],
+ [ 129, 140]],
...,
- [[ 13, 754],
+ [[1156, 1330],
...,
- [ 13, 777]]], dtype=int16), 'text_det_params': {'limit_side_len': 960, 'limit_type': 'max', 'thresh': 0.3, 'box_thresh': 0.6, 'unclip_ratio': 2.0}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者', '沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前厄立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称"厄特孔院")举办"喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建"一带一路"框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年竣工,建成后将为厄', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '多年来,厄立特里亚广大赴华留学生和', '培训人员积极投身国家建设,成为助力该国', '发展的人才和厄中友好的见证者和推动者。', '在厄立特里亚全国妇女联盟工作的约翰', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '中华女子学院攻读硕士学位,研究方向是女', '性领导力与社会发展。其间,她实地走访中国', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '手资料。', '中国驻厄立特里亚大使馆供图', '“这是中文歌曲初级班,共有32人。学', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '好了在一起,我们欢迎你"在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。"尤斯拉告诉记者。', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万”“和""禅”“山"等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届"汉语桥"世界中学生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '文,一直在为去中国留学作准备。“这句歌词', '与中国友好交往历史的有力证明。"北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。中国', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。"她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。"鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?"“非常想!我想', '育等领域的发展,“中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。"尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:“每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。"', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。', '问学习,目前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '里达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:“这些年来,怀着对中文', '现十分优秀,在2024年厄立特里亚赛区的', '“共同向世界展示非', '印象深刻。“中国博物馆不仅有许多保存完好', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥"比赛中获得一等奖。莉迪亚说:“学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。"塔吉丁说,厄', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!"', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜒曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”', '谈起在中国求学的经历,约翰娜记忆犹', '新:“中国的发展在当今世界是独一无二的。', '沿着中国特色社会主义道路坚定前行,中国', '创造了发展奇迹,这一切都离不开中国共产党', '的领导。中国的发展经验值得许多国家学习', '借鉴,”', '正在西南大学学习的厄立特里亚博士生', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '年前,在北京师范大学获得硕士学位后,穆卢', '盖塔在社交媒体上写下这样一段话:“这是我', '人生的重要一步,自此我拥有了一双坚固的', '鞋子.赋予我穿越荆棘的力量。”', '“鲜花曾告诉我你怎样走过,大地知道你', '心中的每一个角落"厄立特里亚阿斯马拉', '大学综合楼二层,一阵优美的歌声在走廊里回', '响。循着熟悉的旋律轻轻推开一间教室的门,', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '这是厄特孔院阿斯马拉大学教学点的一', '节中文歌曲课。为了让学生们更好地理解歌', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '字翻译和解释歌词。随着伴奏声响起,学生们', '边唱边随着节拍摇动身体,现场气氛热烈。'], 'rec_scores': array([0.99972075, ..., 0.96241361]), 'rec_polys': array([[[133, 35],
+ [1156, 1351]]], dtype=int16), 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前危立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称“厄特孔院")举办“喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建“一带一路”框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年竣工,建成后将为危', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '“鲜花曾告诉我你怎样走过,大地知道你', '多年来,厄立特里亚广大赴华留学生和', '心中的每一个角落……"厄立特里亚阿斯马拉', '培训人员积极投身国家建设,成为助力该国', '大学综合楼二层,一阵优美的歌声在走廊里回', '发展的人才和厄中友好的见证者和推动者。', '响。循着熟悉的旋律轻轻推开一间教室的门,', '在厄立特里亚全国妇女联盟工作的约翰', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '这是厄特孔院阿斯马拉大学教学点的一', '中华女子学院攻读硕士学位,研究方向是女', '节中文歌曲课。为了让学生们更好地理解歌', '性领导力与社会发展。其间,她实地走访中国', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '字翻译和解释歌词。随着伴奏声响起,学生们', '手资料。', '中国驻厄立特里亚大使馆供图', '边唱边随着节拍摇动身体,现场气氛热烈。', '谈起在中国求学的经历,约翰娜记忆犹', '“这是中文歌曲初级班,共有32人。学', '新:“中国的发展在当今世界是独一无二的。', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '沿着中国特色社会主义道路坚定前行,中国', '好了在一起,我们欢迎你……”在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。”尤斯拉告诉记者。', '创造了发展奇迹,这一切都离不开中国共产党', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '的领导。中国的发展经验值得许多国家学习', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万”“和”“禅”“山”等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '借鉴。”', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届“汉语桥"世界中学生', '正在西南大学学习的厄立特里亚博士生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '文,一直在为去中国留学作准备。“这句歌词', '与中国友好交往历史的有力证明。”北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '年前,在北京师范大学获得硕士学位后,穆卢', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '盖塔在社交媒体上写下这样一段话:“这是我', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。“中国', '人生的重要一步,自此我拥有了一双坚固的', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '鞋子,赋予我穿越荆棘的力量。”', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。”她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。”鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?”“非常想!我想', '育等领域的发展,“中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。”尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:“每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。”', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。”', '问学习,自前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '重达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:“这些年来,怀着对中文', '现干分优秀,在2024年厄立特里亚赛区的', '印象深刻。“中国博物馆不仅有许多保存完好', '“共同向世界展示非', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥”比赛中获得一等奖。莉迪亚说:“学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。”塔吉丁说,“危', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!”', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜒曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”'], 'rec_scores': array([0.99113536, ..., 0.95110023]), 'rec_polys': array([[[ 129, 42],
...,
- [133, 131]],
+ [ 129, 140]],
...,
- [[ 13, 754],
+ [[1156, 1330],
...,
- [ 13, 777]]], dtype=int16), 'rec_boxes': array([[133, ..., 131],
+ [1156, 1351]]], dtype=int16), 'rec_boxes': array([[ 129, ..., 140],
...,
- [ 13, ..., 777]], dtype=int16)}}}
+ [1156, ..., 1351]], dtype=int16)}}}
运行结果参数说明可以参考[2.2.2 Python脚本方式集成](#222-python脚本方式集成)中的结果解释。
@@ -1540,7 +1540,7 @@ for item in markdown_images:
- bool:
True 或者 False;
-- None:如果设置为
None, 将默认使用产线初始化的该参数值,初始化为True;
+- None:如果设置为
None, 将默认使用产线初始化的该参数值,初始化为False;
|
None |
@@ -1956,6 +1956,7 @@ for item in markdown_images:
- `use_seal_recognition`: `(bool)` 控制是否启用印章识别子产线
- `use_table_recognition`: `(bool)` 控制是否启用表格识别子产线
- `use_formula_recognition`: `(bool)` 控制是否启用公式识别子产线
+ - `format_block_content`: `(bool)` 控制是否将 `block_content` 中的内容格式化为Markdown格式
- `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果字典,仅当`use_doc_preprocessor=True`时存在
- `input_path`: `(str)` 文档预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`,此处为`None`
@@ -1969,11 +1970,8 @@ for item in markdown_images:
- `block_bbox`: `(np.ndarray)` 版面区域的边界框。
- `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。
- `block_content`: `(str)` 内容为版面区域内的内容。
- - `seg_start_flag`: `(bool)` 标识该版面区域是否是段落的开始。
- - `seg_end_flag`: `(bool)` 标识该版面区域是否是段落的结束。
- - `sub_label`: `(str)` 版面区域的子标签,例如`text`的子标签可能为`title_text`。
- - `sub_index`: `(int)` 版面区域的子索引,用于恢复Markdown。
- - `index`: `(int)` 版面区域的索引,用于显示版面排序结果。
+ - `block_id`: `(int)` 版面区域的索引,用于显示版面排序结果。
+    - `block_order`: `(int)` 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 `None`。
@@ -2257,6 +2255,12 @@ for res in output:
否 |
+formatBlockContent |
+boolean | null |
+请参阅产线对象中 predict 方法的 format_block_content 参数相关说明。 |
+否 |
+
+
layoutThreshold |
number | object | null |
请参阅产线对象中 predict 方法的 layout_threshold 参数相关说明。 |
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md
new file mode 100644
index 0000000000..d7d8db37bb
--- /dev/null
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md
@@ -0,0 +1,2031 @@
+---
+comments: true
+---
+
+# PaddleOCR-VL产线使用教程
+
+## PaddleOCR-VL产线介绍
+
+<待补充>
+
+### 1.1 模型基准测试数据
+
+如您更考虑模型精度,请选择精度较高的模型;如您更考虑模型推理速度,请选择推理速度较快的模型;如您更考虑模型存储大小,请选择存储大小较小的模型。
+
+> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。
+
+ 👉模型列表详情
+文档图像方向分类模块:
+
+| 模型 | 模型下载链接 | Top-1 Acc(%) | GPU推理耗时(ms) [常规模式 / 高性能模式] | CPU推理耗时(ms) [常规模式 / 高性能模式] | 模型存储大小(MB) | 介绍 |
+|---|---|---|---|---|---|---|
+| PP-LCNet_x1_0_doc_ori | 推理模型/训练模型 | 99.06 | 2.62 / 0.59 | 3.24 / 1.19 | 7 | 基于PP-LCNet_x1_0的文档图像分类模型,含有四个类别,即0度,90度,180度,270度 |
+
+
+文本图像矫正模块:
+
+| 模型 | 模型下载链接 | CER | GPU推理耗时(ms) [常规模式 / 高性能模式] | CPU推理耗时(ms) [常规模式 / 高性能模式] | 模型存储大小(MB) | 介绍 |
+|---|---|---|---|---|---|---|
+| UVDoc | 推理模型/训练模型 | 0.179 | 19.05 / 19.05 | - / 869.82 | 30.3 | 高精度文本图像矫正模型 |
+
+
+
+版面区域检测排序模块模型:
+
+| 模型 | 模型下载链接 | mAP(0.5)(%) | GPU推理耗时(ms) [常规模式 / 高性能模式] | CPU推理耗时(ms) [常规模式 / 高性能模式] | 模型存储大小(MB) | 介绍 |
+|---|---|---|---|---|---|---|
+| PP-DocLayoutV2 | 推理模型/训练模型 | - | - / - | - / - | - | 基于RT-DETR-L在包含中英文论文、杂志、合同、书本、试卷和研报等场景的自建数据集训练的高精度版面区域定位和区域排序一体模型 |
+
+
+多模态识别模块模型:
+
+| 模型 | 模型下载链接 | Top1 Acc(%) | GPU推理耗时(ms) [常规模式 / 高性能模式] | CPU推理耗时(ms) [常规模式 / 高性能模式] | 模型存储大小(MB) |
+|---|---|---|---|---|---|
+| PaddleOCR-VL | 推理模型/训练模型 | - | - / - | - / - | - |
+
+
+测试环境说明:
+
+
+ - 性能测试环境
+
+ - 测试数据集:
+
+
+ - 文档图像方向分类模型:PaddleX 自建的数据集,覆盖证件和文档等多个场景,包含 1000 张图片。
+ - 文本图像矫正模型:DocUNet。
+ - 版面区域检测模型:PaddleOCR 自建的版面区域分析数据集,包含中英文论文、杂志和研报等常见的 1w 张文档类型图片。
+ - PP-DocLayout_plus-L:PaddleOCR 自建的版面区域检测数据集,包含中英文论文、杂志、报纸、研报、PPT、试卷、课本等 1300 张文档类型图片。
+
+
+ - 硬件配置:
+
+ - GPU:NVIDIA Tesla T4
+ - CPU:Intel Xeon Gold 6271C @ 2.60GHz
+
+
+ - 软件环境:
+
+ - Ubuntu 20.04 / CUDA 11.8 / cuDNN 8.9 / TensorRT 8.6.1.6
+ - paddlepaddle 3.0.0 / paddlex 3.0.3
+
+
+
+
+ - 推理模式说明
+
+| 模式 | GPU配置 | CPU配置 | 加速技术组合 |
+|---|---|---|---|
+| 常规模式 | FP32精度 / 无TRT加速 | FP32精度 / 8线程 | PaddleInference |
+| 高性能模式 | 选择先验精度类型和加速策略的最优组合 | FP32精度 / 8线程 | 选择先验最优后端(Paddle/OpenVINO/TRT等) |
+
+
+
+### 1.2 产线基准测试数据
+
+
+点击展开/折叠表格
+<待补充>
+
+| 流水线配置 | 硬件 | 平均推理时间 (s) | 峰值CPU利用率 (%) | 平均CPU利用率 (%) | 峰值主机内存 (MB) | 平均主机内存 (MB) | 峰值GPU利用率 (%) | 平均GPU利用率 (%) | 峰值设备内存 (MB) | 平均设备内存 (MB) |
+|---|---|---|---|---|---|---|---|---|---|---|
+| PP_StructureV3-default | Intel 8350C + A100 | 1.38 | 1384.60 | 113.26 | 5781.59 | 3431.21 | 100 | 32.79 | 37370.00 | 34165.68 |
+| PP_StructureV3-default | Intel 6271C + V100 | 2.38 | 608.70 | 109.96 | 6388.91 | 3737.19 | 100 | 39.08 | 26824.00 | 24581.61 |
+| PP_StructureV3-default | Intel 8563C + H20 | 1.36 | 744.30 | 112.82 | 6199.01 | 3865.78 | 100 | 43.81 | 35132.00 | 32077.12 |
+| PP_StructureV3-default | Intel 8350C + A10 | 1.74 | 418.50 | 105.96 | 6138.25 | 3503.41 | 100 | 48.54 | 18536.00 | 18353.93 |
+| PP_StructureV3-default | Intel 6271C + T4 | 3.70 | 434.40 | 105.45 | 6865.87 | 3595.68 | 100 | 71.92 | 13970.00 | 12668.58 |
+
+
+
+| Pipeline configuration | description |
+|---|---|
+| PP_StructureV3-default | 默认配置 |
+| PP_StructureV3-pp | 默认配置基础上,开启文档图像预处理 |
+| PP_StructureV3-full | 默认配置基础上,开启文档图像预处理和图表解析 |
+| PP_StructureV3-seal | 默认配置基础上,开启印章文本识别 |
+| PP_StructureV3-chart | 默认配置基础上,开启文档图表解析 |
+| PP_StructureV3-notable | 默认配置基础上,关闭表格识别 |
+| PP_StructureV3-noformula | 默认配置基础上,关闭公式识别 |
+| PP_StructureV3-lightweight | 默认配置基础上,将所有任务模型都换成最轻量版本 |
+
+
+
+
+* 测试环境:
+ * PaddlePaddle 3.1.0、CUDA 11.8、cuDNN 8.9
+ * PaddleX @ develop (f1eb28e23cfa54ce3e9234d2e61fcb87c93cf407)
+ * Docker image: ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddle:3.1.0-gpu-cuda11.8-cudnn8.9
+* 测试数据:
+ * 测试数据包含表格、印章、公式、图表的280张图像。
+* 测试策略:
+ * 使用 20 个样本进行预热,然后对整个数据集重复 1 次以进行速度性能测试。
+* 备注:
+ * 由于我们没有收集NPU和XPU的设备内存数据,因此表中相应位置的数据标记为N/A。
+
+## 2. 快速开始
+
+PaddleX 所提供的模型产线均可以快速体验效果,你可以在本地使用命令行或 Python 体验 PaddleOCR-VL 产线的效果。
+
+在本地使用 PaddleOCR-VL 产线前,请确保您已经按照[PaddleX本地安装教程](../../../installation/installation.md)完成了PaddleX的wheel包安装。如果您希望选择性安装依赖,请参考安装教程中的相关说明。该产线对应的依赖分组为 `ocr`。此外,为了使用飞桨框架读取 safetensors 格式模型,请执行如下命令安装 safetensors:
+
+```bash
+python -m pip install https://paddle-whl.bj.bcebos.com/nightly/cu126/safetensors/safetensors-0.6.2.dev0-cp38-abi3-linux_x86_64.whl
+```
+
+> 默认配置暂不支持 Compute Capability 低于 8.0 的 GPU(如 V100、T4 等)。请参阅下一节,了解如何在此类 GPU 上使用推理加速框架。
+
+### 2.1 命令行方式体验
+一行命令即可快速体验版面解析产线效果,使用 [测试文件](https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/paddleocr_vl_demo.png),并将 `--input` 替换为本地路径,进行预测
+
+```bash
+paddlex --pipeline PaddleOCR-VL \
+ --input paddleocr_vl_demo.png \
+ --use_doc_orientation_classify False \
+ --use_doc_unwarping False \
+ --save_path ./output \
+ --device gpu:0
+```
+
+注:PaddleX 支持多个模型托管平台,官方模型默认优先从 HuggingFace 下载。PaddleX 也支持通过环境变量 `PADDLE_PDX_MODEL_SOURCE` 设置优先使用的托管平台,目前支持 `huggingface`、`aistudio`、`bos`、`modelscope`,如优先使用 `bos`:`PADDLE_PDX_MODEL_SOURCE="bos"`。
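+
+下面是一个最小示意(仅作参考,假设在创建产线之前设置环境变量即可生效),展示如何在 Python 中通过 `PADDLE_PDX_MODEL_SOURCE` 切换模型下载源;也可以在运行命令前在 shell 中导出该环境变量:
+
+```python
+import os
+
+# 在创建产线之前设置模型下载源(此处以 bos 为例)
+os.environ["PADDLE_PDX_MODEL_SOURCE"] = "bos"
+
+from paddlex import create_pipeline
+
+# 创建 PaddleOCR-VL 产线,后续官方模型将优先从 bos 下载
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+```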
+
+相关的参数说明可以参考[2.2.2 Python脚本方式集成](#222-python脚本方式集成)中的参数说明。支持同时指定多个设备以进行并行推理,详情请参考 [产线并行推理](../../instructions/parallel_inference.md#指定多个推理设备)。
+
+运行后,会将结果打印到终端上,结果如下:
+
+ 👉点击展开
+
+{'res': {'input_path': 'paddleocr_vl_demo.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_seal_recognition': False, 'use_table_recognition': True, 'use_formula_recognition': True, 'use_chart_recognition': False, 'use_region_detection': True}, 'parsing_res_list': [{'block_label': 'doc_title', 'block_content': '助力双方交往搭建友谊桥梁', 'block_bbox': [133, 36, 1379, 123], 'block_id': 0, 'block_order': 1}, {'block_label': 'text', 'block_content': '本报记者沈小晓任彦黄培昭', 'block_bbox': [584, 159, 927, 179], 'block_id': 1, 'block_order': 2}, {'block_label': 'image', 'block_content': '', 'block_bbox': [774, 201, 1502, 685], 'block_id': 2, 'block_order': None}, {'block_label': 'figure_title', 'block_content': '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。中国驻厄立特里亚大使馆供图', 'block_bbox': [808, 704, 1484, 747], 'block_id': 3, 'block_order': None}, {'block_label': 'text', 'block_content': '身着中国传统民族服装的厄立特里亚青年依次登台表演中国民族舞、现代舞、扇子舞等,曼妙的舞姿赢得现场观众阵阵掌声。这是日前危立特里亚高等教育与研究院孔子学院(以下简称“厄特孔院")举办“喜迎新年"中国歌舞比赛的场景。\n', 'block_bbox': [9, 201, 358, 338], 'block_id': 4, 'block_order': 3}, {'block_label': 'text', 'block_content': '中国和厄立特里亚传统友谊深厚。近年来,在高质量共建“一带一路”框架下,中厄两国人文交流不断深化,互利合作的民意基础日益深厚。\n', 'block_bbox': [9, 345, 358, 435], 'block_id': 5, 'block_order': 4}, {'block_label': 'paragraph_title', 'block_content': '“学好中文,我们的未来不是梦”\n', 'block_bbox': [28, 456, 339, 514], 'block_id': 6, 'block_order': 5}, {'block_label': 'text', 'block_content': '“鲜花曾告诉我你怎样走过,大地知道你心中的每一个角落……"厄立特里亚阿斯马拉大学综合楼二层,一阵优美的歌声在走廊里回响。循着熟悉的旋律轻轻推开一间教室的门,学生们正跟着老师学唱中文歌曲《同一首歌》。', 'block_bbox': [9, 536, 358, 651], 'block_id': 7, 'block_order': 6}, {'block_label': 'text', 'block_content': '这是厄特孔院阿斯马拉大学教学点的一节中文歌曲课。为了让学生们更好地理解歌词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐字翻译和解释歌词。随着伴奏声响起,学生们边唱边随着节拍摇动身体,现场气氛热烈。', 'block_bbox': [9, 658, 359, 770], 'block_id': 8, 'block_order': 7}, {'block_label': 'text', 'block_content': '“这是中文歌曲初级班,共有32人。学生大部分来自首都阿斯马拉的中小学,年龄最小的仅有6岁。”尤斯拉告诉记者。', 'block_bbox': [10, 776, 359, 842], 'block_id': 9, 'block_order': 8}, {'block_label': 'text', 'block_content': '尤斯拉今年23岁,是厄立特里亚一所公立学校的艺术老师。她12岁开始在厄特孔院学习中文,在2017年第十届“汉语桥"世界中学生中文比赛中获得厄立特里亚赛区第一名,并和同伴代表厄立特里亚前往中国参加决赛,获得团体优胜奖。2022年起,尤斯拉开始在厄特孔院兼职教授中文歌曲,每周末两个课时。“中国文化博大精深,我希望我的学生们能够通过中文歌曲更好地理解中国文化。”她说。', 'block_bbox': [9, 848, 358, 1057], 'block_id': 10, 'block_order': 9}, {'block_label': 'text', 'block_content': '“姐姐,你想去中国吗?”“非常想!我想去看故宫、爬长城。”尤斯拉的学生中有一对能歌善舞的姐妹,姐姐露娅今年15岁,妹妹莉娅14岁,两人都已在厄特孔院学习多年,中文说得格外流利。\n', 'block_bbox': [9, 1064, 358, 1177], 'block_id': 11, 'block_order': 10}, {'block_label': 'text', 'block_content': '露娅对记者说:“这些年来,怀着对中文和中国文化的热爱,我们姐妹俩始终相互鼓励,一起学习。我们的中文一天比一天好,还学会了中文歌和中国舞。我们一定要到中国去。学好中文,我们的未来不是梦!”', 'block_bbox': [8, 1184, 358, 1297], 'block_id': 12, 'block_order': 11}, {'block_label': 'text', 'block_content': '据厄特孔院中方院长黄鸣飞介绍,这所孔院成立于2013年3月,由贵州财经大学和', 'block_bbox': [10, 1304, 358, 1346], 'block_id': 13, 'block_order': 12}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育与研究院合作建立,开设了中国语言课程和中国文化课程,注册学生2万余人次。10余年来,厄特孔院已成为当地民众了解中国的一扇窗口。', 'block_bbox': [388, 200, 740, 290], 'block_id': 14, 'block_order': 13}, {'block_label': 'text', 'block_content': '黄鸣飞表示,随着来学习中文的人日益增多,阿斯马拉大学教学点已难以满足教学需要。2024年4月,由中企蜀道集团所属四川路桥承建的孔院教学楼项目在阿斯马拉开工建设,预计今年上半年竣工,建成后将为危特孔院提供全新的办学场地。\n', 'block_bbox': [389, 297, 740, 435], 'block_id': 15, 'block_order': 14}, {'block_label': 'paragraph_title', 'block_content': '“在中国学习的经历让我看到更广阔的世界”', 'block_bbox': [409, 456, 718, 515], 'block_id': 16, 'block_order': 15}, {'block_label': 'text', 'block_content': '多年来,厄立特里亚广大赴华留学生和培训人员积极投身国家建设,成为助力该国发展的人才和厄中友好的见证者和推动者。', 'block_bbox': [389, 537, 740, 603], 
'block_id': 17, 'block_order': 16}, {'block_label': 'text', 'block_content': '在厄立特里亚全国妇女联盟工作的约翰娜·特韦尔德·凯莱塔就是其中一位。她曾在中华女子学院攻读硕士学位,研究方向是女性领导力与社会发展。其间,她实地走访中国多个地区,获得了观察中国社会发展的第一手资料。\n', 'block_bbox': [389, 609, 740, 745], 'block_id': 18, 'block_order': 17}, {'block_label': 'text', 'block_content': '谈起在中国求学的经历,约翰娜记忆犹新:“中国的发展在当今世界是独一无二的。沿着中国特色社会主义道路坚定前行,中国创造了发展奇迹,这一切都离不开中国共产党的领导。中国的发展经验值得许多国家学习借鉴。”\n', 'block_bbox': [389, 752, 740, 889], 'block_id': 19, 'block_order': 18}, {'block_label': 'text', 'block_content': '正在西南大学学习的厄立特里亚博士生穆卢盖塔·泽穆伊对中国怀有深厚感情。8年前,在北京师范大学获得硕士学位后,穆卢盖塔在社交媒体上写下这样一段话:“这是我人生的重要一步,自此我拥有了一双坚固的鞋子,赋予我穿越荆棘的力量。”', 'block_bbox': [389, 896, 740, 1033], 'block_id': 20, 'block_order': 19}, {'block_label': 'text', 'block_content': '穆卢盖塔密切关注中国在经济、科技、教育等领域的发展,“中国在科研等方面的实力与日俱增。在中国学习的经历让我看到更广阔的世界,从中受益匪浅。”\n', 'block_bbox': [389, 1040, 740, 1129], 'block_id': 21, 'block_order': 20}, {'block_label': 'text', 'block_content': '23岁的莉迪亚·埃斯蒂法诺斯已在厄特孔院学习3年,在中国书法、中国画等方面表现干分优秀,在2024年厄立特里亚赛区的“汉语桥”比赛中获得一等奖。莉迪亚说:“学习中国书法让我的内心变得安宁和纯粹。我也喜欢中国的服饰,希望未来能去中国学习,把中国不同民族元素融入服装设计中,创作出更多精美作品,也把厄特文化分享给更多的中国朋友。”\n', 'block_bbox': [389, 1136, 740, 1345], 'block_id': 22, 'block_order': 21}, {'block_label': 'text', 'block_content': '“不管远近都是客人,请不用客气;相约好了在一起,我们欢迎你……”在一场中厄青年联谊活动上,四川路桥中方员工同当地大学生合唱《北京欢迎你》。厄立特里亚技术学院计算机科学与工程专业学生鲁夫塔·谢拉是其中一名演唱者,她很早便在孔院学习中文,一直在为去中国留学作准备。“这句歌词是我们两国人民友谊的生动写照。无论是投身于厄立特里亚基础设施建设的中企员工,还是在中国留学的厄立特里亚学子,两国人民携手努力,必将推动两国关系不断向前发展。”鲁夫塔说。\n', 'block_bbox': [769, 776, 1121, 1058], 'block_id': 23, 'block_order': 22}, {'block_label': 'text', 'block_content': '厄立特里亚高等教育委员会主任助理萨马瑞表示:“每年我们都会组织学生到中国访问学习,自前有超过5000名厄立特里亚学生在中国留学。学习中国的教育经验,有助于提升厄立特里亚的教育水平。”', 'block_bbox': [770, 1064, 1121, 1177], 'block_id': 24, 'block_order': 23}, {'block_label': 'paragraph_title', 'block_content': '“共同向世界展示非洲和亚洲的灿烂文明”', 'block_bbox': [790, 1200, 1102, 1259], 'block_id': 25, 'block_order': 24}, {'block_label': 'text', 'block_content': '从阿斯马拉出发,沿着蜿蜒曲折的盘山公路一路向东寻找丝路印迹。驱车两个小时,记者来到位于厄立特里亚港口城市马萨', 'block_bbox': [770, 1280, 1122, 1346], 'block_id': 26, 'block_order': 25}, {'block_label': 'text', 'block_content': '瓦的北红海省博物馆。', 'block_bbox': [1154, 776, 1331, 794], 'block_id': 27, 'block_order': 26}, {'block_label': 'text', 'block_content': '博物馆二层陈列着一个发掘自阿杜利斯古城的中国古代陶制酒器,罐身上写着“万”“和”“禅”“山”等汉字。“这件文物证明,很早以前我们就通过海上丝绸之路进行贸易往来与文化交流。这也是厄立特里亚与中国友好交往历史的有力证明。”北红海省博物馆研究与文献部负责人伊萨亚斯·特斯法兹吉说。\n', 'block_bbox': [1152, 800, 1502, 986], 'block_id': 28, 'block_order': 27}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆考古学和人类学研究员菲尔蒙·特韦尔德十分喜爱中国文化。他表示:“学习彼此的语言和文化,将帮助厄中两国人民更好地理解彼此,助力双方交往,搭建友谊桥梁。”\n', 'block_bbox': [1152, 992, 1502, 1106], 'block_id': 29, 'block_order': 28}, {'block_label': 'text', 'block_content': '厄立特里亚国家博物馆馆长塔吉丁·努重达姆·优素福曾多次访问中国,对中华文明的传承与创新、现代化博物馆的建设与发展印象深刻。“中国博物馆不仅有许多保存完好的文物,还充分运用先进科技手段进行展示,帮助人们更好理解中华文明。”塔吉丁说,“危立特里亚与中国都拥有悠久的文明,始终相互理解、相互尊重。我希望未来与中国同行加强合作,共同向世界展示非洲和亚洲的灿烂文明。”\n', 'block_bbox': [1151, 1112, 1502, 1346], 'block_id': 30, 'block_order': 29}], 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 1, 'label': 'image', 'score': 0.9864752888679504, 'coordinate': [774.821, 201.05176, 1502.1008, 685.7733]}, {'cls_id': 2, 'label': 'text', 'score': 0.9859225749969482, 'coordinate': [769.8655, 776.2444, 1121.5986, 1058.4167]}, {'cls_id': 2, 'label': 'text', 'score': 0.9857110381126404, 'coordinate': [1151.98, 1112.5356, 1502.7852, 1346.3569]}, {'cls_id': 2, 'label': 'text', 'score': 0.9847239255905151, 'coordinate': [389.0322, 1136.3547, 740.2322, 1345.928]}, {'cls_id': 2, 'label': 'text', 'score': 0.9842492938041687, 
'coordinate': [1152.1504, 800.1625, 1502.1265, 986.1522]}, {'cls_id': 2, 'label': 'text', 'score': 0.9840831160545349, 'coordinate': [9.158066, 848.8696, 358.5725, 1057.832]}, {'cls_id': 2, 'label': 'text', 'score': 0.9802583456039429, 'coordinate': [9.335953, 201.10046, 358.31543, 338.78876]}, {'cls_id': 2, 'label': 'text', 'score': 0.9801402688026428, 'coordinate': [389.1556, 297.4113, 740.07556, 435.41647]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793564081192017, 'coordinate': [389.18976, 752.0959, 740.0832, 889.88043]}, {'cls_id': 2, 'label': 'text', 'score': 0.9793409109115601, 'coordinate': [389.02496, 896.34143, 740.7431, 1033.9465]}, {'cls_id': 2, 'label': 'text', 'score': 0.9776486754417419, 'coordinate': [8.950775, 1184.7842, 358.75067, 1297.8755]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773538708686829, 'coordinate': [770.7178, 1064.5714, 1121.2249, 1177.9928]}, {'cls_id': 2, 'label': 'text', 'score': 0.9773064255714417, 'coordinate': [389.38086, 609.7071, 740.0553, 745.3206]}, {'cls_id': 2, 'label': 'text', 'score': 0.9765821099281311, 'coordinate': [1152.0115, 992.296, 1502.4929, 1106.1166]}, {'cls_id': 2, 'label': 'text', 'score': 0.9761461019515991, 'coordinate': [9.46727, 536.993, 358.2047, 651.32025]}, {'cls_id': 2, 'label': 'text', 'score': 0.975399911403656, 'coordinate': [9.353531, 1064.3059, 358.45312, 1177.8347]}, {'cls_id': 2, 'label': 'text', 'score': 0.9730532169342041, 'coordinate': [9.932312, 345.36237, 358.03476, 435.1646]}, {'cls_id': 2, 'label': 'text', 'score': 0.9722575545310974, 'coordinate': [388.91736, 200.93637, 740.00793, 290.80692]}, {'cls_id': 2, 'label': 'text', 'score': 0.9710634350776672, 'coordinate': [389.39496, 1040.3186, 740.0091, 1129.7168]}, {'cls_id': 2, 'label': 'text', 'score': 0.9696939587593079, 'coordinate': [9.6145935, 658.1123, 359.06088, 770.0288]}, {'cls_id': 2, 'label': 'text', 'score': 0.9664148092269897, 'coordinate': [770.235, 1280.4562, 1122.0927, 1346.4742]}, {'cls_id': 2, 'label': 'text', 'score': 0.9597565531730652, 'coordinate': [389.66678, 537.5609, 740.06274, 603.17725]}, {'cls_id': 2, 'label': 'text', 'score': 0.9594324827194214, 'coordinate': [10.162949, 776.86414, 359.08307, 842.1771]}, {'cls_id': 2, 'label': 'text', 'score': 0.9484634399414062, 'coordinate': [10.402863, 1304.7743, 358.9441, 1346.3749]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9476125240325928, 'coordinate': [28.159409, 456.7627, 339.5631, 514.9665]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9427680969238281, 'coordinate': [790.6992, 1200.3663, 1102.3799, 1259.1647]}, {'cls_id': 0, 'label': 'paragraph_title', 'score': 0.9424256682395935, 'coordinate': [409.02832, 456.6831, 718.8154, 515.5757]}, {'cls_id': 10, 'label': 'doc_title', 'score': 0.9376171827316284, 'coordinate': [133.77905, 36.884415, 1379.6667, 123.46867]}, {'cls_id': 2, 'label': 'text', 'score': 0.9020254015922546, 'coordinate': [584.9165, 159.1416, 927.22876, 179.01605]}, {'cls_id': 2, 'label': 'text', 'score': 0.895164430141449, 'coordinate': [1154.3364, 776.74646, 1331.8564, 794.2301]}, {'cls_id': 6, 'label': 'figure_title', 'score': 0.7892374992370605, 'coordinate': [808.9641, 704.2555, 1484.0623, 747.2296]}]}, 'overall_ocr_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': False}, 'dt_polys': array([[[ 129, 42],
+ ...,
+ [ 129, 140]],
+
+ ...,
+
+ [[1156, 1330],
+ ...,
+ [1156, 1351]]], dtype=int16), 'text_det_params': {'limit_side_len': 736, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['助力双方交往', '搭建友谊桥梁', '本报记者沈小晓', '任', '彦', '黄培昭', '身着中国传统民族服装的厄立特里亚青', '厄立特里亚高等教育与研究院合作建立,开', '年依次登台表演中国民族舞、现代舞、扇子舞', '设了中国语言课程和中国文化课程,注册学', '等,曼妙的舞姿赢得现场观众阵阵掌声。这', '生2万余人次。10余年来,厄特孔院已成为', '是日前危立特里亚高等教育与研究院孔子学', '当地民众了解中国的一扇窗口。', '院(以下简称“厄特孔院")举办“喜迎新年"中国', '黄鸣飞表示,随着来学习中文的人日益', '歌舞比赛的场景。', '增多,阿斯马拉大学教学点已难以满足教学', '中国和厄立特里亚传统友谊深厚。近年', '需要。2024年4月,由中企蜀道集团所属四', '来,在高质量共建“一带一路”框架下,中厄两', '川路桥承建的孔院教学楼项目在阿斯马拉开', '国人文交流不断深化,互利合作的民意基础', '工建设,预计今年上半年竣工,建成后将为危', '日益深厚。', '特孔院提供全新的办学场地。', '“学好中文,我们的', '“在中国学习的经历', '未来不是梦”', '让我看到更广阔的世界”', '“鲜花曾告诉我你怎样走过,大地知道你', '多年来,厄立特里亚广大赴华留学生和', '心中的每一个角落……"厄立特里亚阿斯马拉', '培训人员积极投身国家建设,成为助力该国', '大学综合楼二层,一阵优美的歌声在走廊里回', '发展的人才和厄中友好的见证者和推动者。', '响。循着熟悉的旋律轻轻推开一间教室的门,', '在厄立特里亚全国妇女联盟工作的约翰', '学生们正跟着老师学唱中文歌曲《同一首歌》。', '娜·特韦尔德·凯莱塔就是其中一位。她曾在', '这是厄特孔院阿斯马拉大学教学点的一', '中华女子学院攻读硕士学位,研究方向是女', '节中文歌曲课。为了让学生们更好地理解歌', '性领导力与社会发展。其间,她实地走访中国', '词大意,老师尤斯拉·穆罕默德萨尔·侯赛因逐', '多个地区,获得了观察中国社会发展的第一', '在厄立特里亚不久前举办的第六届中国风筝文化节上,当地小学生体验风筝制作。', '字翻译和解释歌词。随着伴奏声响起,学生们', '手资料。', '中国驻厄立特里亚大使馆供图', '边唱边随着节拍摇动身体,现场气氛热烈。', '谈起在中国求学的经历,约翰娜记忆犹', '“这是中文歌曲初级班,共有32人。学', '新:“中国的发展在当今世界是独一无二的。', '“不管远近都是客人,请不用客气;相约', '瓦的北红海省博物馆。', '生大部分来自首都阿斯马拉的中小学,年龄', '沿着中国特色社会主义道路坚定前行,中国', '好了在一起,我们欢迎你……”在一场中厄青', '博物馆二层陈列着一个发掘自阿杜利', '最小的仅有6岁。”尤斯拉告诉记者。', '创造了发展奇迹,这一切都离不开中国共产党', '年联谊活动上,四川路桥中方员工同当地大', '斯古城的中国古代陶制酒器,罐身上写着', '尤斯拉今年23岁,是厄立特里亚一所公立', '的领导。中国的发展经验值得许多国家学习', '学生合唱《北京欢迎你》。厄立特里亚技术学', '“万”“和”“禅”“山”等汉字。“这件文物证', '学校的艺术老师。她12岁开始在厄特孔院学', '借鉴。”', '院计算机科学与工程专业学生鲁夫塔·谢拉', '明,很早以前我们就通过海上丝绸之路进行', '习中文,在2017年第十届“汉语桥"世界中学生', '正在西南大学学习的厄立特里亚博士生', '是其中一名演唱者,她很早便在孔院学习中', '贸易往来与文化交流。这也是厄立特里亚', '中文比赛中获得厄立特里亚赛区第一名,并和', '穆卢盖塔·泽穆伊对中国怀有深厚感情。8', '文,一直在为去中国留学作准备。“这句歌词', '与中国友好交往历史的有力证明。”北红海', '同伴代表厄立特里亚前往中国参加决赛,获得', '年前,在北京师范大学获得硕士学位后,穆卢', '是我们两国人民友谊的生动写照。无论是投', '省博物馆研究与文献部负责人伊萨亚斯·特', '团体优胜奖。2022年起,尤斯拉开始在厄特孔', '盖塔在社交媒体上写下这样一段话:“这是我', '身于厄立特里亚基础设施建设的中企员工,', '斯法兹吉说。', '院兼职教授中文歌曲,每周末两个课时。“中国', '人生的重要一步,自此我拥有了一双坚固的', '还是在中国留学的厄立特里亚学子,两国人', '厄立特里亚国家博物馆考古学和人类学', '文化博大精深,我希望我的学生们能够通过中', '鞋子,赋予我穿越荆棘的力量。”', '民携手努力,必将推动两国关系不断向前发', '研究员菲尔蒙·特韦尔德十分喜爱中国文', '文歌曲更好地理解中国文化。”她说。', '穆卢盖塔密切关注中国在经济、科技、教', '展。”鲁夫塔说。', '化。他表示:“学习彼此的语言和文化,将帮', '“姐姐,你想去中国吗?”“非常想!我想', '育等领域的发展,“中国在科研等方面的实力', '厄立特里亚高等教育委员会主任助理萨', '助厄中两国人民更好地理解彼此,助力双方', '去看故宫、爬长城。”尤斯拉的学生中有一对', '与日俱增。在中国学习的经历让我看到更广', '马瑞表示:“每年我们都会组织学生到中国访', '交往,搭建友谊桥梁。”', '能歌善舞的姐妹,姐姐露娅今年15岁,妹妹', '阔的世界,从中受益匪浅。”', '问学习,自前有超过5000名厄立特里亚学生', '厄立特里亚国家博物馆馆长塔吉丁·努', '莉娅14岁,两人都已在厄特孔院学习多年,', '23岁的莉迪亚·埃斯蒂法诺斯已在厄特', '在中国留学。学习中国的教育经验,有助于', '重达姆·优素福曾多次访问中国,对中华文明', '中文说得格外流利。', '孔院学习3年,在中国书法、中国画等方面表', '提升厄立特里亚的教育水平。”', '的传承与创新、现代化博物馆的建设与发展', '露娅对记者说:“这些年来,怀着对中文', '现干分优秀,在2024年厄立特里亚赛区的', '印象深刻。“中国博物馆不仅有许多保存完好', '“共同向世界展示非', '和中国文化的热爱,我们姐妹俩始终相互鼓', '“汉语桥”比赛中获得一等奖。莉迪亚说:“学', '的文物,还充分运用先进科技手段进行展示,', '励,一起学习。我们的中文一天比一天好,还', '习中国书法让我的内心变得安宁和纯粹。我', '洲和亚洲的灿烂文明”', '帮助人们更好理解中华文明。”塔吉丁说,“危', '学会了中文歌和中国舞。我们一定要到中国', '也喜欢中国的服饰,希望未来能去中国学习,', '立特里亚与中国都拥有悠久的文明,始终相', '去。学好中文,我们的未来不是梦!”', '把中国不同民族元素融入服装设计中,创作', '从阿斯马拉出发,沿着蜿蜒曲折的盘山', '互理解、相互尊重。我希望未来与中国同行', '据厄特孔院中方院长黄鸣飞介绍,这所', '出更多精美作品,也把厄特文化分享给更多', '公路一路向东寻找丝路印迹。驱车两个小', '加强合作,共同向世界展示非洲和亚洲的灿', '孔院成立于2013年3月,由贵州财经大学和', '的中国朋友。”', '时,记者来到位于厄立特里亚港口城市马萨', '烂文明。”'], 'rec_scores': array([0.99113536, ..., 0.95110023]), 'rec_polys': array([[[ 129, 42],
+ ...,
+ [ 129, 140]],
+
+ ...,
+
+ [[1156, 1330],
+ ...,
+ [1156, 1351]]], dtype=int16), 'rec_boxes': array([[ 129, ..., 140],
+ ...,
+ [1156, ..., 1351]], dtype=int16)}}}
+
+
+运行结果参数说明可以参考[2.2 Python脚本方式集成](#22-python脚本方式集成)中的结果解释。
+
+注:由于产线的默认模型较大,推理速度可能较慢,您可以参考第一节的模型列表,替换推理速度更快的模型。
+
+### 2.2 Python脚本方式集成
+几行代码即可完成产线的快速推理:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+
+output = pipeline.predict(
+ input="./paddleocr_vl_demo.png",
+ use_doc_orientation_classify=False,
+ use_doc_unwarping=False,
+)
+for res in output:
+ res.print() ## 打印预测的结构化输出
+ res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果
+ res.save_to_markdown(save_path="output") ## 保存当前图像的markdown格式的结果
+```
+
+如果是 PDF 文件,会将 PDF 的每一页单独处理,每一页的 Markdown 文件也会对应单独的结果。如果希望整个 PDF 文件转换为 Markdown 文件,建议使用以下的方式运行:
+
+```python
+from pathlib import Path
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+
+input_file = "./your_pdf_file.pdf"
+output_path = Path("./output")
+
+output = pipeline.predict(
+ input=input_file,
+ use_doc_orientation_classify=False,
+ use_doc_unwarping=False)
+
+markdown_list = []
+markdown_images = []
+
+for res in output:
+ md_info = res.markdown
+ markdown_list.append(md_info)
+ markdown_images.append(md_info.get("markdown_images", {}))
+
+markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
+
+mkd_file_path = output_path / f"{Path(input_file).stem}.md"
+mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
+
+with open(mkd_file_path, "w", encoding="utf-8") as f:
+ f.write(markdown_texts)
+
+for item in markdown_images:
+ if item:
+ for path, image in item.items():
+ file_path = output_path / path
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+ image.save(file_path)
+```
+
+**注:**
+
+- 在示例代码中,`use_doc_orientation_classify`、`use_doc_unwarping` 参数默认均设置为 `False`,分别表示关闭文档方向分类、文档扭曲矫正功能,如果需要使用这些功能,可以手动设置为 `True`。
+
+在上述 Python 脚本中,执行了如下几个步骤:
+(1)调用 `create_pipeline` 实例化产线对象,具体参数说明如下:
+
+| 参数 | 参数说明 | 参数类型 | 默认值 |
+|---|---|---|---|
+| `pipeline` | 产线名称或是产线配置文件路径。如为产线名称,则必须为 PaddleX 所支持的产线。 | `str` | `None` |
+| `config` | 产线配置文件路径。 | `str` | `None` |
+| `device` | 产线推理设备。支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。支持同时指定多个设备以进行并行推理,详情请参考产线并行推理文档。 | `str` | `gpu:0` |
+| `use_hpip` | 是否启用高性能推理插件。如果为 `None`,则使用配置文件或 `config` 中的配置。 | `bool` \| `None` | `None` |
+| `hpi_config` | 高性能推理配置 | `dict` \| `None` | `None` |
+
+
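+例如,下面是一个最小示例(仅作示意,设备取值为假设,请按实际环境调整),展示如何在实例化产线对象时同时指定推理设备:
+
+```python
+from paddlex import create_pipeline
+
+# 假设环境中有可用的 GPU;如无 GPU,可将 device 改为 "cpu"
+pipeline = create_pipeline(pipeline="PaddleOCR-VL", device="gpu:0")
+```
+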
+(2)调用版面解析产线对象的 predict() 方法进行推理预测。该方法将返回一个 generator。以下是 predict() 方法的参数及其说明:
+
+| 参数 | 参数说明 | 参数类型 | 可选项 | 默认值 |
+|---|---|---|---|---|
+| `input` | 待预测数据,支持多种输入类型,必填 | `Python Var` \| `str` \| `list` | **Python Var**:如 `numpy.ndarray` 表示的图像数据<br>**str**:如图像文件或者PDF文件的本地路径:`/root/data/img.jpg`;如URL链接,如图像文件或PDF文件的网络URL:示例;如本地目录,该目录下需包含待预测图像,如本地路径:`/root/data/`(当前不支持目录中包含PDF文件的预测,PDF文件需要指定到具体文件路径)<br>**List**:列表元素需为上述类型数据,如 `[numpy.ndarray, numpy.ndarray]`,`["/root/data/img1.jpg", "/root/data/img2.jpg"]`,`["/root/data1", "/root/data2"]` | `None` |
+| `use_doc_orientation_classify` | 是否使用文档方向分类模块 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `True` | `None` |
+| `use_doc_unwarping` | 是否使用文档扭曲矫正模块 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `True` | `None` |
+| `use_layout_detection` | 是否使用版面检测模块 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `True` | `None` |
+| `use_chart_recognition` | 是否使用图表识别功能 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `False` | `None` |
+| `layout_threshold` | 版面模型得分阈值 | `float` \| `dict` \| `None` | **float**:`0-1` 之间的任意浮点数<br>**dict**:`{0:0.1}`,key为类别ID,value为该类别的阈值<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `0.5` | `None` |
+| `layout_nms` | 版面区域检测模型是否使用NMS后处理 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `True` | `None` |
+| `layout_unclip_ratio` | 版面区域检测模型检测框的扩张系数 | `float` \| `Tuple[float,float]` \| `dict` \| `None` | **float**:任意大于 `0` 的浮点数<br>**Tuple[float,float]**:在横纵两个方向各自的扩张系数<br>**dict**:key为int类型,代表 `cls_id`,value为tuple类型,如 `{0: (1.1, 2.0)}`,表示将模型输出的第0类别检测框中心不变,宽度扩张1.1倍,高度扩张2.0倍<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `1.0` | `None` |
+| `layout_merge_bboxes_mode` | 版面区域检测的重叠框过滤方式 | `str` \| `dict` \| `None` | **str**:`large`、`small`、`union`,分别表示重叠框过滤时选择保留大框、小框还是同时保留<br>**dict**:key为int类型,代表 `cls_id`,value为str类型,如 `{0: "large", 2: "small"}`,表示对第0类别检测框使用 `large` 模式,对第2类别检测框使用 `small` 模式<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `large` | `None` |
+| `use_queues` | 用于控制是否启用内部队列。当设置为 `True` 时,数据加载(如将 PDF 页面渲染为图像)、版面检测模型处理以及 VLM 推理将分别在独立线程中异步执行,通过队列传递数据,从而提升效率。对于页数较多的 PDF 文档,或是包含大量图像或 PDF 文件的目录,这种方式尤其高效。 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `True` | `None` |
+| `prompt_label` | VL模型的prompt类型设置,当且仅当 `use_layout_detection=False` 时生效 | `str` \| `None` | **str**:`ocr`、`formula`、`table` 或者 `chart`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `ocr` | `None` |
+| `format_block_content` | 控制是否将 `block_content` 中的内容格式化为Markdown格式 | `bool` \| `None` | **bool**:`True` 或者 `False`<br>**None**:如果设置为 `None`,将默认使用产线初始化的该参数值,初始化为 `False` | `None` |
+| `repetition_penalty` | VL模型采样使用的重复惩罚参数 | `float` \| `None` | **float**:任意大于等于 `0` 的浮点数<br>**None**:如果设置为 `None`,将使用默认值 | `None` |
+| `temperature` | VL模型采样使用的温度参数 | `float` \| `None` | **float**:任意大于等于 `0` 的浮点数<br>**None**:如果设置为 `None`,将使用默认值 | `None` |
+| `top_p` | VL模型采样使用的top-p参数 | `float` \| `None` | **float**:取值范围在 `(0, 1]` 的浮点数<br>**None**:如果设置为 `None`,将使用默认值 | `None` |
+| `min_pixels` | VL模型预处理图像时允许的最小像素数 | `int` \| `None` | **int**:任意大于 `0` 的整数<br>**None**:如果设置为 `None`,将使用默认值 | `None` |
+| `max_pixels` | VL模型预处理图像时允许的最大像素数 | `int` \| `None` | **int**:任意大于 `0` 的整数<br>**None**:如果设置为 `None`,将使用默认值 | `None` |
+
+
+
+
+
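+下面给出一个调用 `predict()` 时传入部分上述参数的示例,参数取值仅为演示用的假设值,请根据实际场景调整:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+
+output = pipeline.predict(
+    input="./paddleocr_vl_demo.png",
+    use_doc_orientation_classify=False,
+    use_doc_unwarping=False,
+    use_chart_recognition=True,   # 假设需要解析图表
+    layout_threshold=0.5,         # 演示用的版面检测得分阈值
+    max_pixels=1280 * 1280,       # 演示用的最大像素数限制
+)
+for res in output:
+    res.print()
+```
+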
+(3)对预测结果进行处理:每个样本的预测结果均为对应的Result对象,且支持打印、保存为图片、保存为json文件的操作:
+
+| 方法 | 方法说明 | 参数 | 参数类型 | 参数说明 | 默认值 |
+|---|---|---|---|---|---|
+| `print()` | 打印结果到终端 | `format_json` | `bool` | 是否对输出内容进行使用 JSON 缩进格式化 | `True` |
+| | | `indent` | `int` | 指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 `format_json` 为 `True` 时有效 | `4` |
+| | | `ensure_ascii` | `bool` | 控制是否将非 ASCII 字符转义为 Unicode。设置为 `True` 时,所有非 ASCII 字符将被转义;`False` 则保留原始字符,仅当 `format_json` 为 `True` 时有效 | `False` |
+| `save_to_json()` | 将结果保存为json格式的文件 | `save_path` | `str` | 保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 | 无 |
+| | | `indent` | `int` | 指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 `format_json` 为 `True` 时有效 | `4` |
+| | | `ensure_ascii` | `bool` | 控制是否将非 ASCII 字符转义为 Unicode。设置为 `True` 时,所有非 ASCII 字符将被转义;`False` 则保留原始字符,仅当 `format_json` 为 `True` 时有效 | `False` |
+| `save_to_img()` | 将中间各个模块的可视化图像保存为png格式的图像 | `save_path` | `str` | 保存的文件路径,支持目录或文件路径 | 无 |
+| `save_to_markdown()` | 将图像或者PDF文件中的每一页分别保存为markdown格式的文件 | `save_path` | `str` | 保存的文件路径,支持目录或文件路径 | 无 |
+| `save_to_html()` | 将文件中的表格保存为html格式的文件 | `save_path` | `str` | 保存的文件路径,支持目录或文件路径 | 无 |
+| `save_to_xlsx()` | 将文件中的表格保存为xlsx格式的文件 | `save_path` | `str` | 保存的文件路径,支持目录或文件路径 | 无 |
+| `concatenate_markdown_pages()` | 将多页Markdown内容拼接为单一文档 | `markdown_list` | `list` | 包含每一页Markdown数据的列表 | 返回处理后的Markdown文本和图像列表 |
+
+
+
+- 调用`print()` 方法会将结果打印到终端,打印到终端的内容解释如下:
+ - `input_path`: `(str)` 待预测图像或者PDF的输入路径
+
+ - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为 `None`
+
+ - `model_settings`: `(Dict[str, bool])` 配置产线所需的模型参数
+
+ - `use_doc_preprocessor`: `(bool)` 控制是否启用文档预处理子产线
+ - `use_layout_detection`: `(bool)` 控制是否启用版面检测模块
+ - `use_chart_recognition`: `(bool)` 控制是否启用图表识别功能
+ - `format_block_content`: `(bool)` 控制是否将 `block_content` 中的内容格式化为Markdown格式
+
+ - `doc_preprocessor_res`: `(Dict[str, Union[List[float], str]])` 文档预处理结果字典,仅当`use_doc_preprocessor=True`时存在
+    - `input_path`: `(str)` 文档预处理子产线接受的图像路径,当输入为`numpy.ndarray`时,保存为`None`
+    - `page_index`: `(Union[int, None])` 如果输入是PDF文件,则表示当前是PDF的第几页,否则为`None`
+ - `model_settings`: `(Dict[str, bool])` 文档预处理子产线的模型配置参数
+ - `use_doc_orientation_classify`: `(bool)` 控制是否启用文档图像方向分类子模块
+ - `use_doc_unwarping`: `(bool)` 控制是否启用文本图像扭曲矫正子模块
+ - `angle`: `(int)` 文档图像方向分类子模块的预测结果,启用时返回实际角度值
+
+ - `parsing_res_list`: `(List[Dict])` 解析结果的列表,每个元素为一个字典,列表顺序为解析后的阅读顺序。
+ - `block_bbox`: `(np.ndarray)` 版面区域的边界框。
+ - `block_label`: `(str)` 版面区域的标签,例如`text`, `table`等。
+ - `block_content`: `(str)` 内容为版面区域内的内容。
+ - `block_id`: `(int)` 版面区域的索引,用于显示版面排序结果。
+        - `block_order`: `(int)` 版面区域的顺序,用于显示版面阅读顺序,对于非排序部分,默认值为 `None`。
+
+ - `formula_res_list`: `(List[Dict[str, Union[numpy.ndarray, List[float], str]]])` 公式识别结果列表,每个元素为一个字典
+ - `rec_formula`: `(str)` 公式识别结果
+ - `rec_polys`: `(numpy.ndarray)` 公式检测框,shape为(4, 2),dtype为int16
+ - `formula_region_id`: `(int)` 公式所在的区域编号
+
+- 调用`save_to_json()` 方法会将上述内容保存到指定的 `save_path` 中,如果指定为目录,则保存的路径为`save_path/{your_img_basename}_res.json`,如果指定为文件,则直接保存到该文件中。由于 json 文件不支持保存numpy数组,因此会将其中的 `numpy.array` 类型转换为列表形式。
+- 调用`save_to_img()` 方法会将可视化结果保存到指定的 `save_path` 中,如果指定为目录,则会将版面区域检测可视化图像、版面阅读顺序可视化图像等内容保存,如果指定为文件,则直接保存到该文件中。(产线通常包含较多结果图片,不建议直接指定为具体的文件路径,否则多张图会被覆盖,仅保留最后一张图)
+- 调用`save_to_markdown()` 方法会将转化后的 Markdown 文件保存到指定的 `save_path` 中,保存的文件路径为`save_path/{your_img_basename}.md`。如果输入是 PDF 文件,建议直接指定目录,否则多个 Markdown 文件会被相互覆盖。该方法另外支持传入 `pretty` 参数,用于控制是否将图片、表格等美化为居中展示;以及 `show_formula_number` 参数,用于控制是否将公式编号展示在 Markdown 结果文件中(用法示例见下文)。
+- 调用 `concatenate_markdown_pages()` 方法将 `PaddleOCR-VL pipeline` 输出的多页Markdown内容`markdown_list`合并为单个完整文档,并返回合并后的Markdown内容。
+
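+以下是一个在保存 Markdown 结果时使用 `pretty` 与 `show_formula_number` 参数的简单示意(参数取值仅为演示):
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+
+for res in pipeline.predict(input="./paddleocr_vl_demo.png"):
+    # pretty=False 表示不对图片、表格做居中美化;show_formula_number=True 表示在结果中展示公式编号
+    res.save_to_markdown(save_path="output", pretty=False, show_formula_number=True)
+```
+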
+此外,也支持通过属性获取带结果的可视化图像和预测结果,具体如下:
+
+| 属性 | 属性说明 |
+|---|---|
+| `json` | 获取预测的 json 格式的结果 |
+| `img` | 获取格式为 dict 的可视化图像 |
+| `markdown` | 获取格式为 dict 的 markdown 结果 |
+
+- `json` 属性获取的预测结果为字典类型的数据,相关内容与调用 `save_to_json()` 方法保存的内容一致。
+- `img` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `layout_det_res`、`overall_ocr_res`、`text_paragraphs_ocr_res`、`formula_res_region1`、`table_cell_img` 和 `seal_res_region1`,对应的值是 `Image.Image` 对象:分别用于显示版面区域检测、OCR、OCR文本段落、公式、表格和印章结果的可视化图像。如果没有使用可选模块,则字典中只包含 `layout_det_res`。
+- `markdown` 属性返回的预测结果是一个字典类型的数据。其中,键分别为 `markdown_texts` 、 `markdown_images`和`page_continuation_flags`,对应的值分别是 markdown 文本,在 Markdown 中显示的图像(`Image.Image` 对象)和用于标识当前页面第一个元素是否为段开始以及最后一个元素是否为段结束的bool元组。
+
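+例如,可以按照如下方式访问上述属性(示意代码,输出目录名为假设值):
+
+```python
+from pathlib import Path
+
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL")
+Path("output").mkdir(exist_ok=True)
+
+for res in pipeline.predict(input="./paddleocr_vl_demo.png"):
+    data = res.json          # 与 save_to_json() 保存内容一致的字典
+    md = res.markdown        # 包含 markdown_texts、markdown_images、page_continuation_flags 的字典
+    print(md["markdown_texts"][:200])
+    for name, image in res.img.items():
+        image.save(f"output/{name}.png")  # 每个值为 Image.Image 对象
+```
+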
+
+
+此外,您可以获取版面解析产线配置文件,并加载配置文件进行预测。可执行如下命令将结果保存在 `my_path` 中:
+```
+paddlex --get_pipeline_config PaddleOCR-VL --save_path ./my_path
+```
+若您获取了配置文件,即可对版面解析产线各项配置进行自定义,只需要修改 `create_pipeline` 方法中的 `pipeline` 参数值为产线配置文件路径即可。示例如下:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="./my_path/PaddleOCR-VL.yaml")
+
+output = pipeline.predict(
+ input="./paddleocr_vl_demo.png",
+ use_doc_orientation_classify=False,
+ use_doc_unwarping=False,
+ use_textline_orientation=False,
+)
+for res in output:
+ res.print() ## 打印预测的结构化输出
+ res.save_to_json(save_path="output") ## 保存当前图像的结构化json结果
+ res.save_to_markdown(save_path="output") ## 保存当前图像的markdown格式的结果
+```
+注:配置文件中的参数为产线初始化参数,如果希望更改 PaddleOCR-VL 产线初始化参数,可以直接修改配置文件中的参数,并加载配置文件进行预测。同时,CLI 预测也支持传入配置文件,`--pipeline` 指定配置文件的路径即可。
+
+## 3. 使用推理加速框架提升 VLM 推理性能
+
+默认配置下的推理性能未经过充分优化,可能无法满足实际生产需求。PaddleX 支持通过 vLLM、SGLang 等推理加速框架提升 VLM 的推理性能,从而加快产线推理速度。使用流程主要分为两个步骤:
+
+1. 启动 VLM 推理服务;
+2. 配置 PaddleX 产线,作为客户端调用 VLM 推理服务。
+
+### 3.1 启动 VLM 推理服务
+
+#### 3.1.1 使用 Docker 镜像
+
+PaddleX 提供了 Docker 镜像,用于快速启动 vLLM 推理服务。可使用以下命令启动服务:
+
+```bash
+docker run \
+ -it \
+ --rm \
+ --gpus all \
+ --network host \
+ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlex-genai-vllm-server
+```
+
+服务默认监听 **8080** 端口。
+
+启动容器时可传入参数覆盖默认配置,例如:
+
+```bash
+docker run \
+ -it \
+ --rm \
+ --gpus all \
+ --network host \
+ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlex-genai-vllm-server \
+ paddlex_genai_server --model_name PaddleOCR-VL-0.9B --host 0.0.0.0 --port 8118 --backend vllm
+```
+
+若您使用的是 NVIDIA 50 系显卡 (Compute Capability >= 12),需要在启动服务前安装指定版本的 FlashAttention:
+
+```bash
+docker run \
+ -it \
+ --rm \
+ --gpus all \
+ --network host \
+ ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/paddlex-genai-vllm-server \
+ /bin/bash
+python -m pip install flash-attn==2.8.3
+paddlex_genai_server --model_name PaddleOCR-VL-0.9B --backend vllm --port 8118
+```
+
+#### 3.1.2 通过 PaddleX CLI 安装和启动
+
+由于推理加速框架可能与飞桨框架存在依赖冲突,建议在虚拟环境中安装。示例如下:
+
+```bash
+# 创建虚拟环境
+python -m venv .venv
+# 激活环境
+source .venv/bin/activate
+# 安装 PaddleX
+python -m pip install paddlex
+# 安装 vLLM 服务器插件
+paddlex --install genai-vllm-server
+# 安装 SGLang 服务器插件
+# paddlex --install genai-sglang-server
+```
+
+若您使用的是 NVIDIA 50 系显卡 (Compute Capability >= 12),需要在启动服务前安装指定版本的 FlashAttention:
+
+```bash
+python -m pip install flash-attn==2.8.3
+```
+
+安装完成后,可通过 `paddlex_genai_server` 命令启动服务:
+
+```bash
+paddlex_genai_server --model_name PaddleOCR-VL-0.9B --backend vllm --port 8118
+```
+
+该命令支持的参数如下:
+
+| 参数 | 说明 |
+| ------------------ | ------------------------- |
+| `--model_name` | 模型名称 |
+| `--model_dir` | 模型目录 |
+| `--host` | 服务器主机名 |
+| `--port` | 服务器端口号 |
+| `--backend` | 后端名称,即使用的推理加速框架名称,可选 `vllm` 或 `sglang` |
+| `--backend_config` | 可指定 YAML 文件,包含后端配置 |
+
+### 3.2 客户端使用方法
+
+启动 VLM 推理服务后,客户端即可通过 PaddleX 调用该服务。在使用前,需要安装客户端插件:
+
+```bash
+paddlex --install genai-client
+```
+
+接着,获取产线配置文件:
+
+```bash
+paddlex --get_pipeline_config PaddleOCR-VL
+```
+
+配置文件的默认保存路径为 `PaddleOCR-VL.yaml`。将配置文件中的 `VLRecognition.genai_config.backend` 和 `VLRecognition.genai_config.server_url` 字段修改为与此前启动的服务相对应的值,例如:
+
+```yaml
+VLRecognition:
+ ...
+ genai_config:
+ backend: vllm-server
+ server_url: http://127.0.0.1:8118/v1
+```
+
+之后,可以使用修改好的配置文件进行产线调用。例如通过 CLI 调用:
+
+```bash
+paddlex --pipeline PaddleOCR-VL.yaml --input paddleocr_vl_demo.png
+```
+
+或通过 Python API 调用:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline("PaddleOCR-VL.yaml")
+
+for res in pipeline.predict("paddleocr_vl_demo.png"):
+ res.print()
+```
+
+### 3.3 性能调优
+
+默认配置是在单张 NVIDIA A100 上进行调优的,并假设客户端独占服务,因此可能不适用于其他环境。如果用户在实际使用中遇到性能问题,可以尝试以下优化方法。
+
+#### 3.3.1 服务端参数调整
+
+不同推理加速框架支持的参数不同,可参考各自官方文档了解可用参数及其调整时机:
+
+- [vLLM 官方参数调优指南](https://docs.vllm.ai/en/latest/configuration/optimization.html)
+- [SGLang 超参数调整文档](https://docs.sglang.ai/advanced_features/hyperparameter_tuning.html)
+
+PaddleX VLM 推理服务支持通过配置文件进行调参。以下示例展示如何调整 vLLM 服务器的 `gpu-memory-utilization` 和 `max-num-seqs` 参数:
+
+1. 创建 YAML 文件 `vllm_config.yaml`,内容如下:
+
+ ```yaml
+ gpu-memory-utilization: 0.3
+ max-num-seqs: 128
+ ```
+
+2. 启动服务时指定配置文件路径:
+
+ ```bash
+ paddlex_genai_server --model_name PaddleOCR-VL-0.9B --backend vllm --backend_config vllm_config.yaml
+ ```
+
+如果使用支持进程替换(process substitution)的 shell(如 Bash),也可以无需创建配置文件,直接在启动服务时传入配置项:
+
+```bash
+paddlex_genai_server --model_name PaddleOCR-VL-0.9B --backend vllm --backend_config <(echo -e 'gpu-memory-utilization: 0.3\nmax-num-seqs: 128')
+```
+
+#### 3.3.2 客户端参数调整
+
+PaddleX 会将来自单张或多张输入图像中的子图分组并对服务器发起并发请求,因此并发请求数对性能影响显著。用户可通过修改配置文件中 `VLRecognition.genai_config.max_concurrency` 字段设置最大并发请求数。
+
+当客户端与 VLM 推理服务为 1 对 1 且服务端资源充足时,可适当增加并发数以提升性能;若服务端需支持多个客户端或计算资源有限,则应降低并发数,以避免资源过载导致服务异常。
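+
+例如,可以通过如下 Python 代码修改该字段(示意代码,假设配置文件为 `PaddleOCR-VL.yaml` 且已安装 PyYAML;`VLRecognition` 在配置文件中的层级请以实际文件为准):
+
+```python
+import yaml  # 假设已安装 PyYAML
+
+config_path = "PaddleOCR-VL.yaml"
+
+with open(config_path, "r", encoding="utf-8") as f:
+    config = yaml.safe_load(f)
+
+# VLRecognition 可能位于顶层或 SubModules 下,以实际配置文件结构为准
+vl_rec = config.get("VLRecognition") or config.get("SubModules", {}).get("VLRecognition")
+vl_rec.setdefault("genai_config", {})["max_concurrency"] = 4  # 演示值,请按服务端资源情况设置
+
+with open(config_path, "w", encoding="utf-8") as f:
+    yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)
+```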
+
+#### 3.3.3 常用硬件性能调优建议
+
+以下配置均针对客户端与 VLM 推理服务为 1 对 1 的场景。
+
+**NVIDIA RTX 3060**
+
+- **服务端**
+ - vLLM:`gpu-memory-utilization=0.8`
+
+## 4. 开发集成/部署
+如果产线的推理速度和精度可以达到您的要求,您可以直接进行开发集成/部署。
+
+若您需要将产线直接应用在您的Python项目中,可以参考 [2.2 Python脚本方式](#22-python脚本方式集成)中的示例代码。
+
+此外,PaddleX 也提供了其他三种部署方式,详细说明如下:
+
+🚀 高性能推理:在实际生产环境中,许多应用对部署策略的性能指标(尤其是响应速度)有着较严苛的标准,以确保系统的高效运行与用户体验的流畅性。为此,PaddleX 提供高性能推理插件,旨在对模型推理及前后处理进行深度性能优化,实现端到端流程的显著提速,详细的高性能推理流程请参考[PaddleX高性能推理指南](../../../pipeline_deploy/high_performance_inference.md)。
+
+☁️ 服务化部署:服务化部署是实际生产环境中常见的一种部署形式。通过将推理功能封装为服务,客户端可以通过网络请求来访问这些服务,以获取推理结果。PaddleX 支持多种产线服务化部署方案,详细的产线服务化部署流程请参考[PaddleX服务化部署指南](../../../pipeline_deploy/serving.md)。
+
+以下是基础服务化部署的API参考与多语言服务调用示例:
+
+**API参考**
+对于服务提供的主要操作:
+
+- HTTP请求方法为POST。
+- 请求体和响应体均为JSON数据(JSON对象)。
+- 当请求处理成功时,响应状态码为`200`,响应体的属性如下:
+
+| 名称 | 类型 | 含义 |
+|---|---|---|
+| `logId` | `string` | 请求的UUID。 |
+| `errorCode` | `integer` | 错误码。固定为`0`。 |
+| `errorMsg` | `string` | 错误说明。固定为`"Success"`。 |
+| `result` | `object` | 操作结果。 |
+
+- 当请求未成功处理时,响应体的属性如下:
+
+| 名称 | 类型 | 含义 |
+|---|---|---|
+| `logId` | `string` | 请求的UUID。 |
+| `errorCode` | `integer` | 错误码。与响应状态码相同。 |
+| `errorMsg` | `string` | 错误说明。 |
+
+
+
+服务提供的主要操作如下:
+
+进行版面解析。
+
+`POST /layout-parsing`
+
+- 请求体的属性如下:
+
+| 名称 | 类型 | 含义 | 是否必填 |
+|---|---|---|---|
+| `file` | `string` | 服务器可访问的图像文件或PDF文件的URL,或上述类型文件内容的Base64编码结果。默认对于超过10页的PDF文件,只有前10页的内容会被处理。要解除页数限制,请在产线配置文件的 `Serving.extra` 中将 `max_num_input_imgs` 设置为 `null`。 | 是 |
+| `fileType` | `integer` \| `null` | 文件类型。`0`表示PDF文件,`1`表示图像文件。若请求体无此属性,则将根据URL推断文件类型。 | 否 |
+| `useDocOrientationClassify` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `use_doc_orientation_classify` 参数相关说明。 | 否 |
+| `useDocUnwarping` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `use_doc_unwarping` 参数相关说明。 | 否 |
+| `useLayoutDetection` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `use_layout_detection` 参数相关说明。 | 否 |
+| `useChartRecognition` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `use_chart_recognition` 参数相关说明。 | 否 |
+| `layoutThreshold` | `number` \| `object` \| `null` | 请参阅产线对象中 `predict` 方法的 `layout_threshold` 参数相关说明。 | 否 |
+| `layoutNms` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `layout_nms` 参数相关说明。 | 否 |
+| `layoutUnclipRatio` | `number` \| `array` \| `object` \| `null` | 请参阅产线对象中 `predict` 方法的 `layout_unclip_ratio` 参数相关说明。 | 否 |
+| `layoutMergeBboxesMode` | `string` \| `object` \| `null` | 请参阅产线对象中 `predict` 方法的 `layout_merge_bboxes_mode` 参数相关说明。 | 否 |
+| `promptLabel` | `string` \| `object` \| `null` | 请参阅产线对象中 `predict` 方法的 `prompt_label` 参数相关说明。 | 否 |
+| `formatBlockContent` | `boolean` \| `null` | 请参阅产线对象中 `predict` 方法的 `format_block_content` 参数相关说明。 | 否 |
+| `repetitionPenalty` | `number` \| `null` | 请参阅产线对象中 `predict` 方法的 `repetition_penalty` 参数相关说明。 | 否 |
+| `temperature` | `number` \| `null` | 请参阅产线对象中 `predict` 方法的 `temperature` 参数相关说明。 | 否 |
+| `topP` | `number` \| `null` | 请参阅产线对象中 `predict` 方法的 `top_p` 参数相关说明。 | 否 |
+| `minPixels` | `number` \| `null` | 请参阅产线对象中 `predict` 方法的 `min_pixels` 参数相关说明。 | 否 |
+| `maxPixels` | `number` \| `null` | 请参阅产线对象中 `predict` 方法的 `max_pixels` 参数相关说明。 | 否 |
+| `prettifyMarkdown` | `boolean` | 是否输出美化后的 Markdown 文本。默认为 `true`。 | 否 |
+| `showFormulaNumber` | `boolean` | 输出的 Markdown 文本中是否包含公式编号。默认为 `false`。 | 否 |
+| `visualize` | `boolean` \| `null` | 是否返回可视化结果图以及处理过程中的中间图像等。传入 `true`:返回图像;传入 `false`:不返回图像;若请求体中未提供该参数或传入 `null`:遵循产线配置文件 `Serving.visualize` 的设置。例如,在产线配置文件的 `Serving` 下设置 `visualize: False`,将默认不返回图像,通过请求体中的 `visualize` 参数可以覆盖默认行为;如果请求体和配置文件中均未设置(或请求体传入 `null`、配置文件中未设置),则默认返回图像。 | 否 |
+
+
+
+
+- 请求处理成功时,响应体的 `result` 具有如下属性:
+
+| 名称 | 类型 | 含义 |
+|---|---|---|
+| `layoutParsingResults` | `array` | 版面解析结果。数组长度为1(对于图像输入)或实际处理的文档页数(对于PDF输入)。对于PDF输入,数组中的每个元素依次表示PDF文件中实际处理的每一页的结果。 |
+| `dataInfo` | `object` | 输入数据信息。 |
+
+`layoutParsingResults` 中的每个元素为一个 `object`,具有如下属性:
+
+| 名称 | 类型 | 含义 |
+|---|---|---|
+| `prunedResult` | `object` | 产线对象的 `predict` 方法生成结果的 JSON 表示中 `res` 字段的简化版本,其中去除了 `input_path` 和 `page_index` 字段。 |
+| `markdown` | `object` | Markdown结果。 |
+| `outputImages` | `object` \| `null` | 参见产线预测结果的 `img` 属性说明。图像为JPEG格式,使用Base64编码。 |
+| `inputImage` | `string` \| `null` | 输入图像。图像为JPEG格式,使用Base64编码。 |
+
+`markdown` 为一个 `object`,具有如下属性:
+
+| 名称 | 类型 | 含义 |
+|---|---|---|
+| `text` | `string` | Markdown文本。 |
+| `images` | `object` | Markdown图片相对路径和Base64编码图像的键值对。 |
+| `isStart` | `boolean` | 当前页面第一个元素是否为段开始。 |
+| `isEnd` | `boolean` | 当前页面最后一个元素是否为段结束。 |
+
+
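+在客户端,可以利用 `isStart` 与 `isEnd` 标志将多页 Markdown 结果拼接为完整文档。下面是一个简化的 Python 示意(假设 `layout_parsing_results` 为响应中 `layoutParsingResults` 解析得到的列表,拼接策略仅供参考):
+
+```python
+def concatenate_pages(layout_parsing_results):
+    """根据每页 markdown 的 isStart/isEnd 标志拼接多页文本(简化示意)。"""
+    parts = []
+    prev_is_end = True
+    for res in layout_parsing_results:
+        md = res["markdown"]
+        text = md["text"]
+        # 上一页末尾不是段结束,且当前页开头不是段开始:认为同一段被分页截断,直接衔接
+        if parts and not prev_is_end and not md["isStart"]:
+            parts[-1] = parts[-1].rstrip() + text.lstrip()
+        else:
+            parts.append(text)
+        prev_is_end = md["isEnd"]
+    return "\n\n".join(parts)
+```
+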
+**多语言调用服务示例**
+
+**Python**
+
+```python
+import base64
+import requests
+import pathlib
+
+API_URL = "http://localhost:8080/layout-parsing" # 服务URL
+
+image_path = "./demo.jpg"
+
+# 对本地图像进行Base64编码
+with open(image_path, "rb") as file:
+ image_bytes = file.read()
+ image_data = base64.b64encode(image_bytes).decode("ascii")
+
+payload = {
+ "file": image_data, # Base64编码的文件内容或者文件URL
+ "fileType": 1, # 文件类型,1表示图像文件
+}
+
+# 调用API
+response = requests.post(API_URL, json=payload)
+
+# 处理接口返回数据
+assert response.status_code == 200
+result = response.json()["result"]
+for i, res in enumerate(result["layoutParsingResults"]):
+ print(res["prunedResult"])
+ md_dir = pathlib.Path(f"markdown_{i}")
+ md_dir.mkdir(exist_ok=True)
+ (md_dir / "doc.md").write_text(res["markdown"]["text"])
+ for img_path, img in res["markdown"]["images"].items():
+ img_path = md_dir / img_path
+ img_path.parent.mkdir(parents=True, exist_ok=True)
+ img_path.write_bytes(base64.b64decode(img))
+ print(f"Markdown document saved at {md_dir / 'doc.md'}")
+ for img_name, img in res["outputImages"].items():
+ img_path = f"{img_name}_{i}.jpg"
+ pathlib.Path(img_path).parent.mkdir(exist_ok=True)
+ with open(img_path, "wb") as f:
+ f.write(base64.b64decode(img))
+ print(f"Output image saved at {img_path}")
+```
+
+**C++**
+
+```cpp
+#include <iostream>
+#include <filesystem>
+#include <fstream>
+#include <vector>
+#include <string>
+#include "cpp-httplib/httplib.h" // https://github.com/Huiyicc/cpp-httplib
+#include "nlohmann/json.hpp" // https://github.com/nlohmann/json
+#include "base64.hpp" // https://github.com/tobiaslocker/base64
+
+namespace fs = std::filesystem;
+
+int main() {
+ httplib::Client client("localhost", 8080);
+
+ const std::string filePath = "./demo.jpg";
+
+ std::ifstream file(filePath, std::ios::binary | std::ios::ate);
+ if (!file) {
+ std::cerr << "Error opening file: " << filePath << std::endl;
+ return 1;
+ }
+
+ std::streamsize size = file.tellg();
+ file.seekg(0, std::ios::beg);
+    std::vector<char> buffer(size);
+ if (!file.read(buffer.data(), size)) {
+ std::cerr << "Error reading file." << std::endl;
+ return 1;
+ }
+
+    std::string bufferStr(buffer.data(), static_cast<size_t>(size));
+ std::string encodedFile = base64::to_base64(bufferStr);
+
+ nlohmann::json jsonObj;
+ jsonObj["file"] = encodedFile;
+ jsonObj["fileType"] = 1;
+
+ auto response = client.Post("/layout-parsing", jsonObj.dump(), "application/json");
+
+ if (response && response->status == 200) {
+ nlohmann::json jsonResponse = nlohmann::json::parse(response->body);
+ auto result = jsonResponse["result"];
+
+ if (!result.is_object() || !result.contains("layoutParsingResults")) {
+ std::cerr << "Unexpected response format." << std::endl;
+ return 1;
+ }
+
+ const auto& results = result["layoutParsingResults"];
+ for (size_t i = 0; i < results.size(); ++i) {
+ const auto& res = results[i];
+
+ if (res.contains("prunedResult")) {
+ std::cout << "Layout result [" << i << "]: " << res["prunedResult"].dump() << std::endl;
+ }
+
+ if (res.contains("outputImages") && res["outputImages"].is_object()) {
+ for (auto& [imgName, imgBase64] : res["outputImages"].items()) {
+ std::string outputPath = imgName + "_" + std::to_string(i) + ".jpg";
+ fs::path pathObj(outputPath);
+ fs::path parentDir = pathObj.parent_path();
+ if (!parentDir.empty() && !fs::exists(parentDir)) {
+ fs::create_directories(parentDir);
+ }
+
+                    std::string decodedImage = base64::from_base64(imgBase64.get<std::string>());
+
+ std::ofstream outFile(outputPath, std::ios::binary);
+ if (outFile.is_open()) {
+ outFile.write(decodedImage.c_str(), decodedImage.size());
+ outFile.close();
+ std::cout << "Saved image: " << outputPath << std::endl;
+ } else {
+ std::cerr << "Failed to save image: " << outputPath << std::endl;
+ }
+ }
+ }
+ }
+ } else {
+ std::cerr << "Request failed." << std::endl;
+ if (response) {
+ std::cerr << "HTTP status: " << response->status << std::endl;
+ std::cerr << "Response body: " << response->body << std::endl;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+```
+
+**Java**
+
+```java
+import okhttp3.*;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.Base64;
+import java.nio.file.Paths;
+import java.nio.file.Files;
+
+public class Main {
+ public static void main(String[] args) throws IOException {
+ String API_URL = "http://localhost:8080/layout-parsing";
+ String imagePath = "./demo.jpg";
+
+ File file = new File(imagePath);
+ byte[] fileContent = java.nio.file.Files.readAllBytes(file.toPath());
+ String base64Image = Base64.getEncoder().encodeToString(fileContent);
+
+ ObjectMapper objectMapper = new ObjectMapper();
+ ObjectNode payload = objectMapper.createObjectNode();
+ payload.put("file", base64Image);
+ payload.put("fileType", 1);
+
+ OkHttpClient client = new OkHttpClient();
+ MediaType JSON = MediaType.get("application/json; charset=utf-8");
+
+ RequestBody body = RequestBody.create(JSON, payload.toString());
+
+ Request request = new Request.Builder()
+ .url(API_URL)
+ .post(body)
+ .build();
+
+ try (Response response = client.newCall(request).execute()) {
+ if (response.isSuccessful()) {
+ String responseBody = response.body().string();
+ JsonNode root = objectMapper.readTree(responseBody);
+ JsonNode result = root.get("result");
+
+ JsonNode layoutParsingResults = result.get("layoutParsingResults");
+ for (int i = 0; i < layoutParsingResults.size(); i++) {
+ JsonNode item = layoutParsingResults.get(i);
+ int finalI = i;
+ JsonNode prunedResult = item.get("prunedResult");
+ System.out.println("Pruned Result [" + i + "]: " + prunedResult.toString());
+
+ JsonNode outputImages = item.get("outputImages");
+ outputImages.fieldNames().forEachRemaining(imgName -> {
+ try {
+ String imgBase64 = outputImages.get(imgName).asText();
+ byte[] imgBytes = Base64.getDecoder().decode(imgBase64);
+ String imgPath = imgName + "_" + finalI + ".jpg";
+
+ File outputFile = new File(imgPath);
+ File parentDir = outputFile.getParentFile();
+ if (parentDir != null && !parentDir.exists()) {
+ parentDir.mkdirs();
+ System.out.println("Created directory: " + parentDir.getAbsolutePath());
+ }
+
+ try (FileOutputStream fos = new FileOutputStream(outputFile)) {
+ fos.write(imgBytes);
+ System.out.println("Saved image: " + imgPath);
+ }
+ } catch (IOException e) {
+ System.err.println("Failed to save image: " + e.getMessage());
+ }
+ });
+ }
+ } else {
+ System.err.println("Request failed with HTTP code: " + response.code());
+ }
+ }
+ }
+}
+```
+
+**Go**
+
+```go
+package main
+
+import (
+ "bytes"
+ "encoding/base64"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "net/http"
+ "os"
+ "path/filepath"
+)
+
+func main() {
+ API_URL := "http://localhost:8080/layout-parsing"
+ filePath := "./demo.jpg"
+
+ fileBytes, err := ioutil.ReadFile(filePath)
+ if err != nil {
+ fmt.Printf("Error reading file: %v\n", err)
+ return
+ }
+ fileData := base64.StdEncoding.EncodeToString(fileBytes)
+
+ payload := map[string]interface{}{
+ "file": fileData,
+ "fileType": 1,
+ }
+ payloadBytes, err := json.Marshal(payload)
+ if err != nil {
+ fmt.Printf("Error marshaling payload: %v\n", err)
+ return
+ }
+
+ client := &http.Client{}
+ req, err := http.NewRequest("POST", API_URL, bytes.NewBuffer(payloadBytes))
+ if err != nil {
+ fmt.Printf("Error creating request: %v\n", err)
+ return
+ }
+ req.Header.Set("Content-Type", "application/json")
+
+ res, err := client.Do(req)
+ if err != nil {
+ fmt.Printf("Error sending request: %v\n", err)
+ return
+ }
+ defer res.Body.Close()
+
+ if res.StatusCode != http.StatusOK {
+ fmt.Printf("Unexpected status code: %d\n", res.StatusCode)
+ return
+ }
+
+ body, err := ioutil.ReadAll(res.Body)
+ if err != nil {
+ fmt.Printf("Error reading response: %v\n", err)
+ return
+ }
+
+ type Markdown struct {
+ Text string `json:"text"`
+ Images map[string]string `json:"images"`
+ }
+
+ type LayoutResult struct {
+ PrunedResult map[string]interface{} `json:"prunedResult"`
+ Markdown Markdown `json:"markdown"`
+ OutputImages map[string]string `json:"outputImages"`
+ InputImage *string `json:"inputImage"`
+ }
+
+ type Response struct {
+ Result struct {
+ LayoutParsingResults []LayoutResult `json:"layoutParsingResults"`
+ DataInfo interface{} `json:"dataInfo"`
+ } `json:"result"`
+ }
+
+ var respData Response
+ if err := json.Unmarshal(body, &respData); err != nil {
+ fmt.Printf("Error parsing response: %v\n", err)
+ return
+ }
+
+ for i, res := range respData.Result.LayoutParsingResults {
+ fmt.Printf("Result %d - prunedResult: %+v\n", i, res.PrunedResult)
+
+ mdDir := fmt.Sprintf("markdown_%d", i)
+ os.MkdirAll(mdDir, 0755)
+ mdFile := filepath.Join(mdDir, "doc.md")
+ if err := os.WriteFile(mdFile, []byte(res.Markdown.Text), 0644); err != nil {
+ fmt.Printf("Error writing markdown file: %v\n", err)
+ } else {
+ fmt.Printf("Markdown document saved at %s\n", mdFile)
+ }
+
+ for path, imgBase64 := range res.Markdown.Images {
+ fullPath := filepath.Join(mdDir, path)
+ if err := os.MkdirAll(filepath.Dir(fullPath), 0755); err != nil {
+ fmt.Printf("Error creating directory for markdown image: %v\n", err)
+ continue
+ }
+ imgBytes, err := base64.StdEncoding.DecodeString(imgBase64)
+ if err != nil {
+ fmt.Printf("Error decoding markdown image: %v\n", err)
+ continue
+ }
+ if err := os.WriteFile(fullPath, imgBytes, 0644); err != nil {
+ fmt.Printf("Error saving markdown image: %v\n", err)
+ }
+ }
+
+ for name, imgBase64 := range res.OutputImages {
+ imgBytes, err := base64.StdEncoding.DecodeString(imgBase64)
+ if err != nil {
+ fmt.Printf("Error decoding output image %s: %v\n", name, err)
+ continue
+ }
+ filename := fmt.Sprintf("%s_%d.jpg", name, i)
+
+ if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
+ fmt.Printf("Error creating directory for output image: %v\n", err)
+ continue
+ }
+
+ if err := os.WriteFile(filename, imgBytes, 0644); err != nil {
+ fmt.Printf("Error saving output image %s: %v\n", filename, err)
+ } else {
+ fmt.Printf("Output image saved at %s\n", filename)
+ }
+ }
+ }
+}
+```
+
+**C#**
+
+```csharp
+using System;
+using System.IO;
+using System.Net.Http;
+using System.Text;
+using System.Threading.Tasks;
+using Newtonsoft.Json.Linq;
+
+class Program
+{
+ static readonly string API_URL = "http://localhost:8080/layout-parsing";
+ static readonly string inputFilePath = "./demo.jpg";
+
+ static async Task Main(string[] args)
+ {
+ var httpClient = new HttpClient();
+
+ byte[] fileBytes = File.ReadAllBytes(inputFilePath);
+ string fileData = Convert.ToBase64String(fileBytes);
+
+ var payload = new JObject
+ {
+ { "file", fileData },
+ { "fileType", 1 }
+ };
+ var content = new StringContent(payload.ToString(), Encoding.UTF8, "application/json");
+
+ HttpResponseMessage response = await httpClient.PostAsync(API_URL, content);
+ response.EnsureSuccessStatusCode();
+
+ string responseBody = await response.Content.ReadAsStringAsync();
+ JObject jsonResponse = JObject.Parse(responseBody);
+
+ JArray layoutParsingResults = (JArray)jsonResponse["result"]["layoutParsingResults"];
+ for (int i = 0; i < layoutParsingResults.Count; i++)
+ {
+ var res = layoutParsingResults[i];
+ Console.WriteLine($"[{i}] prunedResult:\n{res["prunedResult"]}");
+
+ JObject outputImages = res["outputImages"] as JObject;
+ if (outputImages != null)
+ {
+ foreach (var img in outputImages)
+ {
+ string imgName = img.Key;
+ string base64Img = img.Value?.ToString();
+ if (!string.IsNullOrEmpty(base64Img))
+ {
+ string imgPath = $"{imgName}_{i}.jpg";
+ byte[] imageBytes = Convert.FromBase64String(base64Img);
+
+ string directory = Path.GetDirectoryName(imgPath);
+ if (!string.IsNullOrEmpty(directory) && !Directory.Exists(directory))
+ {
+ Directory.CreateDirectory(directory);
+ Console.WriteLine($"Created directory: {directory}");
+ }
+
+ File.WriteAllBytes(imgPath, imageBytes);
+ Console.WriteLine($"Output image saved at {imgPath}");
+ }
+ }
+ }
+ }
+ }
+}
+```
+
+**Node.js**
+
+```javascript
+const axios = require('axios');
+const fs = require('fs');
+const path = require('path');
+
+const API_URL = 'http://localhost:8080/layout-parsing';
+const imagePath = './demo.jpg';
+const fileType = 1;
+
+function encodeImageToBase64(filePath) {
+ const bitmap = fs.readFileSync(filePath);
+ return Buffer.from(bitmap).toString('base64');
+}
+
+const payload = {
+ file: encodeImageToBase64(imagePath),
+ fileType: fileType
+};
+
+axios.post(API_URL, payload)
+ .then(response => {
+ const results = response.data.result.layoutParsingResults;
+ results.forEach((res, index) => {
+ console.log(`\n[${index}] prunedResult:`);
+ console.log(res.prunedResult);
+
+ const outputImages = res.outputImages;
+ if (outputImages) {
+ Object.entries(outputImages).forEach(([imgName, base64Img]) => {
+ const imgPath = `${imgName}_${index}.jpg`;
+
+ const directory = path.dirname(imgPath);
+ if (!fs.existsSync(directory)) {
+ fs.mkdirSync(directory, { recursive: true });
+ console.log(`Created directory: ${directory}`);
+ }
+
+ fs.writeFileSync(imgPath, Buffer.from(base64Img, 'base64'));
+ console.log(`Output image saved at ${imgPath}`);
+ });
+ } else {
+ console.log(`[${index}] No outputImages.`);
+ }
+ });
+ })
+ .catch(error => {
+ console.error('Error during API request:', error.message || error);
+ });
+```
+
+**PHP**
+
+```php
+<?php
+
+$API_URL = "http://localhost:8080/layout-parsing";
+$image_path = "./demo.jpg";
+
+$image_data = base64_encode(file_get_contents($image_path));
+$payload = array("file" => $image_data, "fileType" => 1);
+
+$ch = curl_init($API_URL);
+curl_setopt($ch, CURLOPT_POST, true);
+curl_setopt($ch, CURLOPT_POSTFIELDS, json_encode($payload));
+curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
+curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
+$response = curl_exec($ch);
+curl_close($ch);
+
+$result = json_decode($response, true)["result"]["layoutParsingResults"];
+
+foreach ($result as $i => $item) {
+ echo "[$i] prunedResult:\n";
+ print_r($item["prunedResult"]);
+
+ if (!empty($item["outputImages"])) {
+ foreach ($item["outputImages"] as $img_name => $img_base64) {
+ $output_image_path = "{$img_name}_{$i}.jpg";
+
+ $directory = dirname($output_image_path);
+ if (!is_dir($directory)) {
+ mkdir($directory, 0777, true);
+ echo "Created directory: $directory\n";
+ }
+
+ file_put_contents($output_image_path, base64_decode($img_base64));
+ echo "Output image saved at $output_image_path\n";
+ }
+ } else {
+ echo "No outputImages found for item $i\n";
+ }
+}
+?>
+```
+
+
+
+📱 端侧部署:端侧部署是一种将计算和数据处理功能放在用户设备本身上的方式,设备可以直接处理数据,而不需要依赖远程的服务器。PaddleX 支持将模型部署在 Android 等端侧设备上,详细的端侧部署流程请参考[PaddleX端侧部署指南](../../../pipeline_deploy/on_device_deployment.md)。
+您可以根据需要选择合适的方式部署模型产线,进而进行后续的 AI 应用集成。
+
+## 5. 二次开发
+如果 PaddleOCR-VL 产线提供的默认模型权重在您的场景中精度或速度不满意,您可以尝试利用您自己拥有的特定领域或应用场景的数据对现有模型进行进一步的微调,以提升该产线在您的场景中的识别效果。
+
+### 5.1 模型微调
+
+由于 PaddleOCR-VL 产线包含若干模块,模型产线的效果不及预期可能来自于其中任何一个模块。您可以对提取效果差的 case 进行分析,通过可视化图像,确定是哪个模块存在问题,并参考以下表格中对应的微调教程链接进行模型微调。
+
+| 情形 | 微调模块 | 微调参考链接 |
+|---|---|---|
+| 版面区域检测不准,如印章、表格未检出等 | 版面区域检测模块 | 链接 |
+| 表格结构识别不准 | 表格结构识别模块 | 链接 |
+| 公式识别不准 | 公式识别模块 | 链接 |
+| 印章文本存在漏检 | 印章文本检测模块 | 链接 |
+| 文本存在漏检 | 文本检测模块 | 链接 |
+| 文本内容都不准 | 文本识别模块 | 链接 |
+| 垂直或者旋转文本行矫正不准 | 文本行方向分类模块 | 链接 |
+| 整图旋转矫正不准 | 文档图像方向分类模块 | 链接 |
+| 图像扭曲矫正不准 | 文本图像矫正模块 | 暂不支持微调 |
+
+
+
+
+### 5.2 模型应用
+当您使用私有数据集完成微调训练后,可获得本地模型权重文件。
+
+若您需要使用微调后的模型权重,只需对产线配置文件做修改,将微调后模型权重的本地路径替换至产线配置文件中的对应位置即可:
+
+```yaml
+......
+SubModules:
+ LayoutDetection:
+ module_name: layout_detection
+ model_name: PP-DocLayout_plus-L
+ model_dir: null # 替换为微调后的版面区域检测模型权重路径
+......
+SubPipelines:
+ GeneralOCR:
+ pipeline_name: OCR
+ text_type: general
+ use_doc_preprocessor: False
+ use_textline_orientation: False
+ SubModules:
+ TextDetection:
+ module_name: text_detection
+ model_name: PP-OCRv5_server_det
+      model_dir: null # 替换为微调后的文本检测模型权重路径
+ limit_side_len: 960
+ limit_type: max
+ max_side_limit: 4000
+ thresh: 0.3
+ box_thresh: 0.6
+ unclip_ratio: 1.5
+
+ TextRecognition:
+ module_name: text_recognition
+ model_name: PP-OCRv5_server_rec
+ model_dir: null # 替换为微调后的文本识别模型权重路径
+ batch_size: 1
+ score_thresh: 0
+......
+```
+随后, 参考本地体验中的命令行方式或 Python 脚本方式,加载修改后的产线配置文件即可。
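+
+例如,可以按如下方式在 Python 脚本中加载修改后的配置文件(配置文件路径为假设值):
+
+```python
+from paddlex import create_pipeline
+
+# 假设修改后的配置文件保存为 ./PaddleOCR-VL.yaml
+pipeline = create_pipeline(pipeline="./PaddleOCR-VL.yaml")
+
+for res in pipeline.predict("paddleocr_vl_demo.png"):
+    res.save_to_markdown(save_path="output")
+```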
+
+## 6. 多硬件支持
+PaddleX 支持英伟达 GPU、昆仑芯 XPU、昇腾 NPU和寒武纪 MLU 等多种主流硬件设备,仅需修改 `--device`参数即可完成不同硬件之间的无缝切换。
+
+例如,您使用昇腾 NPU 进行 PaddleOCR-VL 产线的推理,使用的 CLI 命令为:
+
+```bash
+paddlex --pipeline PaddleOCR-VL \
+    --input paddleocr_vl_demo.png \
+    --use_doc_orientation_classify False \
+    --use_doc_unwarping False \
+ --save_path ./output \
+ --device npu:0
+```
+
+当然,您也可以在 Python 脚本中 `create_pipeline()` 时或者 `predict()` 时指定硬件设备。
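+
+例如,下面的示例在创建产线对象时指定昇腾 NPU(设备编号仅为示意):
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="PaddleOCR-VL", device="npu:0")
+
+for res in pipeline.predict("paddleocr_vl_demo.png"):
+    res.save_to_json(save_path="output")
+```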
+
+若您想在更多种类的硬件上使用 PaddleOCR-VL 产线,请参考[PaddleX多硬件使用指南](../../../other_devices_support/multi_devices_use_guide.md)。
diff --git a/mkdocs.yml b/mkdocs.yml
index 338e0f4225..907cb87bad 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -107,6 +107,7 @@ plugins:
公式识别产线: Formula Recognition
印章文本识别产线: Seal Recognition
文档图像预处理产线: Document Image Preprocessing
+ PaddleOCR-VL产线: PaddleOCR-VL
计算机视觉: Computer Vision
通用图像分类: General Image Classification
通用目标检测: General Object Detection
@@ -339,6 +340,7 @@ nav:
- 公式识别产线: pipeline_usage/tutorials/ocr_pipelines/formula_recognition.md
- 印章文本识别产线: pipeline_usage/tutorials/ocr_pipelines/seal_recognition.md
- 文档图像预处理产线: pipeline_usage/tutorials/ocr_pipelines/doc_preprocessor.md
+ - PaddleOCR-VL产线: pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md
- 计算机视觉:
- 通用图像分类: pipeline_usage/tutorials/cv_pipelines/image_classification.md
- 通用目标检测: pipeline_usage/tutorials/cv_pipelines/object_detection.md
diff --git a/paddlex/.version b/paddlex/.version
index 944880fa15..15a2799817 100644
--- a/paddlex/.version
+++ b/paddlex/.version
@@ -1 +1 @@
-3.2.0
+3.3.0
diff --git a/paddlex/configs/modules/text_recognition/arabic_PP-OCRv5_mobile_rec.yaml b/paddlex/configs/modules/text_recognition/arabic_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..edd9ff22dd
--- /dev/null
+++ b/paddlex/configs/modules/text_recognition/arabic_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,39 @@
+Global:
+ model: arabic_PP-OCRv5_mobile_rec
+ mode: check_dataset # check_dataset/train/evaluate/predict
+ dataset_dir: "/paddle/dataset/paddlex/ocr_rec/ocr_rec_dataset_examples"
+ device: gpu:0,1,2,3
+ output: "output"
+
+CheckDataset:
+ convert:
+ enable: False
+ src_dataset_type: null
+ split:
+ enable: False
+ train_percent: null
+ val_percent: null
+
+Train:
+ epochs_iters: 20
+ batch_size: 8
+ learning_rate: 0.001
+ pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/arabic_PP-OCRv5_mobile_rec_pretrained.pdparams
+ resume_path: null
+ log_interval: 20
+ eval_interval: 1
+ save_interval: 1
+
+Evaluate:
+ weight_path: "output/best_accuracy/best_accuracy.pdparams"
+ log_interval: 1
+
+Export:
+ weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/arabic_PP-OCRv5_mobile_rec_pretrained.pdparams
+
+Predict:
+ batch_size: 1
+ model_dir: "output/best_accuracy/inference"
+ input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_011_arabic.png"
+ kernel_option:
+ run_mode: paddle
diff --git a/paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv5_mobile_rec.yaml b/paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..53c607d275
--- /dev/null
+++ b/paddlex/configs/modules/text_recognition/cyrillic_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,39 @@
+Global:
+ model: cyrillic_PP-OCRv5_mobile_rec
+ mode: check_dataset # check_dataset/train/evaluate/predict
+ dataset_dir: "/paddle/dataset/paddlex/ocr_rec/ocr_rec_dataset_examples"
+ device: gpu:0,1,2,3
+ output: "output"
+
+CheckDataset:
+ convert:
+ enable: False
+ src_dataset_type: null
+ split:
+ enable: False
+ train_percent: null
+ val_percent: null
+
+Train:
+ epochs_iters: 20
+ batch_size: 8
+ learning_rate: 0.001
+ pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/cyrillic_PP-OCRv5_mobile_rec_pretrained.pdparams
+ resume_path: null
+ log_interval: 20
+ eval_interval: 1
+ save_interval: 1
+
+Evaluate:
+ weight_path: "output/best_accuracy/best_accuracy.pdparams"
+ log_interval: 1
+
+Export:
+ weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/cyrillic_PP-OCRv5_mobile_rec_pretrained.pdparams
+
+Predict:
+ batch_size: 1
+ model_dir: "output/best_accuracy/inference"
+ input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_011_cyrillic.png"
+ kernel_option:
+ run_mode: paddle
diff --git a/paddlex/configs/modules/text_recognition/devanagari_PP-OCRv5_mobile_rec.yaml b/paddlex/configs/modules/text_recognition/devanagari_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..61947a2350
--- /dev/null
+++ b/paddlex/configs/modules/text_recognition/devanagari_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,39 @@
+Global:
+ model: devanagari_PP-OCRv5_mobile_rec
+ mode: check_dataset # check_dataset/train/evaluate/predict
+ dataset_dir: "/paddle/dataset/paddlex/ocr_rec/ocr_rec_dataset_examples"
+ device: gpu:0,1,2,3
+ output: "output"
+
+CheckDataset:
+ convert:
+ enable: False
+ src_dataset_type: null
+ split:
+ enable: False
+ train_percent: null
+ val_percent: null
+
+Train:
+ epochs_iters: 20
+ batch_size: 8
+ learning_rate: 0.001
+ pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/devanagari_PP-OCRv5_mobile_rec_pretrained.pdparams
+ resume_path: null
+ log_interval: 20
+ eval_interval: 1
+ save_interval: 1
+
+Evaluate:
+ weight_path: "output/best_accuracy/best_accuracy.pdparams"
+ log_interval: 1
+
+Export:
+ weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/devanagari_PP-OCRv5_mobile_rec_pretrained.pdparams
+
+Predict:
+ batch_size: 1
+ model_dir: "output/best_accuracy/inference"
+ input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_012_devanagari.png"
+ kernel_option:
+ run_mode: paddle
diff --git a/paddlex/configs/modules/text_recognition/ta_PP-OCRv5_mobile_rec.yaml b/paddlex/configs/modules/text_recognition/ta_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..6167892847
--- /dev/null
+++ b/paddlex/configs/modules/text_recognition/ta_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,39 @@
+Global:
+ model: ta_PP-OCRv5_mobile_rec
+ mode: check_dataset # check_dataset/train/evaluate/predict
+ dataset_dir: "/paddle/dataset/paddlex/ocr_rec/ocr_rec_dataset_examples"
+ device: gpu:0,1,2,3
+ output: "output"
+
+CheckDataset:
+ convert:
+ enable: False
+ src_dataset_type: null
+ split:
+ enable: False
+ train_percent: null
+ val_percent: null
+
+Train:
+ epochs_iters: 20
+ batch_size: 8
+ learning_rate: 0.001
+ pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/ta_PP-OCRv5_mobile_rec_pretrained.pdparams
+ resume_path: null
+ log_interval: 20
+ eval_interval: 1
+ save_interval: 1
+
+Evaluate:
+ weight_path: "output/best_accuracy/best_accuracy.pdparams"
+ log_interval: 1
+
+Export:
+ weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/ta_PP-OCRv5_mobile_rec_pretrained.pdparams
+
+Predict:
+ batch_size: 1
+ model_dir: "output/best_accuracy/inference"
+ input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_008_ta.png"
+ kernel_option:
+ run_mode: paddle
diff --git a/paddlex/configs/modules/text_recognition/te_PP-OCRv5_mobile_rec.yaml b/paddlex/configs/modules/text_recognition/te_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..25fc3f8d27
--- /dev/null
+++ b/paddlex/configs/modules/text_recognition/te_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,39 @@
+Global:
+ model: te_PP-OCRv5_mobile_rec
+ mode: check_dataset # check_dataset/train/evaluate/predict
+ dataset_dir: "/paddle/dataset/paddlex/ocr_rec/ocr_rec_dataset_examples"
+ device: gpu:0,1,2,3
+ output: "output"
+
+CheckDataset:
+ convert:
+ enable: False
+ src_dataset_type: null
+ split:
+ enable: False
+ train_percent: null
+ val_percent: null
+
+Train:
+ epochs_iters: 20
+ batch_size: 8
+ learning_rate: 0.001
+ pretrain_weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/te_PP-OCRv5_mobile_rec_pretrained.pdparams
+ resume_path: null
+ log_interval: 20
+ eval_interval: 1
+ save_interval: 1
+
+Evaluate:
+ weight_path: "output/best_accuracy/best_accuracy.pdparams"
+ log_interval: 1
+
+Export:
+ weight_path: https://paddle-model-ecology.bj.bcebos.com/paddlex/official_pretrained_model/te_PP-OCRv5_mobile_rec_pretrained.pdparams
+
+Predict:
+ batch_size: 1
+ model_dir: "output/best_accuracy/inference"
+ input: "https://paddle-model-ecology.bj.bcebos.com/paddlex/imgs/demo_image/general_ocr_rec_006_te.png"
+ kernel_option:
+ run_mode: paddle
diff --git a/paddlex/configs/pipelines/PP-StructureV3.yaml b/paddlex/configs/pipelines/PP-StructureV3.yaml
index 74b421fab3..a93952771a 100644
--- a/paddlex/configs/pipelines/PP-StructureV3.yaml
+++ b/paddlex/configs/pipelines/PP-StructureV3.yaml
@@ -9,6 +9,7 @@ use_table_recognition: True
use_formula_recognition: True
use_chart_recognition: False
use_region_detection: True
+format_block_content: False
SubModules:
LayoutDetection:
diff --git a/paddlex/configs/pipelines/PaddleOCR-VL.yaml b/paddlex/configs/pipelines/PaddleOCR-VL.yaml
new file mode 100644
index 0000000000..43335a9def
--- /dev/null
+++ b/paddlex/configs/pipelines/PaddleOCR-VL.yaml
@@ -0,0 +1,96 @@
+
+pipeline_name: PaddleOCR-VL
+
+batch_size: 64
+
+use_queues: True
+
+use_doc_preprocessor: False
+use_layout_detection: True
+use_chart_recognition: False
+format_block_content: False
+
+SubModules:
+ LayoutDetection:
+ module_name: layout_detection
+ model_name: PP-DocLayoutV2
+ model_dir: null
+ batch_size: 8
+ threshold:
+ 0: 0.5 # abstract
+ 1: 0.5 # algorithm
+ 2: 0.5 # aside_text
+ 3: 0.5 # chart
+ 4: 0.5 # content
+ 5: 0.4 # formula
+ 6: 0.4 # doc_title
+ 7: 0.5 # figure_title
+ 8: 0.5 # footer
+ 9: 0.5 # footer
+ 10: 0.5 # footnote
+ 11: 0.5 # formula_number
+ 12: 0.5 # header
+ 13: 0.5 # header
+ 14: 0.5 # image
+ 15: 0.4 # formula
+ 16: 0.5 # number
+ 17: 0.4 # paragraph_title
+ 18: 0.5 # reference
+ 19: 0.5 # reference_content
+ 20: 0.45 # seal
+ 21: 0.5 # table
+ 22: 0.4 # text
+ 23: 0.4 # text
+ 24: 0.5 # vision_footnote
+ layout_nms: True
+ layout_unclip_ratio: [1.0, 1.0]
+ layout_merge_bboxes_mode:
+ 0: "union" # abstract
+ 1: "union" # algorithm
+ 2: "union" # aside_text
+ 3: "large" # chart
+ 4: "union" # content
+ 5: "large" # display_formula
+ 6: "large" # doc_title
+ 7: "union" # figure_title
+ 8: "union" # footer
+ 9: "union" # footer
+ 10: "union" # footnote
+ 11: "union" # formula_number
+ 12: "union" # header
+ 13: "union" # header
+ 14: "union" # image
+ 15: "large" # inline_formula
+ 16: "union" # number
+ 17: "large" # paragraph_title
+ 18: "union" # reference
+ 19: "union" # reference_content
+ 20: "union" # seal
+ 21: "union" # table
+ 22: "union" # text
+ 23: "union" # text
+ 24: "union" # vision_footnote
+ VLRecognition:
+ module_name: vl_recognition
+ model_name: PaddleOCR-VL-0.9B
+ model_dir: null
+ batch_size: 2048
+ genai_config:
+ backend: native
+
+SubPipelines:
+ DocPreprocessor:
+ pipeline_name: doc_preprocessor
+ batch_size: 8
+ use_doc_orientation_classify: True
+ use_doc_unwarping: True
+ SubModules:
+ DocOrientationClassify:
+ module_name: doc_text_orientation
+ model_name: PP-LCNet_x1_0_doc_ori
+ model_dir: null
+ batch_size: 8
+ DocUnwarping:
+ module_name: image_unwarping
+ model_name: UVDoc
+ model_dir: null
diff --git a/paddlex/inference/common/result/mixin.py b/paddlex/inference/common/result/mixin.py
index 70f8164365..37f3ec1489 100644
--- a/paddlex/inference/common/result/mixin.py
+++ b/paddlex/inference/common/result/mixin.py
@@ -611,7 +611,9 @@ def __init__(self, *args: list, **kwargs: dict):
self._save_funcs.append(self.save_to_markdown)
@abstractmethod
- def _to_markdown(self, pretty=True) -> Dict[str, Union[str, Dict[str, Any]]]:
+ def _to_markdown(
+ self, pretty=True, show_formula_number=False
+ ) -> Dict[str, Union[str, Dict[str, Any]]]:
"""
Convert the result to markdown format.
@@ -632,7 +634,9 @@ def markdown(self) -> Dict[str, Union[str, Dict[str, Any]]]:
"""
return self._to_markdown()
- def save_to_markdown(self, save_path, pretty=True, *args, **kwargs) -> None:
+ def save_to_markdown(
+ self, save_path, pretty=True, show_formula_number=False, *args, **kwargs
+ ) -> None:
"""Save the markdown data to a file.
Args:
@@ -670,7 +674,7 @@ def _is_markdown_file(file_path) -> bool:
self._markdown_writer.write,
self._img_writer.write,
self.save_path,
- self._to_markdown(pretty=pretty),
+ self._to_markdown(pretty=pretty, show_formula_number=show_formula_number),
*args,
**kwargs,
)
diff --git a/paddlex/inference/genai/__init__.py b/paddlex/inference/genai/__init__.py
new file mode 100644
index 0000000000..3fb06b39fe
--- /dev/null
+++ b/paddlex/inference/genai/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...utils.deps import require_genai_engine_plugin
+
+require_genai_engine_plugin()
diff --git a/paddlex/inference/genai/backends/__init__.py b/paddlex/inference/genai/backends/__init__.py
new file mode 100644
index 0000000000..b64cf01fdc
--- /dev/null
+++ b/paddlex/inference/genai/backends/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlex/inference/genai/backends/fastdeploy.py b/paddlex/inference/genai/backends/fastdeploy.py
new file mode 100644
index 0000000000..e7f2d85f86
--- /dev/null
+++ b/paddlex/inference/genai/backends/fastdeploy.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+from ....utils.deps import require_genai_engine_plugin
+from ..configs.utils import (
+ backend_config_to_args,
+ set_config_defaults,
+ update_backend_config,
+)
+
+
+def run_fastdeploy_server(
+ host, port, model_name, model_dir, config, chat_template_path
+):
+ require_genai_engine_plugin("fastdeploy-server")
+
+ if chat_template_path:
+ set_config_defaults(config, {"chat-template": str(chat_template_path)})
+
+ update_backend_config(
+ config,
+ {
+ "model": model_dir,
+ "host": host,
+ "port": port,
+ },
+ )
+
+ args = backend_config_to_args(config)
+ sys.argv[1:] = args
+
+ from fastdeploy.entrypoints.openai.api_server import main as run
+
+ run()
diff --git a/paddlex/inference/genai/backends/sglang.py b/paddlex/inference/genai/backends/sglang.py
new file mode 100644
index 0000000000..5b28b7831a
--- /dev/null
+++ b/paddlex/inference/genai/backends/sglang.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import subprocess
+import sys
+import tempfile
+import textwrap
+
+from ....utils.deps import require_genai_engine_plugin
+
+
+def run_sglang_server(host, port, model_name, model_dir, config, chat_template_path):
+ require_genai_engine_plugin("sglang-server")
+
+ data = json.dumps(
+ {
+ "host": host,
+ "port": port,
+ "model_name": model_name,
+ "model_dir": model_dir,
+ "config": config,
+ "chat_template_path": str(chat_template_path),
+ }
+ )
+
+ # HACK
+ code = textwrap.dedent(
+ f"""
+ import json
+ import os
+
+ from paddlex.inference.genai.configs.utils import (
+ backend_config_to_args,
+ set_config_defaults,
+ update_backend_config,
+ )
+ from paddlex.inference.genai.models import get_model_components
+ from sglang.srt.configs.model_config import multimodal_model_archs
+ from sglang.srt.entrypoints.http_server import launch_server
+ from sglang.srt.managers.multimodal_processor import PROCESSOR_MAPPING
+ from sglang.srt.models.registry import ModelRegistry
+ from sglang.srt.server_args import prepare_server_args
+ from sglang.srt.utils import kill_process_tree
+
+ data = json.loads({repr(data)})
+
+ host = data["host"]
+ port = data["port"]
+ model_name = data["model_name"]
+ model_dir = data["model_dir"]
+ config = data["config"]
+ chat_template_path = data["chat_template_path"]
+
+ network_class, processor_class = get_model_components(model_name, "sglang")
+
+ ModelRegistry.models[network_class.__name__] = network_class
+ multimodal_model_archs.append(network_class.__name__)
+ PROCESSOR_MAPPING[network_class] = processor_class
+
+ set_config_defaults(config, {{"served-model-name": model_name}})
+
+ if chat_template_path:
+ set_config_defaults(config, {{"chat-template": chat_template_path}})
+
+ set_config_defaults(config, {{"enable-metrics": True}})
+
+ update_backend_config(
+ config,
+ {{
+ "model-path": model_dir,
+ "host": host,
+ "port": port,
+ }},
+ )
+
+ if __name__ == "__main__":
+ args = backend_config_to_args(config)
+
+ server_args = prepare_server_args(args)
+
+ try:
+ launch_server(server_args)
+ finally:
+ kill_process_tree(os.getpid(), include_parent=False)
+ """
+ )
+
+ with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
+ f.write(code)
+ script_path = f.name
+
+ try:
+ subprocess.check_call([sys.executable, script_path])
+ finally:
+ os.unlink(script_path)
diff --git a/paddlex/inference/genai/backends/vllm.py b/paddlex/inference/genai/backends/vllm.py
new file mode 100644
index 0000000000..35a1b77eaa
--- /dev/null
+++ b/paddlex/inference/genai/backends/vllm.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....utils.deps import is_genai_engine_plugin_available, require_genai_engine_plugin
+from ..configs.utils import (
+ backend_config_to_args,
+ set_config_defaults,
+ update_backend_config,
+)
+from ..models import ALL_MODEL_NAMES, get_model_components
+
+
+def register_models():
+ from vllm import ModelRegistry
+
+ if is_genai_engine_plugin_available("vllm-server"):
+ for model_name in ALL_MODEL_NAMES:
+ if model_name not in ModelRegistry.get_supported_archs():
+ net_cls, _ = get_model_components(model_name, "vllm")
+ ModelRegistry.register_model(net_cls.__name__, net_cls)
+
+
+def run_vllm_server(host, port, model_name, model_dir, config, chat_template_path):
+ require_genai_engine_plugin("vllm-server")
+
+ import uvloop
+ from vllm.entrypoints.openai.api_server import (
+ FlexibleArgumentParser,
+ cli_env_setup,
+ make_arg_parser,
+ run_server,
+ validate_parsed_serve_args,
+ )
+
+ cli_env_setup()
+ parser = FlexibleArgumentParser()
+ parser = make_arg_parser(parser)
+
+ set_config_defaults(config, {"served-model-name": model_name})
+
+ if chat_template_path:
+ set_config_defaults(config, {"chat-template": str(chat_template_path)})
+
+ update_backend_config(
+ config,
+ {
+ "model": model_dir,
+ "host": host,
+ "port": port,
+ },
+ )
+
+ args = backend_config_to_args(config)
+ args = parser.parse_args(args)
+ validate_parsed_serve_args(args)
+
+ uvloop.run(run_server(args))
diff --git a/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja
new file mode 100644
index 0000000000..116312d262
--- /dev/null
+++ b/paddlex/inference/genai/chat_templates/PaddleOCR-VL-0.9B.jinja
@@ -0,0 +1,46 @@
+{%- if not add_generation_prompt is defined -%}
+ {%- set add_generation_prompt = true -%}
+{%- endif -%}
+{%- if not cls_token is defined -%}
+ {%- set cls_token = "<|begin_of_sentence|>" -%}
+{%- endif -%}
+{%- if not sep_token is defined -%}
+ {%- set sep_token = "<|end_of_sentence|>" -%}
+{%- endif -%}
+{%- if not image_token is defined -%}
+ {%- set image_token = "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>" -%}
+{%- endif -%}
+{{- cls_token -}}
+{%- for message in messages -%}
+ {%- if message["role"] == "user" -%}
+ {{- "User: " -}}
+ {%- for content in message["content"] -%}
+ {%- if content["type"] == "image" -%}
+ {{ image_token }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- for content in message["content"] -%}
+ {%- if content["type"] == "text" -%}
+ {{ content["text"] }}
+ {%- endif -%}
+ {%- endfor -%}
+ {{ "\n" -}}
+ {%- elif message["role"] == "assistant" -%}
+ {{- "Assistant: " -}}
+ {%- for content in message["content"] -%}
+ {%- if content["type"] == "text" -%}
+ {{ content["text"] + "\n" }}
+ {%- endif -%}
+ {%- endfor -%}
+ {{ sep_token -}}
+ {%- elif message["role"] == "system" -%}
+ {%- for content in message["content"] -%}
+ {%- if content["type"] == "text" -%}
+ {{ content["text"] + "\n" }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+ {{- "Assistant: " -}}
+{%- endif -%}
diff --git a/paddlex/inference/genai/chat_templates/__init__.py b/paddlex/inference/genai/chat_templates/__init__.py
new file mode 100644
index 0000000000..b64cf01fdc
--- /dev/null
+++ b/paddlex/inference/genai/chat_templates/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlex/inference/genai/configs/__init__.py b/paddlex/inference/genai/configs/__init__.py
new file mode 100644
index 0000000000..b64cf01fdc
--- /dev/null
+++ b/paddlex/inference/genai/configs/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlex/inference/genai/configs/paddleocr_vl_09b.py b/paddlex/inference/genai/configs/paddleocr_vl_09b.py
new file mode 100644
index 0000000000..6755395c63
--- /dev/null
+++ b/paddlex/inference/genai/configs/paddleocr_vl_09b.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def get_config(backend):
+ if backend == "fastdeploy":
+ return {
+ "gpu-memory-utilization": 0.3,
+ "max-model-len": 16384,
+ "max-num-batched-tokens": 131072,
+ "max-num-seqs": 256,
+ }
+ elif backend == "vllm":
+ return {
+ "trust-remote-code": True,
+ "gpu-memory-utilization": 0.5,
+ "max-model-len": 16384,
+ "max-num-batched-tokens": 131072,
+ "api-server-count": 4,
+ }
+ elif backend == "sglang":
+ return {
+ "trust-remote-code": True,
+ "mem-fraction-static": 0.5,
+ "context-length": 16384,
+ "max-prefill-tokens": 131072,
+ }
+ else:
+ raise ValueError(f"Unsupported backend: {backend}")
diff --git a/paddlex/inference/genai/configs/utils.py b/paddlex/inference/genai/configs/utils.py
new file mode 100644
index 0000000000..9df90aa91a
--- /dev/null
+++ b/paddlex/inference/genai/configs/utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+
+
+def load_backend_config(config_path):
+ with open(config_path, "r", encoding="utf-8") as f:
+ config = yaml.safe_load(f)
+ return config
+
+
+def update_backend_config(config, overrides):
+ for k, v in overrides.items():
+ config[k] = v
+
+
+def set_config_defaults(config, defaults):
+ for k, v in defaults.items():
+ if k not in config:
+ config[k] = v
+
+
+def backend_config_to_args(config):
+ # Limited support: scalar values become "--key value" pairs, boolean True
+ # values become bare "--key" flags, and boolean False values are omitted.
+ args = []
+ for k, v in config.items():
+ if isinstance(v, bool) and not v:
+ continue
+ opt = "--" + k
+ args.append(opt)
+ if not isinstance(v, bool):
+ args.append(str(v))
+ return args
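+
+
+# Illustrative composition of these helpers, mirroring the backend launchers
+# (`model_name`, `host`, and `port` are placeholders):
+#
+#     set_config_defaults(config, {"served-model-name": model_name})
+#     update_backend_config(config, {"host": host, "port": port})
+#     args = backend_config_to_args(config)
+#     # e.g. {"host": "0.0.0.0", "port": 8080, "trust-remote-code": True}
+#     #   -> ["--host", "0.0.0.0", "--port", "8080", "--trust-remote-code"]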
diff --git a/paddlex/inference/genai/constants.py b/paddlex/inference/genai/constants.py
new file mode 100644
index 0000000000..ef8bfc87b0
--- /dev/null
+++ b/paddlex/inference/genai/constants.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+SUPPORTED_BACKENDS = ("fastdeploy", "vllm", "sglang")
+DEFAULT_BACKEND = "fastdeploy"
diff --git a/paddlex/inference/genai/models/__init__.py b/paddlex/inference/genai/models/__init__.py
new file mode 100644
index 0000000000..93b194c364
--- /dev/null
+++ b/paddlex/inference/genai/models/__init__.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import importlib
+import importlib.resources
+from pathlib import Path
+
+from ....utils import logging
+from ...utils.official_models import official_models
+from ..utils import check_backend, model_name_to_module_name
+
+NETWORK_CLASS_GETTER_KEY = "get_network_class"
+PROCESSOR_CLASS_GETTER_KEY = "get_processor_class"
+CONFIG_GETTER_KEY = "get_config"
+CHAT_TEMPLATE_PATH_GETTER_KEY = "get_chat_template_path"
+DEFAULT_CHAT_TEMPLATE_FILENAME = "chat_template.jinja"
+
+ALL_MODEL_NAMES = {"PaddleOCR-VL-0.9B"}
+
+
+def _check_model_name_and_backend(model_name, backend):
+ if model_name not in ALL_MODEL_NAMES:
+ raise ValueError(f"Unknown model: {model_name}")
+
+ check_backend(backend)
+
+
+def get_model_dir(model_name, backend):
+ _check_model_name_and_backend(model_name, backend)
+
+ try:
+ model_dir = official_models[model_name]
+ except Exception as e:
+ raise RuntimeError(
+ f"Could not prepare the official model for the {repr(model_name)} model with the {repr(backend)} backend."
+ ) from e
+
+ return str(model_dir)
+
+
+def get_model_components(model_name, backend):
+ def _get_component(getter_key):
+ if not hasattr(model_module, getter_key):
+ raise RuntimeError(f"`{model_module}` does not have `{getter_key}`")
+ getter = getattr(model_module, getter_key)
+ comp = getter(backend)
+ return comp
+
+ _check_model_name_and_backend(model_name, backend)
+
+ mod_name = model_name_to_module_name(model_name)
+
+ try:
+ model_module = importlib.import_module(f".{mod_name}", package=__package__)
+ except ModuleNotFoundError as e:
+ raise ValueError(f"Unknown model: {model_name}") from e
+
+ network_class = _get_component(NETWORK_CLASS_GETTER_KEY)
+
+ if backend == "sglang":
+ processor_class = _get_component(PROCESSOR_CLASS_GETTER_KEY)
+ else:
+ processor_class = None
+
+ return network_class, processor_class
+
+
+def get_default_config(model_name, backend):
+ _check_model_name_and_backend(model_name, backend)
+
+ mod_name = model_name_to_module_name(model_name)
+
+ try:
+ config_module = importlib.import_module(
+ f"..configs.{mod_name}", package=__package__
+ )
+ except ModuleNotFoundError:
+ logging.debug("No default configs were found for the model '%s'", model_name)
+ default_config = {}
+ else:
+ if not hasattr(config_module, CONFIG_GETTER_KEY):
+ raise RuntimeError(f"`{config_module}` does not have `{CONFIG_GETTER_KEY}`")
+ config_getter = getattr(config_module, CONFIG_GETTER_KEY)
+ default_config = config_getter(backend)
+
+ return default_config
+
+
+@contextlib.contextmanager
+def get_chat_template_path(model_name, backend, model_dir):
+ _check_model_name_and_backend(model_name, backend)
+
+ with importlib.resources.path(
+ "paddlex.inference.genai.chat_templates", f"{model_name}.jinja"
+ ) as chat_template_path:
+ if not chat_template_path.exists():
+ default_chat_template_path = Path(model_dir, DEFAULT_CHAT_TEMPLATE_FILENAME)
+ if (
+ default_chat_template_path.exists()
+ and default_chat_template_path.is_file()
+ ):
+ # TODO: Support symbolic links
+ yield default_chat_template_path
+ else:
+ logging.debug(
+ "No chat template was found for the model '%s' with the backend '%s'",
+ model_name,
+ backend,
+ )
+ yield None
+ else:
+ yield chat_template_path
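+
+
+# Illustrative usage, as in the backend launchers:
+#
+#     with get_chat_template_path(model_name, backend, model_dir) as path:
+#         # `path` is the packaged template for the model, the model
+#         # directory's chat_template.jinja, or None if neither exists.
+#         ...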
diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py b/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py
new file mode 100644
index 0000000000..266924f2be
--- /dev/null
+++ b/paddlex/inference/genai/models/paddleocr_vl_09b/__init__.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def get_network_class(backend):
+ if backend == "vllm":
+ from ._vllm import PaddleOCRVLForConditionalGeneration
+
+ return PaddleOCRVLForConditionalGeneration
+ elif backend == "sglang":
+ from ._sglang import PaddleOCRVLForConditionalGeneration
+
+ return PaddleOCRVLForConditionalGeneration
+ else:
+ raise ValueError(f"Unsupported backend: {backend}")
+
+
+def get_processor_class(backend):
+ if backend == "sglang":
+ from ._sglang import PaddleOCRVLImageProcessor
+
+ return PaddleOCRVLImageProcessor
+ else:
+ raise ValueError(f"Unsupported backend: {backend}")
diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/__init__.py b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/__init__.py
new file mode 100644
index 0000000000..ebad8afbb6
--- /dev/null
+++ b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .network import PaddleOCRVLForConditionalGeneration
+from .processor import PaddleOCRVLImageProcessor
diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/network.py b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/network.py
new file mode 100644
index 0000000000..373285f517
--- /dev/null
+++ b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/network.py
@@ -0,0 +1,817 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from collections.abc import Iterable
+from typing import List, Optional, Set, Tuple, Union
+
+import numpy as np
+
+from ......utils.deps import is_dep_available
+
+if all(
+ map(is_dep_available, ("einops", "torch", "transformers", "sglang", "flash-attn"))
+):
+ import torch
+ import torch.nn as nn
+ from einops import rearrange
+ from flash_attn import flash_attn_varlen_func
+ from sglang.srt.distributed import get_tensor_model_parallel_world_size
+ from sglang.srt.layers.activation import get_act_fn
+ from sglang.srt.layers.linear import (
+ ColumnParallelLinear,
+ QKVParallelLinear,
+ RowParallelLinear,
+ )
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
+ from sglang.srt.managers.mm_utils import (
+ MultiModalityDataPaddingPatternMultimodalTokens,
+ general_mm_embed_routine,
+ )
+ from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.models.ernie4 import Ernie4_5_ForCausalLM
+ from transformers.activations import GELUActivation
+ from transformers.modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+ )
+ from transformers.utils import torch_int
+
+ class Projector(nn.Module):
+
+ def __init__(
+ self,
+ text_config,
+ vision_config,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.text_config = text_config
+ self.vision_config = vision_config
+ self.merge_kernel_size = (2, 2)
+
+ self.hidden_size = (
+ self.vision_config.hidden_size
+ * self.merge_kernel_size[0]
+ * self.merge_kernel_size[1]
+ )
+
+ self.pre_norm = torch.nn.LayerNorm(
+ self.vision_config.hidden_size, eps=1e-05
+ )
+ self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+ self.act = GELUActivation()
+ self.linear_2 = nn.Linear(
+ self.hidden_size, self.text_config.hidden_size, bias=True
+ )
+
+ def forward(
+ self,
+ image_features: torch.Tensor,
+ image_grid_thw: List[Tuple[int, int, int]],
+ ) -> torch.Tensor:
+ m1, m2 = self.merge_kernel_size
+ if isinstance(image_features, (list, tuple)):
+ processed_features = list()
+ for image_feature, image_grid in zip(image_features, image_grid_thw):
+ image_feature = self.pre_norm(image_feature)
+ t, h, w = image_grid
+
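+ # Merge each m1 x m2 neighborhood of visual patches into a single token
+ # before projecting it into the language model's hidden size.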
+ image_feature = rearrange(
+ image_feature,
+ "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+ t=t,
+ h=h // m1,
+ p1=m1,
+ w=w // m2,
+ p2=m2,
+ )
+ hidden_states = self.linear_1(image_feature)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+ processed_features.append(hidden_states)
+
+ return processed_features
+
+ dims = image_features.shape[:-1]
+ dim = image_features.shape[-1]
+ image_features = image_features.view(np.prod(dims), dim)
+ hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+ hidden_states = self.linear_1(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+
+ return hidden_states.view(*dims, -1)
+
+ class SiglipVisionEmbeddings(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.image_size = config.image_size
+ self.patch_size = config.patch_size
+
+ self.patch_embedding = nn.Conv2d(
+ in_channels=config.num_channels,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ padding="valid",
+ )
+
+ self.num_patches = (self.image_size // self.patch_size) ** 2
+ self.num_positions = self.num_patches
+ self.cache_position_embedding = dict()
+ self.cache_position_count = dict()
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+ self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+ self.register_buffer(
+ "position_ids",
+ torch.arange(self.num_positions).expand((1, -1)),
+ persistent=False,
+ )
+
+ def interpolate_pos_encoding(
+ self,
+ embeddings: torch.Tensor,
+ height: int,
+ width: int,
+ is_after_patchify: bool = False,
+ ) -> torch.Tensor:
+
+ num_positions = self.position_embedding.weight.shape[0]
+
+ patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+ dim = embeddings.shape[-1]
+
+ if is_after_patchify:
+ new_height = height
+ new_width = width
+ else:
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(
+ 1, sqrt_num_positions, sqrt_num_positions, dim
+ )
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return patch_pos_embed
+
+ def fetch_position_embedding_lfu_cache(
+ self, embeddings, h, w, max_cache: int = 20
+ ):
+ grid = (h, w)
+ if grid in self.cache_position_embedding:
+ self.cache_position_count[grid] += 1
+ return self.cache_position_embedding[grid]
+
+ if len(self.cache_position_embedding) >= max_cache:
+ min_hit_grid = min(
+ self.cache_position_count,
+ key=self.cache_position_count.get,
+ )
+ self.cache_position_count.pop(min_hit_grid)
+ self.cache_position_embedding.pop(min_hit_grid)
+
+ position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+ self.cache_position_count[grid] = 1
+ self.cache_position_embedding[grid] = position_embedding
+ return position_embedding
+
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ position_ids: Optional[torch.Tensor] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ interpolate_pos_encoding=False,
+ ) -> torch.Tensor:
+ if pixel_values.dim() == 4:
+ pixel_values = pixel_values.unsqueeze(0)
+ if pixel_values.dim() == 5:
+ if position_ids is None:
+ raise ValueError(
+ "position_ids cannot be None when pixel_values.dim() is 5."
+ )
+ (
+ batch_size,
+ sequence_len,
+ channel,
+ height,
+ width,
+ ) = pixel_values.shape
+ target_dtype = self.patch_embedding.weight.dtype
+ pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+ embeddings = patch_embeds.flatten(-2).squeeze(-1)
+
+ if interpolate_pos_encoding and image_grid_thw is not None:
+ start = 0
+ tmp_embeddings = list()
+ for image_grid in image_grid_thw:
+ t, h, w = image_grid
+ end = start + t * h * w
+ image_embeddings = embeddings[start:end, :]
+ position_embedding = (
+ self.interpolate_pos_encoding(image_embeddings, h, w, True)
+ .squeeze(0)
+ .repeat(t, 1)
+ )
+ image_embeddings = image_embeddings + position_embedding
+ tmp_embeddings.append(image_embeddings)
+ start = end
+ embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
+ else:
+ embeddings = embeddings + self.packing_position_embedding(
+ position_ids
+ )
+ return embeddings
+ else:
+ raise ValueError(
+ "Unsupported pixel_values dimension:"
+ f" {pixel_values.dim()}. Expected 4 or 5."
+ )
+
+ def apply_rotary_pos_emb_flashatt(
+ q: torch.Tensor,
+ k: torch.Tensor,
+ cos: torch.Tensor,
+ sin: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
+
+ q_embed, k_embed = apply_rotary_pos_emb(q, k, cos, sin)
+ return q_embed, k_embed
+
+ class SiglipAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You
+ Need' paper."""
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+
+ hidden_size = config.hidden_size
+ self.hidden_size = config.hidden_size
+ tp_size = get_tensor_model_parallel_world_size()
+ self.total_num_heads = config.num_attention_heads
+ assert self.total_num_heads % tp_size == 0
+ self.num_heads = self.total_num_heads // tp_size
+ self.total_num_kv_heads = config.num_attention_heads
+ if self.total_num_kv_heads >= tp_size:
+ assert self.total_num_kv_heads % tp_size == 0
+ else:
+ assert tp_size % self.total_num_kv_heads == 0
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+ self.head_dim = config.hidden_size // self.total_num_heads
+ self.q_size = self.num_heads * self.head_dim
+ self.kv_size = self.num_kv_heads * self.head_dim
+ self.scale = self.head_dim**-0.5
+
+ self.qkv_proj = QKVParallelLinear(
+ hidden_size,
+ self.head_dim,
+ self.total_num_heads,
+ self.total_num_kv_heads,
+ bias=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
+ )
+ self.out_proj = RowParallelLinear(
+ input_size=hidden_size,
+ output_size=hidden_size,
+ quant_config=quant_config,
+ prefix=f"{prefix}.out_proj",
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> torch.Tensor:
+ batch_size, seq_length, embed_dim = hidden_states.shape
+
+ qkv_states, _ = self.qkv_proj(hidden_states)
+ queries, keys, values = qkv_states.chunk(3, dim=-1)
+
+ queries = queries.view(seq_length, self.num_heads, self.head_dim)
+ keys = keys.view(seq_length, self.num_heads, self.head_dim)
+ values = values.view(seq_length, self.num_heads, self.head_dim)
+
+ if rope_emb is not None:
+ cos, sin = rope_emb
+ queries, keys = apply_rotary_pos_emb_flashatt(
+ queries.unsqueeze(0), keys.unsqueeze(0), cos, sin
+ )
+ queries = queries.squeeze(0)
+ keys = keys.squeeze(0)
+
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+
+ attn_output = flash_attn_varlen_func(
+ queries,
+ keys,
+ values,
+ cu_seqlens_q=cu_seqlens,
+ cu_seqlens_k=cu_seqlens,
+ max_seqlen_q=max_seqlen,
+ max_seqlen_k=max_seqlen,
+ ).reshape(seq_length, -1)
+
+ output, _ = self.out_proj(attn_output)
+ return output
+
+ class SigLIPRotaryEmbedding(nn.Module):
+
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
+ super().__init__()
+ self.dim = dim
+ self.theta = theta
+ self.rope_init()
+
+ def rope_init(self):
+ inv_freq = 1.0 / (
+ self.theta
+ ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ def forward(self, seqlen: int) -> torch.Tensor:
+ seq = torch.arange(
+ seqlen,
+ device=self.inv_freq.device,
+ dtype=self.inv_freq.dtype,
+ )
+ freqs = torch.outer(seq, self.inv_freq)
+ return freqs
+
+ class SiglipMLP(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+
+ self.config = config
+ self.activation_fn = get_act_fn(config.hidden_act)
+ if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
+ quantizable = True
+ else:
+ quantizable = (
+ config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+ )
+ self.fc1 = ColumnParallelLinear(
+ config.hidden_size,
+ config.intermediate_size,
+ quant_config=quant_config if quantizable else None,
+ prefix=f"{prefix}.fc1",
+ )
+ self.fc2 = RowParallelLinear(
+ config.intermediate_size,
+ config.hidden_size,
+ quant_config=quant_config if quantizable else None,
+ prefix=f"{prefix}.fc2",
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states, _ = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states, _ = self.fc2(hidden_states)
+ return hidden_states
+
+ class SiglipEncoderLayer(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.self_attn = SiglipAttention(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.self_attn",
+ )
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = SiglipMLP(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp",
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> Tuple[torch.FloatTensor]:
+
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states = self.self_attn(
+ hidden_states=hidden_states,
+ cu_seqlens=cu_seqlens,
+ rope_emb=rope_emb,
+ )
+
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+
+ hidden_states = residual + hidden_states
+
+ return hidden_states
+
+ class SiglipEncoder(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ num_heads = config.num_attention_heads
+ head_dim = embed_dim // num_heads
+ self.layers = nn.ModuleList(
+ [
+ SiglipEncoderLayer(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.layers.{layer_idx}",
+ )
+ for layer_idx in range(config.num_hidden_layers)
+ ]
+ )
+ self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+
+ @staticmethod
+ def flatten_list(image_grid_thw):
+ tmp_image_grid_thw = list()
+ for image_grid in image_grid_thw:
+ if isinstance(image_grid, list):
+ tmp_image_grid_thw.extend(image_grid)
+ else:
+ tmp_image_grid_thw.append(image_grid)
+ return tmp_image_grid_thw
+
+ def forward(
+ self,
+ inputs_embeds,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ height_position_ids: Optional[torch.Tensor] = None,
+ width_position_ids: Optional[torch.Tensor] = None,
+ ) -> BaseModelOutput:
+ device = inputs_embeds.device
+ hidden_states = inputs_embeds
+ flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+
+ if width_position_ids is None or height_position_ids is None:
+ split_hids = list()
+ split_wids = list()
+ for t, h, w in flatten_image_grid_thw:
+ image_pids = torch.arange(t * h * w, device=device) % (h * w)
+ sample_hids = image_pids // w
+ sample_wids = image_pids % w
+ split_hids.append(sample_hids)
+ split_wids.append(sample_wids)
+ width_position_ids = torch.concat(split_wids, dim=0)
+ height_position_ids = torch.concat(split_hids, dim=0)
+
+ pids = torch.stack(
+ [height_position_ids, width_position_ids],
+ dim=-1,
+ )
+ max_grid_size = pids.max() + 1
+ rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+ rope_emb = rope_emb_max_grid[pids].flatten(1)
+ rope_emb = rope_emb.repeat(1, 2)
+ rope_emb = (rope_emb.cos(), rope_emb.sin())
+
+ attn_cu_seqlens = cu_seqlens
+ hidden_states = inputs_embeds
+
+ for encoder_layer in self.layers:
+ hidden_states = encoder_layer(
+ hidden_states,
+ cu_seqlens=attn_cu_seqlens,
+ rope_emb=rope_emb,
+ )
+ return hidden_states
+
+ class SiglipVisionTransformer(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = SiglipVisionEmbeddings(config)
+ self.encoder = SiglipEncoder(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.encoder",
+ )
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ pixel_values,
+ interpolate_pos_encoding: Optional[bool] = False,
+ position_ids: Optional[torch.Tensor] = None,
+ height_position_ids: Optional[torch.Tensor] = None,
+ width_position_ids: Optional[torch.Tensor] = None,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ ) -> BaseModelOutputWithPooling:
+
+ hidden_states = self.embeddings(
+ pixel_values,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ image_grid_thw=image_grid_thw,
+ )
+
+ last_hidden_state = self.encoder(
+ inputs_embeds=hidden_states,
+ cu_seqlens=cu_seqlens,
+ image_grid_thw=image_grid_thw,
+ height_position_ids=height_position_ids,
+ width_position_ids=width_position_ids,
+ )
+
+ last_hidden_state = self.post_layernorm(last_hidden_state)
+
+ sample_hidden_state = list()
+ if cu_seqlens is None:
+ raise ValueError(
+ "cu_seqlens cannot be None for "
+ "SiglipVisionTransformer output processing."
+ )
+ for i in range(cu_seqlens.shape[0] - 1):
+ start = cu_seqlens[i]
+ end = cu_seqlens[i + 1]
+ tensor = last_hidden_state[:, start:end, :].squeeze(0)
+ sample_hidden_state.append(tensor)
+
+ return sample_hidden_state
+
+ class SiglipVisionModel(nn.Module):
+ config_class = "PaddleOCRVisionConfig"
+ main_input_name = "pixel_values"
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+
+ self.vision_model = SiglipVisionTransformer(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.vision_model",
+ )
+ self.quant_config = quant_config
+
+ @property
+ def dtype(self) -> torch.dtype:
+ return self.vision_model.embeddings.patch_embedding.weight.dtype
+
+ @property
+ def device(self) -> torch.device:
+ return self.vision_model.embeddings.patch_embedding.weight.device
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ def forward(
+ self,
+ pixel_values,
+ interpolate_pos_encoding: bool = False,
+ position_ids: Optional[torch.Tensor] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ ) -> BaseModelOutputWithPooling:
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ image_grid_thw=image_grid_thw,
+ cu_seqlens=cu_seqlens,
+ )
+
+ class PaddleOCRVLForConditionalGeneration(Ernie4_5_ForCausalLM):
+
+ def __init__(self, *, config, quant_config=None, prefix: str = ""):
+ super().__init__(config=config, prefix=prefix)
+ config = self.config
+
+ self.mlp_AR = Projector(config, config.vision_config)
+ self.visual = SiglipVisionModel(config=config.vision_config)
+ if not hasattr(self.model, "get_input_embeddings"):
+ import types
+
+ self.model.get_input_embeddings = types.MethodType(
+ get_input_embeddings, self.model
+ )
+ self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling
+
+ def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+ pattern = MultiModalityDataPaddingPatternMultimodalTokens()
+ return pattern.pad_input_tokens(input_ids, mm_inputs)
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def encode_image(self, pixel_values, image_grid_thw):
+ pixel_values = pixel_values.type(self.visual.dtype)
+ siglip_position_ids = list()
+ image_grid_hws = list()
+ cu_seqlens = [0]
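+ # Cumulative patch counts per image; passed as `cu_seqlens` to the varlen
+ # flash-attention kernel so attention stays within each image.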
+
+ for idx, thw in enumerate(image_grid_thw):
+ thw_tuple = tuple(thw.detach().cpu().numpy().tolist())
+ numel = np.prod(thw_tuple)
+ image_grid_hws.append(thw_tuple)
+ image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
+ siglip_position_ids.append(image_position_ids)
+ cu_seqlens.append(cu_seqlens[-1] + numel)
+
+ siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+ pixel_values.device
+ )
+ cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
+ pixel_values.device
+ )
+ vision_outputs = self.visual(
+ pixel_values=pixel_values,
+ image_grid_thw=image_grid_hws,
+ position_ids=siglip_position_ids,
+ interpolate_pos_encoding=True,
+ cu_seqlens=cu_seqlens,
+ )
+ image_embeds = self.mlp_AR(vision_outputs, image_grid_thw)
+ image_embeds = torch.stack(image_embeds, dim=0)
+
+ return image_embeds
+
+ def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+ pixel_values = torch.cat([item.feature for item in items], dim=0).type(
+ self.visual.dtype
+ )
+ image_grid_thw = torch.concat(
+ [item.image_grid_thw for item in items], dim=0
+ )
+ image_embeds = self.encode_image(pixel_values, image_grid_thw)
+
+ return image_embeds
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ forward_batch: ForwardBatch,
+ get_embedding: bool = False,
+ ):
+ if self.is_mrope_enabled:
+ positions = forward_batch.mrope_positions
+ if not (
+ forward_batch.forward_mode.is_decode()
+ or not forward_batch.contains_image_inputs()
+ ):
+ if self.is_mrope_enabled:
+ assert positions.ndim == 2 and positions.size(0) == 3, (
+ "multimodal section rotary embedding requires "
+ f"(3, seq_len) positions, but got {positions.size()}"
+ )
+
+ hidden_states = general_mm_embed_routine(
+ input_ids=input_ids,
+ forward_batch=forward_batch,
+ language_model=self.model,
+ multimodal_model=self,
+ positions=positions,
+ )
+
+ return self.logits_processor(
+ input_ids, hidden_states, self.lm_head, forward_batch
+ )
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+ stacked_params_mapping = [
+ # (param_name, weight_name, shard_id)
+ (".qkv_proj", ".q_proj", "q"),
+ (".qkv_proj", ".k_proj", "k"),
+ (".qkv_proj", ".v_proj", "v"),
+ (".gate_up_proj", ".gate_proj", 0),
+ (".gate_up_proj", ".up_proj", 1),
+ ]
+ params_dict = dict(self.named_parameters())
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if "head.attention" in name or "head.layernorm" in name:
+ continue
+ if "head.mlp" in name or "head.probe" in name:
+ continue
+
+ for param_name, weight_name, shard_id in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ if name in params_dict.keys():
+ param = params_dict[name]
+ weight_loader = getattr(
+ param, "weight_loader", default_weight_loader
+ )
+ weight_loader(param, loaded_weight)
+ else:
+ raise KeyError(f"Parameter '{name}' not found in model.")
+
+ # Monkey patch: bound to `self.model` in `__init__` above when the
+ # underlying sglang language model does not define `get_input_embeddings`.
+ def get_input_embeddings(self) -> nn.Embedding:
+ return self.embed_tokens
diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/processor.py b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/processor.py
new file mode 100644
index 0000000000..ee7f81a10f
--- /dev/null
+++ b/paddlex/inference/genai/models/paddleocr_vl_09b/_sglang/processor.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+
+from ......utils.deps import is_dep_available
+
+if all(map(is_dep_available, ("sglang", "torch"))):
+ import asyncio
+ import math
+
+ import torch
+ from PIL import Image
+ from sglang.srt.multimodal.processors.base_processor import (
+ BaseMultimodalProcessor,
+ MultimodalSpecialTokens,
+ )
+
+ def smart_resize(
+ height: int,
+ width: int,
+ factor: int = 28,
+ min_pixels: int = 28 * 28 * 130,
+ max_pixels: int = 28 * 28 * 1280,
+ ):
+ """Rescales the image so that the following conditions are met:
+
+ 1. Both dimensions (height and width) are divisible by 'factor'.
+
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+ 3. The aspect ratio of the image is maintained as closely as possible.
+
+ """
+ if height < factor:
+ print(
+ f"smart_resize: height={height} < factor={factor}, reset height=factor"
+ )
+ width = round((width * factor) / height)
+ height = factor
+
+ if width < factor:
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
+ height = round((height * factor) / width)
+ width = factor
+
+ if max(height, width) / min(height, width) > 200:
+ raise ValueError(
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+ )
+ h_bar = round(height / factor) * factor
+ w_bar = round(width / factor) * factor
+ if h_bar * w_bar > max_pixels:
+ beta = math.sqrt((height * width) / max_pixels)
+ h_bar = math.floor(height / beta / factor) * factor
+ w_bar = math.floor(width / beta / factor) * factor
+ elif h_bar * w_bar < min_pixels:
+ beta = math.sqrt(min_pixels / (height * width))
+ h_bar = math.ceil(height * beta / factor) * factor
+ w_bar = math.ceil(width * beta / factor) * factor
+ return h_bar, w_bar
+
+ def resize_image(image, min_pixels, max_pixels, factor) -> Image.Image:
+ width, height = image.size
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=factor,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ )
+ image = image.resize((resized_width, resized_height))
+ return image
+
+ async def resize_image_async(image, min_pixels, max_pixels, factor):
+ return resize_image(image, min_pixels, max_pixels, factor)
+
+ class PaddleOCRVLImageProcessor(BaseMultimodalProcessor):
+
+ def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+ super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+ image_processor_config = _processor.image_processor
+ self.MIN_PIXELS = image_processor_config.min_pixels
+ self.MAX_PIXELS = image_processor_config.max_pixels
+ self.IMAGE_FACTOR = (
+ image_processor_config.patch_size * image_processor_config.merge_size
+ )
+
+ self.vision_start_token_id = hf_config.vision_start_token_id
+ self.mm_tokens = MultimodalSpecialTokens(
+ image_token="<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>",
+ image_token_id=hf_config.image_token_id,
+ video_token_id=hf_config.video_token_id,
+ ).build(_processor)
+
+ async def process_mm_data_async(
+ self,
+ image_data: List[Union[str, bytes]],
+ input_text,
+ request_obj,
+ *args,
+ **kwargs,
+ ):
+ base_output = self.load_mm_data(
+ prompt=input_text,
+ image_data=image_data,
+ multimodal_tokens=self.mm_tokens,
+ )
+
+ if base_output.images and isinstance(base_output.images[0], Image.Image):
+ resize_tasks = [
+ resize_image_async(
+ image, self.MIN_PIXELS, self.MAX_PIXELS, self.IMAGE_FACTOR
+ )
+ for image in base_output.images
+ ]
+ base_output.images = await asyncio.gather(*resize_tasks)
+
+ mm_items, input_ids, ret = self.process_and_combine_mm_data(
+ base_output, self.mm_tokens
+ )
+
+ input_ids = input_ids.flatten()
+ mrope_positions, mrope_position_delta = self.get_rope_index(
+ spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
+ image_token_id=self.mm_tokens.image_token_id,
+ video_token_id=self.mm_tokens.video_token_id,
+ vision_start_token_id=self.vision_start_token_id,
+ model_type=self.hf_config.model_type,
+ tokens_per_second=getattr(
+ self.hf_config.vision_config, "tokens_per_second", None
+ ),
+ input_ids=input_ids.unsqueeze(0),
+ image_grid_thw=getattr(ret, "image_grid_thw", None),
+ )
+ mrope_positions = mrope_positions.squeeze(1)
+
+ return {
+ "mm_items": mm_items,
+ "input_ids": input_ids.tolist(),
+ "im_token_id": self.mm_tokens.image_token_id,
+ "mrope_positions": mrope_positions,
+ "mrope_position_delta": mrope_position_delta,
+ }
+
+ @staticmethod
+ def get_rope_index(
+ spatial_merge_size: int,
+ image_token_id: int,
+ video_token_id: int,
+ vision_start_token_id: int,
+ model_type: str,
+ tokens_per_second: Optional[int] = None,
+ input_ids: Optional[torch.LongTensor] = None,
+ image_grid_thw: Optional[torch.LongTensor] = None,
+ video_grid_thw: Optional[torch.LongTensor] = None,
+ second_per_grid_ts: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ mrope_position_deltas = []
+ if input_ids is not None and (
+ image_grid_thw is not None or video_grid_thw is not None
+ ):
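+ # Build 3-row (temporal, height, width) position ids: text tokens advance
+ # all three axes together, while vision tokens take their grid coordinates
+ # offset by the length of the preceding text.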
+ total_input_ids = input_ids
+ position_ids = torch.ones(
+ 3,
+ input_ids.shape[0],
+ input_ids.shape[1],
+ dtype=input_ids.dtype,
+ device=input_ids.device,
+ )
+ image_index, video_index = 0, 0
+ for i, input_ids in enumerate(total_input_ids):
+ image_nums, video_nums = 0, 0
+ vision_start_indices = torch.argwhere(
+ input_ids == vision_start_token_id
+ ).squeeze(1)
+ vision_tokens = input_ids[vision_start_indices + 1]
+ image_nums = (vision_tokens == image_token_id).sum()
+ video_nums = (vision_tokens == video_token_id).sum()
+ input_tokens = input_ids.tolist()
+ llm_pos_ids_list: list = []
+ st = 0
+ remain_images, remain_videos = image_nums, video_nums
+ for _ in range(image_nums + video_nums):
+ if image_token_id in input_tokens and remain_images > 0:
+ ed_image = input_tokens.index(image_token_id, st)
+ else:
+ ed_image = len(input_tokens) + 1
+ if video_token_id in input_tokens and remain_videos > 0:
+ ed_video = input_tokens.index(video_token_id, st)
+ else:
+ ed_video = len(input_tokens) + 1
+ if ed_image < ed_video:
+ t, h, w = (
+ image_grid_thw[image_index][0],
+ image_grid_thw[image_index][1],
+ image_grid_thw[image_index][2],
+ )
+ second_per_grid_t = 0
+ image_index += 1
+ remain_images -= 1
+ ed = ed_image
+ else:
+ t, h, w = (
+ video_grid_thw[video_index][0],
+ video_grid_thw[video_index][1],
+ video_grid_thw[video_index][2],
+ )
+ if second_per_grid_ts is not None:
+ second_per_grid_t = second_per_grid_ts[video_index]
+ else:
+ second_per_grid_t = 1.0
+ video_index += 1
+ remain_videos -= 1
+ ed = ed_video
+ llm_grid_t, llm_grid_h, llm_grid_w = (
+ t.item(),
+ h.item() // spatial_merge_size,
+ w.item() // spatial_merge_size,
+ )
+ text_len = ed - st
+
+ st_idx = (
+ llm_pos_ids_list[-1].max() + 1
+ if len(llm_pos_ids_list) > 0
+ else 0
+ )
+ llm_pos_ids_list.append(
+ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+ )
+
+ range_tensor = torch.arange(llm_grid_t).view(-1, 1)
+ expanded_range = range_tensor.expand(
+ -1, llm_grid_h * llm_grid_w
+ )
+
+ time_tensor = (
+ expanded_range * second_per_grid_t * tokens_per_second
+ )
+
+ time_tensor_long = time_tensor.long()
+ t_index = time_tensor_long.flatten()
+
+ h_index = (
+ torch.arange(llm_grid_h)
+ .view(1, -1, 1)
+ .expand(llm_grid_t, -1, llm_grid_w)
+ .flatten()
+ )
+ w_index = (
+ torch.arange(llm_grid_w)
+ .view(1, 1, -1)
+ .expand(llm_grid_t, llm_grid_h, -1)
+ .flatten()
+ )
+ llm_pos_ids_list.append(
+ torch.stack([t_index, h_index, w_index]) + text_len + st_idx
+ )
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+ if st < len(input_tokens):
+ st_idx = (
+ llm_pos_ids_list[-1].max() + 1
+ if len(llm_pos_ids_list) > 0
+ else 0
+ )
+ text_len = len(input_tokens) - st
+ llm_pos_ids_list.append(
+ torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+ )
+
+ llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+ position_ids[..., i, :] = llm_positions.to(position_ids.device)
+ mrope_position_deltas.append(
+ llm_positions.max() + 1 - len(total_input_ids[i])
+ )
+ mrope_position_deltas = torch.tensor(
+ mrope_position_deltas, device=input_ids.device
+ ).unsqueeze(1)
+ return position_ids, mrope_position_deltas
+ else:
+ s = input_ids.shape[1]
+ position_ids = torch.arange(s)
+ position_ids = (
+ position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device)
+ )
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+ -1, keepdim=True
+ )[0]
+ mrope_position_deltas = max_position_ids + 1 - s
+ return position_ids, mrope_position_deltas
diff --git a/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py b/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py
new file mode 100644
index 0000000000..b95da3be85
--- /dev/null
+++ b/paddlex/inference/genai/models/paddleocr_vl_09b/_vllm.py
@@ -0,0 +1,1214 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from collections.abc import Iterable, Mapping, Sequence
+from functools import partial
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+
+from .....utils.deps import is_dep_available
+
+if all(
+ map(is_dep_available, ("einops", "torch", "transformers", "vllm", "flash-attn"))
+):
+ import torch
+ import torch.nn as nn
+ from einops import rearrange, repeat
+ from transformers import BatchFeature
+ from transformers.activations import GELUActivation
+ from transformers.modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+ )
+ from transformers.utils import torch_int
+ from vllm.compilation.decorators import support_torch_compile
+ from vllm.config import VllmConfig
+ from vllm.distributed import get_tensor_model_parallel_world_size
+ from vllm.model_executor.layers.activation import get_act_fn
+ from vllm.model_executor.layers.linear import (
+ ColumnParallelLinear,
+ QKVParallelLinear,
+ RowParallelLinear,
+ )
+ from vllm.model_executor.layers.logits_processor import LogitsProcessor
+ from vllm.model_executor.layers.quantization import QuantizationConfig
+ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+ from vllm.model_executor.model_loader.weight_utils import (
+ default_weight_loader,
+ maybe_remap_kv_scale_name,
+ )
+ from vllm.model_executor.models.vision import get_vit_attn_backend
+ from vllm.platforms import _Backend, current_platform
+
+ try:
+ from vllm.model_executor.models.ernie45 import Ernie4_5_ForCausalLM
+ except ImportError:
+ from vllm.model_executor.models.ernie45 import (
+ Ernie4_5ForCausalLM as Ernie4_5_ForCausalLM,
+ )
+ from vllm.model_executor.models.interfaces import SupportsMultiModal
+ from vllm.model_executor.models.utils import (
+ AutoWeightsLoader,
+ PPMissingLayer,
+ is_pp_missing_parameter,
+ merge_multimodal_embeddings,
+ )
+ from vllm.multimodal import MULTIMODAL_REGISTRY
+ from vllm.multimodal.inputs import (
+ MultiModalDataDict,
+ MultiModalFieldConfig,
+ MultiModalKwargs,
+ NestedTensors,
+ )
+ from vllm.multimodal.parse import (
+ ImageProcessorItems,
+ ImageSize,
+ MultiModalDataItems,
+ )
+ from vllm.multimodal.processing import (
+ BaseMultiModalProcessor,
+ BaseProcessingInfo,
+ PromptReplacement,
+ PromptUpdate,
+ )
+ from vllm.multimodal.profiling import BaseDummyInputsBuilder
+ from vllm.sequence import IntermediateTensors
+
+ def smart_resize(
+ height: int,
+ width: int,
+ factor: int = 28,
+ min_pixels: int = 28 * 28 * 130,
+ max_pixels: int = 28 * 28 * 1280,
+ ):
+ """Rescales the image so that the following conditions are met:
+
+ 1. Both dimensions (height and width) are divisible by 'factor'.
+
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+ 3. The aspect ratio of the image is maintained as closely as possible.
+
+ """
+ # if height < factor or width < factor:
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+ # if int(height < factor//4) + int(width < factor//4):
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
+
+ if height < factor:
+ print(
+ f"smart_resize: height={height} < factor={factor}, reset height=factor"
+ )
+ width = round((width * factor) / height)
+ height = factor
+
+ if width < factor:
+ print(f"smart_resize: width={width} < factor={factor}, reset width=factor")
+ height = round((height * factor) / width)
+ width = factor
+
+ if max(height, width) / min(height, width) > 200:
+ raise ValueError(
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+ )
+ h_bar = round(height / factor) * factor
+ w_bar = round(width / factor) * factor
+ if h_bar * w_bar > max_pixels:
+ beta = math.sqrt((height * width) / max_pixels)
+ h_bar = math.floor(height / beta / factor) * factor
+ w_bar = math.floor(width / beta / factor) * factor
+ elif h_bar * w_bar < min_pixels:
+ beta = math.sqrt(min_pixels / (height * width))
+ h_bar = math.ceil(height * beta / factor) * factor
+ w_bar = math.ceil(width * beta / factor) * factor
+ return h_bar, w_bar
+
+ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
+
+ def get_hf_config(self):
+ return self.ctx.get_hf_config()
+
+ def get_hf_processor(self, **kwargs: object):
+ return self.ctx.get_hf_processor(**kwargs)
+
+ def get_image_processor(self, **kwargs: object):
+ return self.get_hf_processor(**kwargs).image_processor
+
+ def get_supported_mm_limits(self):
+ return {"image": None}
+
+ def get_num_image_tokens(
+ self,
+ *,
+ image_width: int,
+ image_height: int,
+ image_processor,
+ ) -> int:
+ if image_processor is None:
+ image_processor = self.get_image_processor()
+
+ do_resize = True
+ hf_config = self.get_hf_config()
+ vision_config = hf_config.vision_config
+ patch_size = vision_config.patch_size
+ merge_size = vision_config.spatial_merge_size
+
+ if do_resize:
+ resized_height, resized_width = smart_resize(
+ height=image_height,
+ width=image_width,
+ factor=patch_size * merge_size,
+ min_pixels=image_processor.min_pixels,
+ max_pixels=image_processor.max_pixels,
+ )
+ preprocessed_size = ImageSize(
+ width=resized_width, height=resized_height
+ )
+ else:
+ preprocessed_size = ImageSize(width=image_width, height=image_height)
+
+ grid_t = 1
+ grid_h = preprocessed_size.height // patch_size
+ grid_w = preprocessed_size.width // patch_size
+
+ num_patches = grid_t * grid_h * grid_w
+ num_image_tokens = num_patches // (merge_size**2)
+
+ return num_image_tokens
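+ # Illustrative example (patch_size and merge_size values are assumed, not
+ # taken from the released config): with patch_size=14 and merge_size=2, a
+ # 1008x700 preprocessed image gives a 72x50 patch grid -> 3600 patches
+ # -> 3600 // 4 = 900 image tokens.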
+
+ def get_image_size_with_most_features(self) -> ImageSize:
+ hf_config = self.get_hf_config()
+ image_size = hf_config.vision_config.image_size
+ return ImageSize(height=image_size, width=image_size)
+
+ class PaddleOCRVLDummyInputsBuilder(
+ BaseDummyInputsBuilder[PaddleOCRVLProcessingInfo]
+ ):
+
+ def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str:
+ num_images = mm_counts.get("image", 0)
+
+ processor = self.info.get_hf_processor()
+ image_token = processor.image_token
+
+ return image_token * num_images
+
+ def get_dummy_mm_data(
+ self,
+ seq_len: int,
+ mm_counts: Mapping[str, int],
+ ) -> MultiModalDataDict:
+ num_images = mm_counts.get("image", 0)
+
+ (target_width, target_height) = (
+ self.info.get_image_size_with_most_features()
+ )
+
+ return {
+ "image": self._get_dummy_images(
+ width=target_width, height=target_height, num_images=num_images
+ )
+ }
+
+ class PaddleOCRVLMultiModalProcessor(
+ BaseMultiModalProcessor[PaddleOCRVLProcessingInfo]
+ ):
+
+ def _call_hf_processor(
+ self,
+ prompt: str,
+ mm_data: Mapping[str, object],
+ mm_kwargs: Mapping[str, object],
+ tok_kwargs: Mapping[str, object],
+ ) -> BatchFeature:
+ if mm_data:
+ processed_outputs = self.info.ctx.call_hf_processor(
+ self.info.get_hf_processor(**mm_kwargs),
+ dict(text=prompt, **mm_data),
+ dict(**mm_kwargs, **tok_kwargs),
+ )
+ processed_outputs["pixel_values"] = processed_outputs[
+ "pixel_values"
+ ].unsqueeze(0)
+ else:
+ tokenizer = self.info.get_tokenizer()
+ processed_outputs = tokenizer(
+ prompt, add_special_tokens=True, return_tensors="pt"
+ )
+ return processed_outputs
+
+ def _get_mm_fields_config(
+ self,
+ hf_inputs: BatchFeature,
+ hf_processor_mm_kwargs: Mapping[str, object],
+ ) -> Mapping[str, MultiModalFieldConfig]:
+ return dict(
+ pixel_values=MultiModalFieldConfig.batched("image"),
+ image_grid_thw=MultiModalFieldConfig.batched("image"),
+ )
+
+ def _get_prompt_updates(
+ self,
+ mm_items: MultiModalDataItems,
+ hf_processor_mm_kwargs: Mapping[str, object],
+ out_mm_kwargs: MultiModalKwargs,
+ ) -> Sequence[PromptUpdate]:
+ image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs)
+ hf_config = self.info.get_hf_config()
+ image_token_id = hf_config.image_token_id
+
+ def get_replacement(item_idx: int, image_processor):
+ images = mm_items.get_items("image", ImageProcessorItems)
+
+ image_size = images.get_image_size(item_idx)
+ num_image_tokens = self.info.get_num_image_tokens(
+ image_width=image_size.width,
+ image_height=image_size.height,
+ image_processor=image_processor,
+ )
+
+ return [image_token_id] * num_image_tokens
+
+ return [
+ PromptReplacement(
+ modality="image",
+ target=[image_token_id],
+ replacement=partial(
+ get_replacement, image_processor=image_processor
+ ),
+ ),
+ ]
+
+ class Projector(nn.Module):
+
+ def __init__(
+ self,
+ text_config,
+ vision_config,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.text_config = text_config
+ self.vision_config = vision_config
+ self.merge_kernel_size = (2, 2)
+
+ self.hidden_size = (
+ self.vision_config.hidden_size
+ * self.merge_kernel_size[0]
+ * self.merge_kernel_size[1]
+ )
+
+ self.pre_norm = torch.nn.LayerNorm(
+ self.vision_config.hidden_size, eps=1e-05
+ )
+ self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
+ self.act = GELUActivation()
+ self.linear_2 = nn.Linear(
+ self.hidden_size, self.text_config.hidden_size, bias=True
+ )
+
+ def forward(
+ self,
+ image_features: torch.Tensor,
+ image_grid_thw: List[Tuple[int, int, int]],
+ ) -> torch.Tensor:
+ m1, m2 = self.merge_kernel_size
+ if isinstance(image_features, (list, tuple)):
+ processed_features = list()
+ for image_feature, image_grid in zip(image_features, image_grid_thw):
+ image_feature = self.pre_norm(image_feature)
+ t, h, w = image_grid
+
+ image_feature = rearrange(
+ image_feature,
+ "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+ t=t,
+ h=h // m1,
+ p1=m1,
+ w=w // m2,
+ p2=m2,
+ )
+ hidden_states = self.linear_1(image_feature)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+ processed_features.append(hidden_states)
+
+ return processed_features
+
+ dims = image_features.shape[:-1]
+ dim = image_features.shape[-1]
+ image_features = image_features.view(np.prod(dims), dim)
+ hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size)
+ hidden_states = self.linear_1(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+
+ return hidden_states.view(*dims, -1)
+
+ class SiglipVisionEmbeddings(nn.Module):
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.image_size = config.image_size
+ self.patch_size = config.patch_size
+
+ self.patch_embedding = nn.Conv2d(
+ in_channels=config.num_channels,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ padding="valid",
+ )
+
+ self.num_patches = (self.image_size // self.patch_size) ** 2
+ self.num_positions = self.num_patches
+ self.cache_position_embedding = dict()
+ self.cache_position_count = dict()
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+ self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+ self.register_buffer(
+ "position_ids",
+ torch.arange(self.num_positions).expand((1, -1)),
+ persistent=False,
+ )
+
+ def interpolate_pos_encoding(
+ self,
+ embeddings: torch.Tensor,
+ height: int,
+ width: int,
+ is_after_patchify: bool = False,
+ ) -> torch.Tensor:
+
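+        # Bilinearly resize the learned square position-embedding grid to the
+        # target (new_height, new_width) grid so variable image sizes are supported.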
+ num_positions = self.position_embedding.weight.shape[0]
+
+ patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+ dim = embeddings.shape[-1]
+
+ if is_after_patchify:
+ new_height = height
+ new_width = width
+ else:
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(
+ 1, sqrt_num_positions, sqrt_num_positions, dim
+ )
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+ return patch_pos_embed
+
+ def fetch_position_embedding_lfu_cache(
+ self, embeddings, h, w, max_cache: int = 20
+ ):
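+        # Least-frequently-used cache of interpolated position embeddings, keyed by (h, w).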
+ grid = (h, w)
+ if grid in self.cache_position_embedding:
+ self.cache_position_count[grid] += 1
+ return self.cache_position_embedding[grid]
+
+ if len(self.cache_position_embedding) >= max_cache:
+ min_hit_grid = min(
+ self.cache_position_count,
+ key=self.cache_position_count.get,
+ )
+ self.cache_position_count.pop(min_hit_grid)
+ self.cache_position_embedding.pop(min_hit_grid)
+
+ position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+ self.cache_position_count[grid] = 1
+ self.cache_position_embedding[grid] = position_embedding
+ return position_embedding
+
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ position_ids: Optional[torch.Tensor] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ interpolate_pos_encoding=False,
+ ) -> torch.Tensor:
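+        # Accept either (seq, c, h, w) or packed (b, seq, c, h, w) pixel values;
+        # position embeddings are interpolated per image grid or looked up by packed position ids.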
+ if pixel_values.dim() == 4:
+ pixel_values = pixel_values.unsqueeze(0)
+ if pixel_values.dim() == 5:
+ if position_ids is None:
+ raise ValueError(
+ "position_ids cannot be None when pixel_values.dim() is 5."
+ )
+ (
+ batch_size,
+                sequence_len,
+ channel,
+ height,
+ width,
+ ) = pixel_values.shape
+ target_dtype = self.patch_embedding.weight.dtype
+ pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
+ embeddings = patch_embeds.flatten(-2).squeeze(-1)
+
+ if interpolate_pos_encoding and image_grid_thw is not None:
+ start = 0
+ tmp_embeddings = list()
+ for image_grid in image_grid_thw:
+ t, h, w = image_grid
+ end = start + t * h * w
+ image_embeddings = embeddings[start:end, :]
+ position_embedding = (
+ self.interpolate_pos_encoding(image_embeddings, h, w, True)
+ .squeeze(0)
+ .repeat(t, 1)
+ )
+ image_embeddings = image_embeddings + position_embedding
+ tmp_embeddings.append(image_embeddings)
+ start = end
+ embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0)
+ else:
+ embeddings = embeddings + self.packing_position_embedding(
+ position_ids
+ )
+ return embeddings
+ else:
+ raise ValueError(
+ "Unsupported pixel_values dimension:"
+ f" {pixel_values.dim()}. Expected 4 or 5."
+ )
+
+ def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor:
+ if not interleaved:
+ x1, x2 = x.chunk(2, dim=-1)
+ return torch.cat((-x2, x1), dim=-1)
+ else:
+ x1, x2 = x[..., ::2], x[..., 1::2]
+ return rearrange(
+ torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2
+ )
+
+ def apply_rotary_emb_torch(
+ x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
+ ) -> torch.Tensor:
+ """
+ x: (batch_size, seqlen, nheads, headdim)
+ cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2)
+ """
+ ro_dim = cos.shape[-1] * 2
+ assert ro_dim <= x.shape[-1]
+ cos = repeat(
+ cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+ )
+ sin = repeat(
+ sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)"
+ )
+ return torch.cat(
+ [
+ x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin,
+ x[..., ro_dim:],
+ ],
+ dim=-1,
+ )
+
+ def apply_rotary_pos_emb_flashatt(
+ q: torch.Tensor,
+ k: torch.Tensor,
+ cos: torch.Tensor,
+ sin: torch.Tensor,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ cos = cos.chunk(2, dim=-1)[0].contiguous()
+ sin = sin.chunk(2, dim=-1)[0].contiguous()
+
+ apply_rotary_emb = apply_rotary_emb_torch
+ if current_platform.is_cuda():
+ from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb
+
+ q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q)
+ k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k)
+ return q_embed, k_embed
+
+ class SiglipAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You
+ Need' paper."""
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+
+ hidden_size = config.hidden_size
+ self.hidden_size = config.hidden_size
+ tp_size = get_tensor_model_parallel_world_size()
+ self.total_num_heads = config.num_attention_heads
+ assert self.total_num_heads % tp_size == 0
+ self.num_heads = self.total_num_heads // tp_size
+ self.total_num_kv_heads = config.num_attention_heads
+ if self.total_num_kv_heads >= tp_size:
+ assert self.total_num_kv_heads % tp_size == 0
+ else:
+ assert tp_size % self.total_num_kv_heads == 0
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+ self.head_dim = config.hidden_size // self.total_num_heads
+ self.q_size = self.num_heads * self.head_dim
+ self.kv_size = self.num_kv_heads * self.head_dim
+ self.scale = self.head_dim**-0.5
+
+ self.qkv_proj = QKVParallelLinear(
+ hidden_size,
+ self.head_dim,
+ self.total_num_heads,
+ self.total_num_kv_heads,
+ bias=True,
+ quant_config=quant_config,
+ prefix=f"{prefix}.qkv_proj",
+ )
+ self.out_proj = RowParallelLinear(
+ input_size=hidden_size,
+ output_size=hidden_size,
+ quant_config=quant_config,
+ prefix=f"{prefix}.out_proj",
+ )
+
+ # Detect attention implementation.
+ self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True)
+ if self.attn_backend not in {
+ _Backend.FLASH_ATTN,
+ _Backend.TORCH_SDPA,
+ _Backend.XFORMERS,
+ }:
+ raise RuntimeError(
+                    f"PaddleOCR-VL does not currently support the {self.attn_backend} backend."
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> torch.Tensor:
+ batch_size, seq_length, embed_dim = hidden_states.shape
+
+ qkv_states, _ = self.qkv_proj(hidden_states)
+ q, k, v = qkv_states.chunk(3, dim=-1)
+
+ q = q.view(batch_size, seq_length, self.num_heads, self.head_dim)
+ k = k.view(batch_size, seq_length, self.num_heads, self.head_dim)
+ v = v.view(batch_size, seq_length, self.num_heads, self.head_dim)
+
+ if rope_emb is not None:
+ cos, sin = rope_emb
+ q, k = apply_rotary_pos_emb_flashatt(q, k, cos, sin)
+
+ if self.attn_backend == _Backend.FLASH_ATTN:
+ from flash_attn import flash_attn_varlen_func
+
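+            # Flatten batch and sequence dims and run variable-length flash attention
+            # over the packed sequences delimited by cu_seqlens.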
+ q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v])
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
+ output = flash_attn_varlen_func(
+ q,
+ k,
+ v,
+ cu_seqlens_q=cu_seqlens,
+ cu_seqlens_k=cu_seqlens,
+ max_seqlen_q=max_seqlen,
+ max_seqlen_k=max_seqlen,
+ )
+
+ context_layer = rearrange(output, "(b s) ... -> b s ...", b=batch_size)
+ elif self.attn_backend == _Backend.TORCH_SDPA:
+ # Execute attention entry by entry for speed & less VRAM.
+ import torch.nn.functional as F
+
+ outputs = []
+ for i in range(1, len(cu_seqlens)):
+ start_idx = cu_seqlens[i - 1]
+ end_idx = cu_seqlens[i]
+ q_i = q[:, start_idx:end_idx]
+ k_i = k[:, start_idx:end_idx]
+ v_i = v[:, start_idx:end_idx]
+ q_i, k_i, v_i = (
+ rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i]
+ )
+ output_i = F.scaled_dot_product_attention(
+ q_i, k_i, v_i, dropout_p=0.0
+ )
+ output_i = rearrange(output_i, "b h s d -> b s h d ")
+ outputs.append(output_i)
+ context_layer = torch.cat(outputs, dim=1)
+ elif self.attn_backend == _Backend.XFORMERS:
+ from xformers import ops as xops
+ from xformers.ops.fmha.attn_bias import BlockDiagonalMask
+
+ seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist()
+ attn_bias = BlockDiagonalMask.from_seqlens(
+ q_seqlen=seqlens, kv_seqlen=None, device=q.device
+ )
+
+ context_layer = xops.memory_efficient_attention_forward(
+ q, k, v, attn_bias=attn_bias, p=0, scale=None
+ )
+
+ context_layer = rearrange(
+ context_layer, "b s h d -> b s (h d)"
+ ).contiguous()
+
+ output, _ = self.out_proj(context_layer)
+ return output
+
+ class SigLIPRotaryEmbedding(nn.Module):
+
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
+ super().__init__()
+ self.dim = dim
+ self.theta = theta
+ self.rope_init()
+
+ def rope_init(self):
+ inv_freq = 1.0 / (
+ self.theta
+ ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ def forward(self, seqlen: int) -> torch.Tensor:
+ seq = torch.arange(
+ seqlen,
+ device=self.inv_freq.device,
+ dtype=self.inv_freq.dtype,
+ )
+ freqs = torch.outer(seq, self.inv_freq)
+ return freqs
+
+ class SiglipMLP(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ) -> None:
+ super().__init__()
+
+ self.config = config
+ self.activation_fn = get_act_fn(config.hidden_act)
+ # Special handling for BNB and torchao quantization
+ if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]:
+ quantizable = True
+ else:
+ # For other quantization, we require the hidden size to be a
+ # multiple of 64
+ quantizable = (
+ config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0
+ )
+ self.fc1 = ColumnParallelLinear(
+ config.hidden_size,
+ config.intermediate_size,
+ quant_config=quant_config if quantizable else None,
+ prefix=f"{prefix}.fc1",
+ )
+ self.fc2 = RowParallelLinear(
+ config.intermediate_size,
+ config.hidden_size,
+ quant_config=quant_config if quantizable else None,
+ prefix=f"{prefix}.fc2",
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states, _ = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states, _ = self.fc2(hidden_states)
+ return hidden_states
+
+ class SiglipEncoderLayer(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.self_attn = SiglipAttention(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.self_attn",
+ )
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+ self.mlp = SiglipMLP(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.mlp",
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+ ) -> Tuple[torch.FloatTensor]:
+
+ residual = hidden_states
+
+ hidden_states = self.layer_norm1(hidden_states)
+ hidden_states = self.self_attn(
+ hidden_states=hidden_states,
+ cu_seqlens=cu_seqlens,
+ rope_emb=rope_emb,
+ )
+
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+ hidden_states = self.layer_norm2(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+
+ hidden_states = residual + hidden_states
+
+ return hidden_states
+
+ class SiglipEncoder(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ num_heads = config.num_attention_heads
+ head_dim = embed_dim // num_heads
+ self.layers = nn.ModuleList(
+ [
+ SiglipEncoderLayer(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.layers.{layer_idx}",
+ )
+ for layer_idx in range(config.num_hidden_layers)
+ ]
+ )
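+        # Half of the rotary dimensions encode the row position and the other
+        # half the column position (2D rotary embedding).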
+ self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+
+ @staticmethod
+ def flatten_list(image_grid_thw):
+ tmp_image_grid_thw = list()
+ for image_grid in image_grid_thw:
+ if isinstance(image_grid, list):
+ tmp_image_grid_thw.extend(image_grid)
+ else:
+ tmp_image_grid_thw.append(image_grid)
+ return tmp_image_grid_thw
+
+ def forward(
+ self,
+ inputs_embeds,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ height_position_ids: Optional[torch.Tensor] = None,
+ width_position_ids: Optional[torch.Tensor] = None,
+ ) -> BaseModelOutput:
+ device = inputs_embeds.device
+ hidden_states = inputs_embeds
+
+ flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+
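+        # Derive per-patch (row, col) indices for each image when not provided,
+        # then build 2D rotary embeddings shared by all encoder layers.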
+ if width_position_ids is None or height_position_ids is None:
+ split_hids = list()
+ split_wids = list()
+ for t, h, w in flatten_image_grid_thw:
+ image_pids = torch.arange(t * h * w, device=device) % (h * w)
+ sample_hids = image_pids // w
+ sample_wids = image_pids % w
+ split_hids.append(sample_hids)
+ split_wids.append(sample_wids)
+ width_position_ids = torch.concat(split_wids, dim=0)
+ height_position_ids = torch.concat(split_hids, dim=0)
+
+ pids = torch.stack(
+ [height_position_ids, width_position_ids],
+ dim=-1,
+ )
+ max_grid_size = pids.max() + 1
+ rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+ rope_emb = rope_emb_max_grid[pids].flatten(1)
+ rope_emb = rope_emb.repeat(1, 2)
+ rope_emb = (rope_emb.cos(), rope_emb.sin())
+
+ attn_cu_seqlens = cu_seqlens
+ hidden_states = inputs_embeds
+
+ for encoder_layer in self.layers:
+ hidden_states = encoder_layer(
+ hidden_states,
+ cu_seqlens=attn_cu_seqlens,
+ rope_emb=rope_emb,
+ )
+ return hidden_states
+
+ class SiglipVisionTransformer(nn.Module):
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = SiglipVisionEmbeddings(config)
+ self.encoder = SiglipEncoder(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.encoder",
+ )
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ pixel_values,
+ interpolate_pos_encoding: Optional[bool] = False,
+ position_ids: Optional[torch.Tensor] = None,
+ height_position_ids: Optional[torch.Tensor] = None,
+ width_position_ids: Optional[torch.Tensor] = None,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ ) -> BaseModelOutputWithPooling:
+
+ hidden_states = self.embeddings(
+ pixel_values,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ image_grid_thw=image_grid_thw,
+ )
+
+ last_hidden_state = self.encoder(
+ inputs_embeds=hidden_states,
+ cu_seqlens=cu_seqlens,
+ image_grid_thw=image_grid_thw,
+ height_position_ids=height_position_ids,
+ width_position_ids=width_position_ids,
+ )
+
+ last_hidden_state = self.post_layernorm(last_hidden_state)
+
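+        # Split the packed hidden states back into per-image tensors using cu_seqlens.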
+ sample_hidden_state = list()
+ if cu_seqlens is None:
+ raise ValueError(
+ "cu_seqlens cannot be None for "
+ "SiglipVisionTransformer output processing."
+ )
+ for i in range(cu_seqlens.shape[0] - 1):
+ start = cu_seqlens[i]
+ end = cu_seqlens[i + 1]
+ tensor = last_hidden_state[:, start:end, :].squeeze(0)
+ sample_hidden_state.append(tensor)
+
+ return sample_hidden_state
+
+ class SiglipVisionModel(nn.Module):
+ config_class = "PaddleOCRVisionConfig"
+ main_input_name = "pixel_values"
+
+ def __init__(
+ self,
+ config,
+ quant_config: Optional[QuantizationConfig] = None,
+ prefix: str = "",
+ ):
+ super().__init__()
+
+ self.vision_model = SiglipVisionTransformer(
+ config,
+ quant_config=quant_config,
+ prefix=f"{prefix}.vision_model",
+ )
+ self.quant_config = quant_config
+
+ @property
+ def dtype(self) -> torch.dtype:
+ return self.vision_model.embeddings.patch_embedding.weight.dtype
+
+ @property
+ def device(self) -> torch.device:
+ return self.vision_model.embeddings.patch_embedding.weight.device
+
+ def get_input_embeddings(self) -> nn.Module:
+ return self.vision_model.embeddings.patch_embedding
+
+ def forward(
+ self,
+ pixel_values,
+ interpolate_pos_encoding: bool = False,
+ position_ids: Optional[torch.Tensor] = None,
+ image_grid_thw: Optional[
+ List[
+ Union[
+ Tuple[int, int, int],
+ List[Tuple[int, int, int]],
+ ]
+ ]
+ ] = None,
+ cu_seqlens: Optional[List[torch.Tensor]] = None,
+ ) -> BaseModelOutputWithPooling:
+
+ return self.vision_model(
+ pixel_values=pixel_values,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ image_grid_thw=image_grid_thw,
+ cu_seqlens=cu_seqlens,
+ )
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> set[str]:
+ stacked_params_mapping = [
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ]
+ params_dict = dict(self.named_parameters(remove_duplicate=False))
+ loaded_params: set[str] = set()
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if "head.attention" in name or "head.layernorm" in name:
+ continue
+ if "head.mlp" in name or "head.probe" in name:
+ continue
+ if self.quant_config is not None and (
+ scale_name := self.quant_config.get_cache_scale(name)
+ ):
+ param = params_dict[scale_name]
+ weight_loader = getattr(
+ param,
+ "weight_loader",
+ default_weight_loader,
+ )
+ loaded_weight = (
+ loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0]
+ )
+ weight_loader(param, loaded_weight)
+ loaded_params.add(scale_name)
+ continue
+ for (
+ param_name,
+ weight_name,
+ shard_id,
+ ) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ if is_pp_missing_parameter(name, self):
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ name = maybe_remap_kv_scale_name(name, params_dict)
+ if name is None:
+ continue
+ if is_pp_missing_parameter(name, self):
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(
+ param,
+ "weight_loader",
+ default_weight_loader,
+ )
+ weight_loader(param, loaded_weight)
+ loaded_params.add(name)
+ return loaded_params
+
+ @MULTIMODAL_REGISTRY.register_processor(
+ PaddleOCRVLMultiModalProcessor,
+ info=PaddleOCRVLProcessingInfo,
+ dummy_inputs=PaddleOCRVLDummyInputsBuilder,
+ )
+ @support_torch_compile(
+ # set dynamic_arg_dims to support mrope
+ dynamic_arg_dims={
+ "input_ids": 0,
+ "positions": -1,
+ "intermediate_tensors": 0,
+ "inputs_embeds": 0,
+ }
+ )
+ class PaddleOCRVLForConditionalGeneration(Ernie4_5_ForCausalLM, SupportsMultiModal):
+
+ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+ super().__init__(vllm_config=vllm_config, prefix=prefix)
+ config = self.config
+
+ self.mlp_AR = Projector(config, config.vision_config)
+ self.visual = SiglipVisionModel(config=config.vision_config)
+ self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+ self.logits_processor = LogitsProcessor(config.vocab_size)
+
+ for layer in self.model.layers:
+ if not isinstance(layer, PPMissingLayer):
+ layer.self_attn.rotary_emb.is_neox_style = True
+
+ def compute_logits(
+ self,
+ hidden_states: torch.Tensor,
+ sampling_metadata,
+ ) -> Optional[torch.Tensor]:
+ logits = self.logits_processor(
+ self.lm_head, hidden_states, sampling_metadata
+ )
+ return logits
+
+ @property
+ def language_model(self):
+ return self.model
+
+ def forward(
+ self,
+ input_ids: torch.Tensor,
+ positions: torch.Tensor,
+ intermediate_tensors: Optional[IntermediateTensors] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ **kwargs,
+ ):
+ if intermediate_tensors is not None:
+ inputs_embeds = None
+
+ elif inputs_embeds is None:
+ vision_embeddings = self.get_multimodal_embeddings(**kwargs)
+ inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings)
+ input_ids = None
+
+ return self.language_model(
+ input_ids, positions, intermediate_tensors, inputs_embeds
+ )
+
+ @classmethod
+ def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]:
+ if modality.startswith("image"):
+ return "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
+
+ raise ValueError("Only image modality is supported")
+
+ def encode_image(self, pixel_values, image_grid_thw):
+ pixel_values = pixel_values.type(self.visual.dtype)
+ siglip_position_ids = list()
+ image_grid_hws = list()
+ cu_seqlens = [0]
+
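+        # Pack all images into a single sequence; cu_seqlens records the
+        # cumulative patch counts that mark per-image boundaries.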
+ for idx, thw in enumerate(image_grid_thw):
+ thw_tuple = tuple(thw.detach().cpu().numpy().tolist())
+ numel = np.prod(thw_tuple)
+ image_grid_hws.append(thw_tuple)
+ image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:])
+ siglip_position_ids.append(image_position_ids)
+ cu_seqlens.append(cu_seqlens[-1] + numel)
+
+ siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to(
+ pixel_values.device
+ )
+ cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to(
+ pixel_values.device
+ )
+
+ vision_outputs = self.visual(
+ pixel_values=pixel_values,
+ image_grid_thw=image_grid_hws,
+ position_ids=siglip_position_ids,
+ interpolate_pos_encoding=True,
+ cu_seqlens=cu_seqlens,
+ )
+ image_embeds = self.mlp_AR(vision_outputs, image_grid_thw)
+
+ return image_embeds
+
+ def get_multimodal_embeddings(self, **kwargs):
+ pixel_values = kwargs["pixel_values"]
+ image_grid_thw = kwargs["image_grid_thw"]
+
+ multimodal_embeddings = []
+ for pv, ig in zip(pixel_values, image_grid_thw):
+ if pv is not None:
+ image_embeds = self.encode_image(pv, ig)
+ multimodal_embeddings += image_embeds
+
+ return multimodal_embeddings
+
+ def get_input_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ multimodal_embeddings: Optional[NestedTensors] = None,
+ ) -> torch.Tensor:
+ inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+ if multimodal_embeddings is not None and len(multimodal_embeddings) != 0:
+ inputs_embeds = merge_multimodal_embeddings(
+ input_ids,
+ inputs_embeds,
+ multimodal_embeddings,
+ self.config.image_token_id,
+ )
+
+ return inputs_embeds
+
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> set[str]:
+
+ loader = AutoWeightsLoader(self)
+ autoloaded_weights = loader.load_weights(weights)
+ return autoloaded_weights
diff --git a/paddlex/inference/genai/server.py b/paddlex/inference/genai/server.py
new file mode 100644
index 0000000000..da4cba5957
--- /dev/null
+++ b/paddlex/inference/genai/server.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import sys
+
+from ...utils import logging
+from ...utils.deps import is_genai_engine_plugin_available
+from .configs.utils import load_backend_config, update_backend_config
+from .constants import DEFAULT_BACKEND, SUPPORTED_BACKENDS
+from .models import get_chat_template_path, get_default_config, get_model_dir
+
+
+def get_arg_parser():
+ parser = argparse.ArgumentParser("PaddleX generative AI server.")
+ parser.add_argument("--model_name", type=str, required=True)
+ parser.add_argument("--model_dir", type=str)
+ parser.add_argument("--host", type=str, default="localhost")
+ parser.add_argument("--port", type=int, default=8000)
+ parser.add_argument(
+ "--backend", type=str, choices=SUPPORTED_BACKENDS, default=DEFAULT_BACKEND
+ )
+ parser.add_argument(
+ "--backend_config", type=str, help="Path to the backend configuration file."
+ )
+ return parser
+
+
+def run_genai_server(args):
+ plugin_name = f"{args.backend}-server"
+ if not is_genai_engine_plugin_available(plugin_name):
+ logging.error(
+ f"The '{plugin_name}' plugin is not available. Please install it first."
+ )
+ sys.exit(1)
+
+ if args.backend == "fastdeploy":
+ from .backends.fastdeploy import run_fastdeploy_server
+
+ run_server_func = run_fastdeploy_server
+ elif args.backend == "vllm":
+ from .backends.vllm import run_vllm_server
+
+ run_server_func = run_vllm_server
+ elif args.backend == "sglang":
+ from .backends.sglang import run_sglang_server
+
+ run_server_func = run_sglang_server
+ else:
+ raise AssertionError
+
+ if args.model_dir:
+ model_dir = args.model_dir
+ else:
+ try:
+ model_dir = get_model_dir(args.model_name, args.backend)
+ except Exception:
+ logging.error("Failed to get model directory", exc_info=True)
+ sys.exit(1)
+
+ if args.backend_config:
+ try:
+ backend_config = load_backend_config(args.backend_config)
+ except Exception:
+ logging.error(
+ f"Failed to load backend configuration from file: {args.backend_config}",
+ exc_info=True,
+ )
+ sys.exit(1)
+ else:
+ backend_config = {}
+
+ try:
+ default_config = get_default_config(args.model_name, args.backend)
+ except Exception:
+ logging.error(
+            "Failed to get the default configuration for the model", exc_info=True
+ )
+ sys.exit(1)
+ update_backend_config(
+ default_config,
+ backend_config,
+ )
+ backend_config = default_config
+
+ with get_chat_template_path(
+ args.model_name, args.backend, model_dir
+ ) as chat_template_path:
+ run_server_func(
+ args.host,
+ args.port,
+ args.model_name,
+ model_dir,
+ backend_config,
+ chat_template_path,
+ )
+
+
+def main(args=None):
+ parser = get_arg_parser()
+ args = parser.parse_args(args=args)
+ run_genai_server(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlex/inference/genai/utils.py b/paddlex/inference/genai/utils.py
new file mode 100644
index 0000000000..36ac9bbf4c
--- /dev/null
+++ b/paddlex/inference/genai/utils.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .constants import SUPPORTED_BACKENDS
+
+
+def model_name_to_module_name(model_name):
+ mod_name = model_name.lower().replace("-", "_").replace(".", "")
+ if mod_name[0].isdigit():
+ return "m_" + mod_name
+ return mod_name
+
+
+def check_backend(backend):
+ if backend not in SUPPORTED_BACKENDS:
+ raise ValueError(f"{repr(backend)} is not a supported backend.")
diff --git a/paddlex/inference/models/__init__.py b/paddlex/inference/models/__init__.py
index cc26d1de0d..40c5818aea 100644
--- a/paddlex/inference/models/__init__.py
+++ b/paddlex/inference/models/__init__.py
@@ -25,6 +25,7 @@
# from .general_recognition import ShiTuRecPredictor
from .anomaly_detection import UadPredictor
from .base import BasePredictor
+from .common.genai import GenAIConfig, need_local_model
from .doc_vlm import DocVLMPredictor
from .face_feature import FaceFeaturePredictor
from .formula_recognition import FormulaRecPredictor
@@ -55,25 +56,34 @@
def create_predictor(
model_name: str,
model_dir: Optional[str] = None,
- device=None,
+ device: Optional[str] = None,
pp_option=None,
use_hpip: bool = False,
hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None,
+ genai_config: Optional[Union[Dict[str, Any], GenAIConfig]] = None,
*args,
**kwargs,
) -> BasePredictor:
- if model_dir is None:
+ # TODO: Check if the model is a genai model
+ if genai_config is not None:
+ genai_config = GenAIConfig.model_validate(genai_config)
+
+ if need_local_model(genai_config):
+ if model_dir is None:
+ assert (
+ model_name in official_models
+            ), f"The model ({model_name}) is not supported! Please use a directory of local model files or a model name supported by PaddleX!"
+ model_dir = official_models[model_name]
+ else:
+            assert Path(model_dir).exists(), f"{model_dir} does not exist!"
+ model_dir = Path(model_dir)
+ config = BasePredictor.load_config(model_dir)
assert (
- model_name in official_models
- ), f"The model ({model_name}) is not supported! Please using directory of local model files or model name supported by PaddleX!"
- model_dir = official_models[model_name]
+ model_name == config["Global"]["model_name"]
+        ), "Model name mismatch, please provide the correct model directory."
else:
- assert Path(model_dir).exists(), f"{model_dir} is not exists!"
- model_dir = Path(model_dir)
- config = BasePredictor.load_config(model_dir)
- assert (
- model_name == config["Global"]["model_name"]
- ), f"Model name mismatch,please input the correct model dir."
+ config = None
+
return BasePredictor.get(model_name)(
model_dir=model_dir,
config=config,
@@ -81,6 +91,8 @@ def create_predictor(
pp_option=pp_option,
use_hpip=use_hpip,
hpi_config=hpi_config,
+ genai_config=genai_config,
+ model_name=model_name,
*args,
**kwargs,
)
diff --git a/paddlex/inference/models/base/predictor/base_predictor.py b/paddlex/inference/models/base/predictor/base_predictor.py
index cda676269b..db0a4323d0 100644
--- a/paddlex/inference/models/base/predictor/base_predictor.py
+++ b/paddlex/inference/models/base/predictor/base_predictor.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import warnings
from abc import ABC, abstractmethod
from copy import deepcopy
from pathlib import Path
@@ -34,8 +35,10 @@
from ....utils.benchmark import ENTRY_POINT_NAME, benchmark
from ....utils.hpi import HPIConfig, HPIInfo
from ....utils.io import YAMLReader
+from ....utils.model_paths import get_model_paths
from ....utils.pp_option import PaddlePredictorOption
from ...common import HPInfer, PaddleInfer
+from ...common.genai import GenAIClient, GenAIConfig, need_local_model
class PredictionWrap:
@@ -79,7 +82,7 @@ class BasePredictor(
def __init__(
self,
- model_dir: str,
+ model_dir: Optional[str] = None,
config: Optional[Dict[str, Any]] = None,
*,
device: Optional[str] = None,
@@ -87,11 +90,14 @@ def __init__(
pp_option: Optional[PaddlePredictorOption] = None,
use_hpip: bool = False,
hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None,
+ genai_config: Optional[GenAIConfig] = None,
+ model_name: Optional[str] = None,
) -> None:
"""Initializes the BasePredictor.
Args:
- model_dir (str): The directory where the model files are stored.
+ model_dir (Optional[str], optional): The directory where the model
+ files are stored.
config (Optional[Dict[str, Any]], optional): The model configuration
dictionary. Defaults to None.
device (Optional[str], optional): The device to run the inference
@@ -105,11 +111,43 @@ def __init__(
hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
The high-performance inference configuration dictionary.
Defaults to None.
+            genai_config (Optional[GenAIConfig], optional): The generative AI
+                configuration. Defaults to None.
+            model_name (Optional[str], optional): The model name.
+                Defaults to None.
"""
super().__init__()
- self.model_dir = Path(model_dir)
- self.config = config if config else self.load_config(self.model_dir)
+ if need_local_model(genai_config):
+ if model_dir is None:
+ raise ValueError(
+ "`model_dir` should not be `None`, as a local model is needed."
+ )
+ self.model_dir = Path(model_dir)
+ self.config = config if config else self.load_config(self.model_dir)
+ self._use_local_model = True
+ else:
+ if model_dir is not None:
+ warnings.warn("`model_dir` will be ignored, as it is not needed.")
+ self.model_dir = None
+ self.config = config
+ self._genai_config = genai_config
+ assert genai_config.server_url is not None
+ self._genai_client = GenAIClient(
+ backend=genai_config.backend,
+ base_url=genai_config.server_url,
+ max_concurrency=genai_config.max_concurrency,
+ model_name=model_name,
+ **(genai_config.client_kwargs or {}),
+ )
+ self._use_local_model = False
+
+ if model_name:
+ if self.config:
+ if self.config["Global"]["model_name"] != model_name:
+ raise ValueError("`model_name` is not consistent with `config`")
+ self._model_name = model_name
+
self.batch_sampler = self._build_batch_sampler()
self.result_class = self._get_result_class()
@@ -117,12 +155,16 @@ def __init__(
self.predict = self.__call__
self.batch_sampler.batch_size = batch_size
- self._use_hpip = use_hpip
- if not use_hpip:
- self._pp_option = self._prepare_pp_option(pp_option, device)
+
+ if self.model_dir and get_model_paths(self.model_dir, self.MODEL_FILE_PREFIX):
+ self._use_hpip = use_hpip
+ if not use_hpip:
+ self._pp_option = self._prepare_pp_option(pp_option, device)
+ else:
+ require_hpip()
+ self._hpi_config = self._prepare_hpi_config(hpi_config, device)
else:
- require_hpip()
- self._hpi_config = self._prepare_hpi_config(hpi_config, device)
+ self._use_hpip = False
logging.debug(f"{self.__class__.__name__}: {self.model_dir}")
@@ -144,7 +186,13 @@ def model_name(self) -> str:
Returns:
str: The model name.
"""
- return self.config["Global"]["model_name"]
+ if self.config:
+ return self.config["Global"]["model_name"]
+ else:
+ if hasattr(self, "_model_name"):
+ return self._model_name
+ else:
+ raise AttributeError(f"{repr(self)} has no attribute 'model_name'.")
@property
def pp_option(self) -> PaddlePredictorOption:
@@ -162,6 +210,12 @@ def hpi_config(self) -> HPIConfig:
def use_hpip(self) -> bool:
return self._use_hpip
+ @property
+ def genai_config(self) -> GenAIConfig:
+ if not hasattr(self, "_genai_config"):
+ raise AttributeError(f"{repr(self)} has no attribute 'genai_config'.")
+ return self._genai_config
+
def __call__(
self,
input: Any,
@@ -240,7 +294,6 @@ def get_hpi_info(self):
try:
return HPIInfo.model_validate(self.config["Hpi"])
except ValidationError as e:
- logging.exception("The HPI info in the model config file is invalid.")
raise RuntimeError(f"Invalid HPI info: {str(e)}") from e
def create_static_infer(self):
@@ -291,6 +344,10 @@ def process(self, batch_data: List[Any]) -> Dict[str, List[Any]]:
"""
raise NotImplementedError
+ def close(self) -> None:
+ if hasattr(self, "_genai_client"):
+ self._genai_client.close()
+
@classmethod
def get_config_path(cls, model_dir) -> str:
"""Get the path to the configuration file for the given model directory.
@@ -345,6 +402,7 @@ def _prepare_pp_option(
device_info = None
if pp_option is None:
pp_option = PaddlePredictorOption()
+
if device_info:
pp_option.device_type = device_info[0]
pp_option.device_id = device_info[1]
diff --git a/paddlex/inference/models/common/genai.py b/paddlex/inference/models/common/genai.py
new file mode 100644
index 0000000000..ce274dd394
--- /dev/null
+++ b/paddlex/inference/models/common/genai.py
@@ -0,0 +1,212 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import asyncio
+import atexit
+import concurrent.futures
+import threading
+import time
+from typing import Any, Dict, Optional
+
+from pydantic import BaseModel, model_validator
+from typing_extensions import Literal
+
+from ....utils import logging
+from ....utils.deps import class_requires_deps
+
+SERVER_BACKENDS = ["fastdeploy-server", "vllm-server", "sglang-server"]
+
+
+class GenAIConfig(BaseModel):
+ backend: Literal["native", "fastdeploy-server", "vllm-server", "sglang-server"] = (
+ "native"
+ )
+ server_url: Optional[str] = None
+ max_concurrency: int = 200
+ client_kwargs: Optional[Dict[str, Any]] = None
+
+ @model_validator(mode="after")
+ def check_server_url(self):
+ if self.backend in SERVER_BACKENDS and self.server_url is None:
+ raise ValueError(
+ f"`server_url` must not be `None` for the {repr(self.backend)} backend."
+ )
+ return self
+
+
+def need_local_model(genai_config):
+ if genai_config is not None and genai_config.backend in SERVER_BACKENDS:
+ return False
+ return True
+
+
+# TODO: Can we set the event loop externally?
+class _AsyncThreadManager:
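+    """Runs a private asyncio event loop in a daemon thread so that
+    synchronous code can submit coroutines to it."""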
+ def __init__(self):
+ self.loop = None
+ self.thread = None
+ self.stopped = False
+ self._event_start = threading.Event()
+
+ def start(self):
+ if self.is_running():
+ return
+
+ def _run_loop():
+ self.loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(self.loop)
+ self._event_start.set()
+ try:
+ self.loop.run_forever()
+ finally:
+ self.loop.close()
+ self.stopped = True
+
+ self.thread = threading.Thread(target=_run_loop, daemon=True)
+ self.thread.start()
+ self._event_start.wait()
+
+ def stop(self):
+ # TODO: Graceful shutdown
+ if not self.is_running():
+ return
+ self.loop.call_soon_threadsafe(self.loop.stop)
+ self.thread.join(timeout=1)
+ if self.thread.is_alive():
+ logging.warning("Background thread did not terminate in time")
+ self.loop = None
+ self.thread = None
+
+ def run_async(self, coro, return_future=False):
+ if not self.is_running():
+ raise RuntimeError("Event loop is not running")
+
+ future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+ return future
+
+ def is_running(self):
+ return self.loop is not None and not self.loop.is_closed() and not self.stopped
+
+
+_async_thread_manager = None
+
+
+def get_async_manager():
+ global _async_thread_manager
+ if _async_thread_manager is None:
+ _async_thread_manager = _AsyncThreadManager()
+ return _async_thread_manager
+
+
+def is_aio_loop_ready():
+ manager = get_async_manager()
+    return manager.is_running()
+
+
+def start_aio_loop():
+ manager = get_async_manager()
+ if not manager.is_running():
+ manager.start()
+ atexit.register(manager.stop)
+
+
+def close_aio_loop():
+ manager = get_async_manager()
+ if manager.is_running():
+ manager.stop()
+
+
+def run_async(coro, return_future=False, timeout=None):
+ manager = get_async_manager()
+
+ if not manager.is_running():
+ start_aio_loop()
+ time.sleep(0.1)
+
+ if not manager.is_running():
+ raise RuntimeError("Failed to start event loop")
+
+ future = manager.run_async(coro)
+
+ if return_future:
+ return future
+
+ try:
+ return future.result(timeout=timeout)
+ except concurrent.futures.TimeoutError:
+ logging.warning(f"Task timed out after {timeout} seconds")
+ raise
+ except Exception as e:
+ logging.error(f"Task failed with error: {e}")
+ raise
+
+
+@class_requires_deps("openai")
+class GenAIClient(object):
+
+ def __init__(
+ self, backend, base_url, max_concurrency=200, model_name=None, **kwargs
+ ):
+ from openai import AsyncOpenAI
+
+ super().__init__()
+
+ self.backend = backend
+ self._max_concurrency = max_concurrency
+ self._model_name = model_name
+
+ if "api_key" not in kwargs:
+ kwargs["api_key"] = "null"
+ self._client = AsyncOpenAI(base_url=base_url, **kwargs)
+
+ self._semaphore = asyncio.Semaphore(self._max_concurrency)
+
+ @property
+ def openai_client(self):
+ return self._client
+
+ def create_chat_completion(self, messages, *, return_future=False, **kwargs):
+ if self._model_name is not None:
+ model_name = self._model_name
+ else:
+ model_name = run_async(self._get_model_name(), timeout=10)
+ self._model_name = model_name
+
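+        # Bound the number of in-flight requests to the server with an asyncio semaphore.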
+ async def _create_chat_completion_with_semaphore(*args, **kwargs):
+ async with self._semaphore:
+ return await self._client.chat.completions.create(
+ *args,
+ **kwargs,
+ )
+
+ return run_async(
+ _create_chat_completion_with_semaphore(
+ model=model_name,
+ messages=messages,
+ **kwargs,
+ ),
+ return_future=return_future,
+ )
+
+ def close(self):
+ run_async(self._client.close(), timeout=5)
+
+ async def _get_model_name(self):
+ try:
+ models = await self._client.models.list()
+ except Exception as e:
+ raise RuntimeError(
+ f"Failed to get the model list from the OpenAI-compatible server: {e}"
+ ) from e
+ return models.data[0].id
diff --git a/paddlex/inference/models/common/static_infer.py b/paddlex/inference/models/common/static_infer.py
index cfd558b966..5a50fb91f7 100644
--- a/paddlex/inference/models/common/static_infer.py
+++ b/paddlex/inference/models/common/static_infer.py
@@ -436,6 +436,8 @@ def _create(
elif self._option.device_type == "dcu":
if hasattr(config, "enable_new_ir"):
config.enable_new_ir(self._option.enable_new_ir)
+ if self._option.enable_new_ir and self._option.enable_cinn:
+ config.enable_cinn()
config.enable_use_gpu(100, self._option.device_id)
config.disable_mkldnn()
if hasattr(config, "enable_new_executor"):
diff --git a/paddlex/inference/models/common/tokenizer/__init__.py b/paddlex/inference/models/common/tokenizer/__init__.py
index ff9930a519..78ec1a3d7d 100644
--- a/paddlex/inference/models/common/tokenizer/__init__.py
+++ b/paddlex/inference/models/common/tokenizer/__init__.py
@@ -15,6 +15,7 @@
from .bert_tokenizer import BertTokenizer
from .clip_tokenizer import CLIPTokenizer
from .gpt_tokenizer import GPTTokenizer
+from .llama_tokenizer import LlamaTokenizer
from .qwen2_5_tokenizer import MIXQwen2_5_Tokenizer
from .qwen2_tokenizer import MIXQwen2Tokenizer, Qwen2Tokenizer
from .qwen_tokenizer import QWenTokenizer
diff --git a/paddlex/inference/models/common/tokenizer/clip_tokenizer.py b/paddlex/inference/models/common/tokenizer/clip_tokenizer.py
index 24786709c9..d526db8db1 100644
--- a/paddlex/inference/models/common/tokenizer/clip_tokenizer.py
+++ b/paddlex/inference/models/common/tokenizer/clip_tokenizer.py
@@ -14,13 +14,13 @@
import json
-import logging
import os
import shutil
import unicodedata
from functools import lru_cache
from typing import List, Optional
+from .....utils import logging
from .tokenizer_utils import (
PretrainedTokenizer,
_is_control,
diff --git a/paddlex/inference/models/common/tokenizer/llama_tokenizer.py b/paddlex/inference/models/common/tokenizer/llama_tokenizer.py
new file mode 100644
index 0000000000..ce6d0fdf2e
--- /dev/null
+++ b/paddlex/inference/models/common/tokenizer/llama_tokenizer.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+
+from paddlex.inference.models.common.tokenizer.tokenizer_utils import (
+ PretrainedTokenizer,
+)
+
+
+class LlamaTokenizer(PretrainedTokenizer):
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
+ resource_files_names = {
+ "vocab_file": "sentencepiece.bpe.model",
+ }
+ pretrained_resource_files_map = {
+ "vocab_file": {
+ "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ "facebook/llama-13b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+ },
+ }
+
+ pretrained_init_configuration = {
+ "__internal_testing__/micro-random-llama": {},
+ "__internal_testing__/tiny-random-llama": {},
+ "facebook/llama-7b": {},
+ "facebook/llama-13b": {},
+ "facebook/llama-30b": {},
+ "facebook/llama-65b": {},
+ }
+ padding_side = "left"
+
+ def __init__(
+ self,
+ vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+ add_bos_token=True,
+ add_eos_token=False,
+ sp_model_kwargs=None,
+ decode_with_prefix_space=False,
+ **kwargs,
+ ):
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+ super().__init__(
+ bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
+ )
+
+ self.vocab_file = vocab_file
+ self.add_bos_token = add_bos_token
+ self.add_eos_token = add_eos_token
+ self.decode_with_prefix_space = decode_with_prefix_space
+ self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", True))
+
+ @property
+ def vocab_size(self):
+ """Returns vocab size"""
+ return self.sp_model.get_piece_size()
+
+ def __len__(self):
+ """
+        Returns the full vocabulary size, including added tokens that are not part of the sp_model.
+ """
+ added_size = 0
+
+ for id in self.added_tokens_decoder:
+ if id >= self.sp_model.get_piece_size():
+ added_size += 1
+
+ return self.vocab_size + added_size
+
+ @property
+ def bos_token_id(self) -> Optional[int]:
+ return self.sp_model.bos_id()
+
+ @property
+ def eos_token_id(self) -> Optional[int]:
+ return self.sp_model.eos_id()
+
+ def get_spm_processor(self, from_slow=True):
+ import sentencepiece as spm
+ from sentencepiece import sentencepiece_model_pb2 as model_pb2
+
+ tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ if from_slow: # no dependency on protobuf
+ tokenizer.Load(self.vocab_file)
+ return tokenizer
+
+ with open(self.vocab_file, "rb") as f:
+ sp_model = f.read()
+ model = model_pb2.ModelProto.FromString(sp_model)
+ normalizer_spec = model_pb2.NormalizerSpec()
+ normalizer_spec.add_dummy_prefix = False
+ model.normalizer_spec.MergeFrom(normalizer_spec)
+ sp_model = model.SerializeToString()
+ tokenizer.LoadFromSerializedProto(sp_model)
+ return tokenizer
+
+ def get_vocab(self):
+ """Returns vocab as a dict"""
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text):
+ """Returns a tokenized string."""
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.sp_model.piece_to_id(token)
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ token = self.sp_model.id_to_piece(index)
+ return token
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ current_sub_tokens = []
+ out_string = ""
+ prev_is_special = False
+ for i, token in enumerate(tokens):
+ # make sure that special tokens are not decoded using sentencepiece model
+ if token in self.all_special_tokens:
+ if not prev_is_special and i != 0:
+ out_string += " "
+ out_string += self.sp_model.decode(current_sub_tokens) + token
+ prev_is_special = True
+ current_sub_tokens = []
+ else:
+ current_sub_tokens.append(token)
+ prev_is_special = False
+ out_string += self.sp_model.decode(current_sub_tokens)
+ return out_string
+
+ def save_vocabulary(
+ self, save_directory, filename_prefix: Optional[str] = None
+ ) -> Tuple[str]:
+ """
+ Save the vocabulary and special tokens file to a directory.
+ Args:
+ save_directory (`str`):
+ The directory in which to save the vocabulary.
+ Returns:
+ `Tuple(str)`: Paths to the files saved.
+ """
+ if not os.path.isdir(save_directory):
+ raise ValueError(
+ f"Vocabulary path ({save_directory}) should be a directory"
+ )
+
+ out_vocab_file = os.path.join(
+ save_directory,
+ (filename_prefix + "-" if filename_prefix else "")
+ + self.resource_files_names["vocab_file"],
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(
+ out_vocab_file
+ ) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ if self.add_bos_token:
+ bos_token_ids = [self.bos_token_id]
+ else:
+ bos_token_ids = []
+
+ output = bos_token_ids + token_ids_0
+
+ if token_ids_1 is not None:
+ output = output + token_ids_1
+
+ if self.add_eos_token:
+ output = output + [self.eos_token_id]
+
+ return output
+
+ def get_special_tokens_mask(
+ self,
+ token_ids_0: List[int],
+ token_ids_1: Optional[List[int]] = None,
+ already_has_special_tokens: bool = False,
+ ) -> List[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0,
+ token_ids_1=token_ids_1,
+ already_has_special_tokens=True,
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+ ) -> List[int]:
+ """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. Llama does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `List[int]`: List of zeros.
+ """
+ eos = [self.eos_token_id]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + eos) * [0]
+ return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
diff --git a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
index b73b6b01c6..2da3a88ea8 100644
--- a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
+++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py
@@ -1239,12 +1239,6 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
# Strip white spaces on the left
if tok_extended.lstrip and left:
tokens[i - 1] = left.rstrip() # Opposite here
- else:
- # We strip left and right by default
- if right:
- tokens[i + 1] = right.lstrip()
- if left:
- tokens[i - 1] = left.rstrip()
# ["This is something", "", "else"]
tokenized_text = []
for token in tokens:
diff --git a/paddlex/inference/models/common/vlm/activations.py b/paddlex/inference/models/common/vlm/activations.py
index 831a686d25..56ba495328 100644
--- a/paddlex/inference/models/common/vlm/activations.py
+++ b/paddlex/inference/models/common/vlm/activations.py
@@ -156,6 +156,8 @@ def __getitem__(self, key):
"gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}),
"gelu_fast": FastGELUActivation,
"gelu_new": NewGELUActivation,
+    # HACK: map "gelu_pytorch_tanh" to NewGELUActivation (same tanh GELU approximation).
+ "gelu_pytorch_tanh": NewGELUActivation,
"gelu_python": (GELUActivation, {"use_gelu_python": True}),
"linear": LinearActivation,
"mish": MishActivation,
diff --git a/paddlex/inference/models/common/vlm/generation/configuration_utils.py b/paddlex/inference/models/common/vlm/generation/configuration_utils.py
index 83f4db0051..4f12772346 100644
--- a/paddlex/inference/models/common/vlm/generation/configuration_utils.py
+++ b/paddlex/inference/models/common/vlm/generation/configuration_utils.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import copy
import json
import os
import warnings
@@ -171,6 +172,9 @@ def __init__(self, **kwargs):
# Validate the values of the attributes
self.validate(is_init=True)
+ def to_dict(self):
+ return copy.deepcopy(self.__dict__)
+
def __eq__(self, other):
if not isinstance(other, GenerationConfig):
return False
diff --git a/paddlex/inference/models/common/vlm/generation/utils.py b/paddlex/inference/models/common/vlm/generation/utils.py
index 89e67ace62..a438c91886 100644
--- a/paddlex/inference/models/common/vlm/generation/utils.py
+++ b/paddlex/inference/models/common/vlm/generation/utils.py
@@ -894,9 +894,9 @@ def generate(
# ['是的', '嗯嗯']
"""
if generation_config is None:
- if (
- self.generation_config is None
- or self.generation_config._from_model_config
+ if self.generation_config is None or (
+ self.generation_config._from_model_config
+ and self.config._has_non_default_generation_parameters()
):
new_generation_config = GenerationConfig.from_model_config(self.config)
if new_generation_config != self.generation_config:
@@ -1097,6 +1097,8 @@ def generate(
if "logits_processors" in model_kwargs:
model_kwargs.pop("logits_processors")
+ model_kwargs["use_cache"] = generation_config.use_cache
+
stopping_criteria = (
stopping_criteria
if stopping_criteria is not None
@@ -1239,7 +1241,6 @@ def greedy_search(
synced_gpus=False,
**model_kwargs,
):
- model_kwargs["use_cache"] = model_kwargs.get("use_cache", True)
logits_processors = (
logits_processors
if logits_processors is not None
@@ -1362,7 +1363,6 @@ def sample(
synced_gpus=False,
**model_kwargs,
):
- model_kwargs["use_cache"] = model_kwargs.get("use_cache", True)
logits_processors = (
logits_processors
@@ -1751,8 +1751,6 @@ def beam_search(
synced_gpus=False,
**model_kwargs,
):
- model_kwargs["use_cache"] = model_kwargs.get("use_cache", True)
-
logits_processors = (
logits_processors
if logits_processors is not None
@@ -1958,7 +1956,6 @@ def group_beam_search(
synced_gpus=False,
**model_kwargs,
):
- model_kwargs["use_cache"] = model_kwargs.get("use_cache", True)
logits_processors = (
logits_processors
if logits_processors is not None
diff --git a/paddlex/inference/models/common/vlm/transformers/configuration_utils.py b/paddlex/inference/models/common/vlm/transformers/configuration_utils.py
index daa6a51a1d..5936c41717 100644
--- a/paddlex/inference/models/common/vlm/transformers/configuration_utils.py
+++ b/paddlex/inference/models/common/vlm/transformers/configuration_utils.py
@@ -865,9 +865,6 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
def __eq__(self, other):
return self.__dict__ == other.__dict__
- def __repr__(self):
- return f"{self.__class__.__name__} {self.to_json_string()}"
-
def to_diff_dict(self, saving_file=False) -> Dict[str, Any]:
"""
Removes all attributes from config which correspond to the default config attributes for better readability and
diff --git a/paddlex/inference/models/common/vlm/transformers/model_utils.py b/paddlex/inference/models/common/vlm/transformers/model_utils.py
index d409f5f889..26e408d586 100644
--- a/paddlex/inference/models/common/vlm/transformers/model_utils.py
+++ b/paddlex/inference/models/common/vlm/transformers/model_utils.py
@@ -18,6 +18,8 @@
import re
import warnings
from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Type, Union
import numpy as np
@@ -33,9 +35,12 @@
from paddle.nn import Layer
from ......utils import logging
+from ......utils.deps import is_dep_available, require_deps
from ...tokenizer.tokenizer_utils import InitTrackerMeta, adapt_stale_fwd_patch
from ..generation import GenerationConfig, GenerationMixin
from ..utils import (
+ ASYMMETRY_QUANT_SCALE_MAX,
+ ASYMMETRY_QUANT_SCALE_MIN,
CONFIG_NAME,
LEGACY_CONFIG_NAME,
PADDLE_WEIGHTS_INDEX_NAME,
@@ -44,6 +49,7 @@
PYTORCH_WEIGHTS_NAME,
SAFE_WEIGHTS_INDEX_NAME,
SAFE_WEIGHTS_NAME,
+ SYMMETRY_QUANT_SCALE,
device_guard,
resolve_file_path,
)
@@ -53,7 +59,6 @@
ContextManagers,
fn_args_to_dict,
get_checkpoint_shard_files,
- is_safetensors_available,
paddlenlp_load,
weight_name_suffix,
)
@@ -128,12 +133,112 @@ def _split_keys_evenly(keys: list, n: int) -> list:
return result
+def _load_part_state_dict_from_safetensors(
+ keys,
+ checkpoint_file: Union[str, os.PathLike],
+ tensor_parallel_split_mapping,
+ fliter_dict_keys,
+ device,
+ quantization_linear_list=None,
+ quantization_config=None,
+ dtype=None,
+ return_numpy=False,
+ convert_from_hf=False,
+ transpose_weight_keys=None,
+):
+ import paddle
+ from safetensors import safe_open
+
+ if transpose_weight_keys:
+ transpose_weight_keys = set(transpose_weight_keys)
+
+ def _is_need_transpose(key):
+ if "lora" not in key and convert_from_hf and transpose_weight_keys:
+ return key in transpose_weight_keys
+
+ def _transpose_hf_weight(key, weight):
+ if _is_need_transpose(key):
+ return weight.transpose([-1, -2])
+ return weight
+
+ part_state_dict = {}
+ scale_dict = {}
+ with safe_open(checkpoint_file, framework="paddle") as f:
+ for key in keys:
+            # 1. Non-merged checkpoints have no filter keys when loading.
+            # 2. Merged checkpoints skip quant scales via `fliter_dict_keys`.
+ if (
+ key.endswith(SYMMETRY_QUANT_SCALE)
+ or key.endswith(ASYMMETRY_QUANT_SCALE_MIN)
+ or key.endswith(ASYMMETRY_QUANT_SCALE_MAX)
+ ):
+ continue
+
+ if fliter_dict_keys is not None and key not in fliter_dict_keys:
+ continue
+
+ py_safe_slice_ = f.get_slice(key)
+ if (
+ quantization_linear_list is not None
+ and key.split(".weight")[0] in quantization_linear_list
+ and not key.endswith("_scale")
+ ):
+ raise NotImplementedError
+ else:
+ if key in tensor_parallel_split_mapping:
+ tp_fn = tensor_parallel_split_mapping[key]
+ if _is_need_transpose(key):
+ assert isinstance(tp_fn, partial)
+ is_column = True
+ if "is_column" in tp_fn.keywords:
+ is_column = tp_fn.keywords["is_column"]
+ is_column = not is_column
+ tp_fn = partial(
+ tp_fn.func,
+ *tp_fn.args,
+ **{**tp_fn.keywords, "is_column": is_column},
+ )
+ if len(py_safe_slice_.shape) == 0:
+ weight = tp_fn(py_safe_slice_[:])
+ else:
+ weight = tp_fn(py_safe_slice_)
+ else:
+ weight = py_safe_slice_[:]
+
+ if not return_numpy and device == "expected":
+ weight = weight._copy_to(
+ paddle.framework._current_expected_place(), False
+ )
+ weight = _transpose_hf_weight(key, weight)
+ if return_numpy:
+ weight = weight.numpy()
+ part_state_dict[key] = weight
+
+ for key in keys:
+ if (
+ key.endswith(SYMMETRY_QUANT_SCALE)
+ or key.endswith(ASYMMETRY_QUANT_SCALE_MIN)
+ or key.endswith(ASYMMETRY_QUANT_SCALE_MAX)
+ ):
+ scale = f.get_tensor(key)
+ if not return_numpy and device == "expected":
+ scale = scale._copy_to(
+ paddle.framework._current_expected_place(), False
+ )
+ if return_numpy:
+ scale = scale.numpy()
+ scale_dict[key] = scale
+ return part_state_dict, scale_dict
+
+
def load_state_dict(
checkpoint_file: Union[str, os.PathLike],
tensor_parallel_split_mapping=None,
fliter_dict_keys=None,
device="cpu",
ckpt_quant_stage="O0",
+ convert_from_hf=False,
+ transpose_weight_keys=None,
):
"""
Reads a PaddlePaddle checkpoint file, returning properly formatted errors if they arise.
@@ -142,14 +247,35 @@ def load_state_dict(
if tensor_parallel_split_mapping is None:
tensor_parallel_split_mapping = {}
- state_dict = paddlenlp_load(checkpoint_file, map_location="cpu")
+ if Path(checkpoint_file).suffix == ".safetensors":
+ require_deps("safetensors")
+ from safetensors import safe_open
+
+ with safe_open(checkpoint_file, framework="paddle") as f:
+ state_dict, scale_dict = _load_part_state_dict_from_safetensors(
+ list(f.keys()),
+ checkpoint_file,
+ tensor_parallel_split_mapping,
+ fliter_dict_keys,
+ "expected",
+ quantization_linear_list=None,
+ quantization_config=None,
+ dtype=None,
+ return_numpy=False,
+ convert_from_hf=convert_from_hf,
+ transpose_weight_keys=transpose_weight_keys,
+ )
+ else:
+ state_dict = paddlenlp_load(checkpoint_file, map_location="cpu")
return state_dict
_re_layer_prefix = re.compile(r"\.(\d+)\.")
-def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
+def _load_state_dict_into_model(
+ model_to_load, state_dict, start_prefix, convert_from_hf
+):
# torch will cast dtype in load_state_dict, but paddle strictly check dtype
_convert_state_dict_dtype_and_shape(state_dict, model_to_load)
@@ -168,6 +294,11 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
"ignore", message=r".*is not found in the provided dict.*"
)
warnings.filterwarnings("ignore", message=r".*paddle.to_tensor.*")
+ if convert_from_hf:
+ try:
+ model_to_load.set_hf_state_dict(state_dict)
+ except NotImplementedError:
+ pass
model_to_load.set_state_dict(state_dict)
error_msgs.extend([str(x.message) for x in w])
@@ -1131,6 +1262,7 @@ def _load_pretrained_model(
keep_in_fp32_modules=None,
quantization_linear_list=None,
sharded_metadata=None,
+ convert_from_hf=False,
) -> Tuple[List[str]]:
"""load the state_dict into model, and do the following things:
@@ -1148,7 +1280,13 @@ def _load_pretrained_model(
"""
is_safetensors = False
- model_state_dict = model.state_dict()
+ if convert_from_hf:
+ try:
+ model_state_dict = model.get_hf_state_dict()
+ except NotImplementedError:
+ model_state_dict = model.state_dict()
+ else:
+ model_state_dict = model.state_dict()
expected_keys = list(model_state_dict.keys())
prefix = model.base_model_prefix
@@ -1403,7 +1541,10 @@ def _fuse_or_split_keys(
)
else:
error_msgs = _load_state_dict_into_model(
- model_to_load, state_dict, start_prefix
+ model_to_load,
+ state_dict,
+ start_prefix,
+ convert_from_hf=convert_from_hf,
)
else:
# Sharded checkpoint or whole but low_cpu_mem_usage==True
@@ -1461,10 +1602,19 @@ def _fuse_or_split_keys(
if config.quantization_config.is_weight_quantize():
filter_dict_keys = None
+ try:
+ transpose_weight_keys = model.get_transpose_weight_keys()
+ except NotImplementedError:
+ if convert_from_hf:
+ raise ValueError("`convert_from_hf=True` is not supported")
+ else:
+ transpose_weight_keys = None
state_dict = load_state_dict(
shard_file,
tp_actions if pre_tensor_parallel_split else None,
filter_dict_keys,
+ convert_from_hf=convert_from_hf,
+ transpose_weight_keys=transpose_weight_keys,
)
# convert for fusing or splitting weights
@@ -1528,7 +1678,10 @@ def _fuse_or_split_keys(
error_msgs += new_error_msgs
else:
error_msgs += _load_state_dict_into_model(
- model_to_load, state_dict, start_prefix
+ model_to_load,
+ state_dict,
+ start_prefix,
+ convert_from_hf=convert_from_hf,
)
# force memory release
@@ -1544,23 +1697,15 @@ def _fuse_or_split_keys(
)
if len(unexpected_keys) > 0:
- if logging.logging.level < 20:
- logging.warning(
- f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
- f" initializing {model.__class__.__name__}: {sorted(unexpected_keys)}\n- This IS expected if you are"
- f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or"
- " with another architecture (e.g. initializing a BertForSequenceClassification model from a"
- " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
- f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical"
- " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
- )
- else:
- logging.warning(
- f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
- f" initializing the model, - This IS expected if you are"
- f" initializing the model from a checkpoint of a model trained on another task or"
- " with another architecture."
- )
+ logging.warning(
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+ f" initializing {model.__class__.__name__}: {sorted(unexpected_keys)}\n- This IS expected if you are"
+ f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or"
+ " with another architecture (e.g. initializing a BertForSequenceClassification model from a"
+ " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
+ f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical"
+ " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
+ )
else:
logging.info(
f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n"
@@ -1596,7 +1741,9 @@ def _fuse_or_split_keys(
return model, missing_keys, unexpected_keys, mismatched_keys
@classmethod
- def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+ def from_pretrained(
+ cls, pretrained_model_name_or_path, *args, convert_from_hf=False, **kwargs
+ ):
"""
Creates an instance of `PretrainedModel`. Model weights are loaded
by specifying name of a built-in pretrained model, a pretrained model from HF Hub, a community contributed model,
@@ -1663,7 +1810,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
subfolder = ""
variant = kwargs.pop("variant", None)
use_safetensors = kwargs.pop(
- "use_safetensors", None if is_safetensors_available() else False
+ "use_safetensors", None if is_dep_available("safetensors") else False
)
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False)
@@ -1739,6 +1886,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
)
)
+ init_args = config["init_args"] or ()
+ with ContextManagers(init_contexts):
+ model = cls(config, *init_args, **model_kwargs)
+
if convert_from_torch and state_dict is None:
if (
resolved_archive_file.endswith(PYTORCH_WEIGHTS_NAME)
@@ -1780,7 +1931,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
):
raise NotImplementedError
else:
- state_dict = load_state_dict(resolved_archive_file)
+ try:
+ transpose_weight_keys = model.get_transpose_weight_keys()
+ except NotImplementedError:
+ if convert_from_hf:
+ raise ValueError("`convert_from_hf=True` is not supported")
+ else:
+ transpose_weight_keys = None
+ state_dict = load_state_dict(
+ resolved_archive_file,
+ convert_from_hf=convert_from_hf,
+ transpose_weight_keys=transpose_weight_keys,
+ )
logging.info("Loaded weights file from disk, setting weights to model.")
@@ -1815,10 +1977,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
state_dict[k] = paddle.Tensor.__call__(
state_dict.pop(k), zero_copy=True
)
- # 3. init the model
- init_args = config["init_args"] or ()
- with ContextManagers(init_contexts):
- model = cls(config, *init_args, **model_kwargs)
if use_keep_in_fp32_modules:
# low_cpu_mem_usage = True
@@ -1844,6 +2002,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
keep_in_fp32_modules=keep_in_fp32_modules,
quantization_linear_list=quantization_linear_list,
sharded_metadata=sharded_metadata if is_sharded else None,
+ convert_from_hf=convert_from_hf,
)
)
@@ -2012,3 +2171,12 @@ def _generate_auto_dist_config(self, auto_dist_degree):
final_config["pp_config"] = merged_config["pp_config"]
return final_config
+
+ def get_transpose_weight_keys(self):
+ raise NotImplementedError
+
+ def get_hf_state_dict(self, *args, **kwargs):
+ raise NotImplementedError
+
+ def set_hf_state_dict(self, *args, **kwargs):
+ raise NotImplementedError
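PretrainedModel now exposes three hooks that default to NotImplementedError: get_transpose_weight_keys, get_hf_state_dict, and set_hf_state_dict. Subclasses that can load Hugging Face-layout weights override them, and from_pretrained(..., convert_from_hf=True) uses them to decide which weights to transpose and how to remap the state dict. A hedged sketch of how a subclass might opt in (the class name and keys below are hypothetical):

# Assuming PretrainedModel is importable from the module patched above.
from paddlex.inference.models.common.vlm.transformers.model_utils import PretrainedModel


class MyVLModel(PretrainedModel):  # hypothetical subclass
    def get_transpose_weight_keys(self):
        # Keys whose HF weights are stored transposed relative to the Paddle layout.
        return ["lm_head.weight"]

    def set_hf_state_dict(self, state_dict):
        # Remap HF-layout keys in place before the regular set_state_dict call.
        ...


# model = MyVLModel.from_pretrained("some/checkpoint", convert_from_hf=True)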
diff --git a/paddlex/inference/models/common/vlm/transformers/utils.py b/paddlex/inference/models/common/vlm/transformers/utils.py
index 395e8c2f65..c5f04f3362 100644
--- a/paddlex/inference/models/common/vlm/transformers/utils.py
+++ b/paddlex/inference/models/common/vlm/transformers/utils.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import importlib
import inspect
import os
from contextlib import ExitStack
@@ -118,10 +117,6 @@ def is_paddle_support_lazy_init():
return hasattr(paddle, "LazyGuard")
-def is_safetensors_available():
- return importlib.util.find_spec("safetensors") is not None
-
-
def paddlenlp_load(path, map_location="cpu"):
assert map_location in ["cpu", "gpu", "xpu", "npu", "numpy", "np"]
if map_location in ["numpy", "np"]:
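With the local is_safetensors_available helper removed, optional-dependency checks go through the shared paddlex.utils.deps helpers used elsewhere in this patch (is_dep_available for a boolean probe, require_deps to fail loudly). A small usage sketch, assuming require_deps raises an informative error when the package is missing, as its use in load_state_dict above suggests:

from paddlex.utils.deps import require_deps

require_deps("safetensors")  # raises an informative error if the optional package is missing
from safetensors import safe_open  # safe to import after the check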
diff --git a/paddlex/inference/models/doc_vlm/modeling/__init__.py b/paddlex/inference/models/doc_vlm/modeling/__init__.py
index e4ad8559ef..298a5b85d2 100644
--- a/paddlex/inference/models/doc_vlm/modeling/__init__.py
+++ b/paddlex/inference/models/doc_vlm/modeling/__init__.py
@@ -13,5 +13,6 @@
# limitations under the License.
from .GOT_ocr_2_0 import PPChart2TableInference
+from .paddleocr_vl import PaddleOCRVLForConditionalGeneration
from .qwen2_5_vl import PPDocBee2Inference
from .qwen2_vl import PPDocBeeInference, Qwen2VLForConditionalGeneration
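The new model class is re-exported from the doc_vlm modeling package alongside the existing entries, so it can be imported the same way as the others:

from paddlex.inference.models.doc_vlm.modeling import PaddleOCRVLForConditionalGeneration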
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/__init__.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/__init__.py
new file mode 100644
index 0000000000..42126a0ce8
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ._paddleocr_vl import PaddleOCRVLForConditionalGeneration
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py
new file mode 100644
index 0000000000..6c563a0cda
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_config.py
@@ -0,0 +1,188 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/configuration_keye.py
+# Original header:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ....common.vlm.transformers import PretrainedConfig
+
+
+class PaddleOCRVisionConfig(PretrainedConfig):
+ model_type = "paddleocr_vl"
+ base_config_key = "vision_config"
+
+ def __init__(
+ self,
+ hidden_size=768,
+ intermediate_size=3072,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_channels=3,
+ image_size=224,
+ patch_size=14,
+ hidden_act="gelu_pytorch_tanh",
+ layer_norm_eps=1e-6,
+ attention_dropout=0.0,
+ spatial_merge_size=2,
+ temporal_patch_size=2,
+ tokens_per_second=2,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.image_size = image_size
+ self.attention_dropout = attention_dropout
+ self.layer_norm_eps = layer_norm_eps
+ self.hidden_act = hidden_act
+ self.spatial_merge_size = spatial_merge_size
+ self.temporal_patch_size = temporal_patch_size
+ self.tokens_per_second = tokens_per_second
+
+
+class PaddleOCRVLConfig(PretrainedConfig):
+ model_type = "paddleocr_vl"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ sub_configs = {"vision_config": PaddleOCRVisionConfig}
+
+ base_model_tp_plan = {
+ "layers.*.self_attn.q_proj": "colwise",
+ "layers.*.self_attn.k_proj": "colwise",
+ "layers.*.self_attn.v_proj": "colwise",
+ "layers.*.self_attn.o_proj": "rowwise",
+ "layers.*.mlp.gate_proj": "colwise",
+ "layers.*.mlp.up_proj": "colwise",
+ "layers.*.mlp.down_proj": "rowwise",
+ }
+ base_model_pp_plan = {
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+ "norm": (["hidden_states"], ["hidden_states"]),
+ }
+
+ def __init__(
+ self,
+ vocab_size=32000,
+ hidden_size=768,
+ intermediate_size=11008,
+ max_position_embeddings=32768,
+ num_hidden_layers=2,
+ num_attention_heads=2,
+ image_token_id=101304,
+ video_token_id=101305,
+ vision_start_token_id=101306,
+ rope_scaling=None,
+ rms_norm_eps=1e-6,
+ use_cache=False,
+ use_flash_attention=False,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ head_dim=128,
+ hidden_act="silu",
+ use_bias=False,
+ rope_theta=10000,
+ weight_share_add_bias=True,
+ ignored_index=-100,
+ attention_probs_dropout_prob=0.0,
+ hidden_dropout_prob=0.0,
+ compression_ratio: float = 1.0,
+ num_key_value_heads=None,
+ max_sequence_length=None,
+ tie_word_embeddings=False,
+ vision_config=None,
+ **kwargs,
+ ):
+ # Set default for tied embeddings if not specified.
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+ if isinstance(vision_config, dict):
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
+ elif vision_config is None:
+ self.vision_config = self.sub_configs["vision_config"]()
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.max_position_embeddings = max_position_embeddings
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.rope_scaling = rope_scaling
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
+ self.use_flash_attention = use_flash_attention
+ self.pad_token_id = pad_token_id
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.image_token_id = image_token_id
+ self.video_token_id = video_token_id
+ self.vision_start_token_id = vision_start_token_id
+ self.head_dim = head_dim
+ if hidden_act != "silu":
+ raise NotImplementedError
+ self.hidden_act = hidden_act
+ self.hidden_size = hidden_size
+ self.use_bias = use_bias
+ self.weight_share_add_bias = weight_share_add_bias
+ self.rope_theta = rope_theta
+ self.ignored_index = ignored_index
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.compression_ratio = compression_ratio
+ self.num_key_value_heads = num_key_value_heads
+ self.max_sequence_length = max_sequence_length
+
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+ # Currently, these configuration items are hard-coded
+ from ......utils.env import get_paddle_cuda_version
+
+ cuda_version = get_paddle_cuda_version()
+ if cuda_version and cuda_version[0] > 11:
+ self.fuse_rms_norm = True
+ else:
+ self.fuse_rms_norm = False
+ self.use_sparse_flash_attn = True
+ self.use_var_len_flash_attn = False
+ self.scale_qk_coeff = 1.0
+ self.fuse_softmax_mask = False
+ self.use_sparse_head_and_loss_fn = False
+ self.use_recompute_loss_fn = False
+ self.use_fused_head_and_loss_fn = False
+ self.fuse_linear = False
+ self.token_balance_seqlen = False
+ self.use_rmsnorm = True
+ self.fuse_ln = False
+ self.cachekv_quant = False
+ self.fuse_swiglu = False
+ self.freq_allocation = 20
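PaddleOCRVLConfig nests a PaddleOCRVisionConfig under vision_config: passing a dict builds the sub-config, omitting it uses the defaults, and the hard-coded fusion flags are then filled in based on the detected CUDA version. A minimal construction sketch (values are illustrative, not the released model's configuration):

from paddlex.inference.models.doc_vlm.modeling.paddleocr_vl._config import PaddleOCRVLConfig

config = PaddleOCRVLConfig(
    vocab_size=32000,
    hidden_size=768,
    vision_config={"hidden_size": 768, "patch_size": 14},  # becomes a PaddleOCRVisionConfig
)
print(type(config.vision_config).__name__)  # PaddleOCRVisionConfig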
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/__init__.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/__init__.py
new file mode 100644
index 0000000000..64c5821cf2
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/__init__.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Distributed utils
+"""
+
+# TODO: Support XPU
+
+import paddle
+
+from .common_dist_utils import (
+ AllGatherVarlenOp,
+ ColumnParallelLinear,
+ ColumnSequenceParallelLinear,
+ GatherOp,
+ RowParallelLinear,
+ RowSequenceParallelLinear,
+ RRColumnSequenceParallelLinear,
+ RRRowSequenceParallelLinear,
+ ScatterOp,
+ get_hcg,
+ mark_as_sequence_parallel_parameter,
+ sequence_parallel_sparse_mask_labels,
+)
+
+__all__ = [
+ "ColumnParallelLinear",
+ "ColumnSequenceParallelLinear",
+ "RowParallelLinear",
+ "RowSequenceParallelLinear",
+ "GatherOp",
+ "ScatterOp",
+ "mark_as_sequence_parallel_parameter",
+ "ParallelCrossEntropy",
+ "get_rng_state_tracker",
+ "parallel_matmul",
+ "RRColumnSequenceParallelLinear",
+ "RRRowSequenceParallelLinear",
+ "AllGatherVarlenOp",
+ "sequence_parallel_sparse_mask_labels",
+ "get_hcg",
+]
+
+
+def parallel_matmul(
+ x,
+ y,
+ bias=None,
+ transpose_y=False,
+ tensor_parallel_degree=1,
+ tensor_parallel_output=True,
+ fuse_linear=False,
+ training=None,
+):
+ """
+ Parallel matmul wrapper.
+
+ Args:
+ x (Tensor): Input tensor.
+ y (Tensor): Weight tensor.
+ bias (Tensor, optional): Bias tensor. Default is None.
+ transpose_y (bool, optional): Whether to transpose y. Default is False.
+ tensor_parallel_degree (int, optional): Tensor parallel degree. Default is 1.
+ tensor_parallel_output (bool, optional): Whether to output tensor parallel. Default is True.
+ fuse_linear (bool, optional): Whether to fuse linear. Default is False.
+ training (bool, optional): Training state. Default is None.
+ Returns:
+ Tensor: Output tensor.
+ """
+ if paddle.is_compiled_with_xpu():
+ from .common_dist_utils import _parallel_matmul as default_parallel_matmul
+
+ return default_parallel_matmul(
+ x,
+ y,
+ bias=bias,
+ transpose_y=transpose_y,
+ tensor_parallel_degree=tensor_parallel_degree,
+ tensor_parallel_output=tensor_parallel_output,
+ fuse_linear=fuse_linear,
+ )
+ else:
+ from .common_dist_utils import _parallel_matmul
+
+ return _parallel_matmul(
+ x,
+ y,
+ bias=bias,
+ transpose_y=transpose_y,
+ tensor_parallel_degree=tensor_parallel_degree,
+ tensor_parallel_output=tensor_parallel_output,
+ fuse_linear=fuse_linear,
+ )
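parallel_matmul wraps the shared _parallel_matmul kernel; with tensor_parallel_degree=1 it degenerates to a plain (optionally fused) linear and needs no distributed launch. A single-process sketch of that degenerate case:

import paddle

from paddlex.inference.models.doc_vlm.modeling.paddleocr_vl._distributed import parallel_matmul

x = paddle.randn([2, 4, 8])
w = paddle.randn([8, 16])

# With tensor_parallel_degree=1 this is equivalent to paddle.matmul(x, w).
logits = parallel_matmul(x, w, tensor_parallel_degree=1)
print(logits.shape)  # [2, 4, 16]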
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/common_dist_utils.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/common_dist_utils.py
new file mode 100644
index 0000000000..bc69262193
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_distributed/common_dist_utils.py
@@ -0,0 +1,713 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Common distributed utils.
+"""
+
+import paddle
+import paddle.nn.functional as F
+from paddle import distributed as dist
+from paddle.autograd import PyLayer
+from paddle.distributed import fleet
+from paddle.distributed.fleet.meta_parallel import (
+ ColumnParallelLinear,
+ RowParallelLinear,
+)
+from paddle.distributed.fleet.utils.sequence_parallel_utils import (
+ AllGatherOp,
+ ColumnSequenceParallelLinear,
+ GatherOp,
+ ReduceScatterOp,
+ RowSequenceParallelLinear,
+ ScatterOp,
+ all_gather,
+ mark_as_sequence_parallel_parameter,
+ scatter,
+)
+from paddle.incubate.tensor.manipulation import create_async_load
+
+from .._refined_recompute.utils import RefinedRecomputeFunction
+
+__all__ = [
+ "get_hcg",
+ "_parallel_matmul",
+ "scatter_axis",
+ "mp_slice",
+ "all_gather_varlen",
+ "ColumnParallelLinear",
+ "ColumnSequenceParallelLinear",
+ "RowParallelLinear",
+ "RowSequenceParallelLinear",
+ "GatherOp",
+ "ScatterOp",
+ "mark_as_sequence_parallel_parameter",
+ "RRColumnSequenceParallelLinear",
+ "RRRowSequenceParallelLinear",
+ "AllGatherVarlenOp",
+ "sequence_parallel_sparse_mask_labels",
+ "get_async_loader",
+ "hack_offload_wait",
+ "hack_reload_wait",
+ "all_gather_group",
+ "reduce_scatter_group",
+]
+
+
+def get_hcg():
+ """
+ Get hybrid communicate group.
+ """
+ return fleet.get_hybrid_communicate_group()
+
+
+def _parallel_matmul(
+ x,
+ y,
+ bias=None,
+ transpose_y=False,
+ tensor_parallel_degree=1,
+ tensor_parallel_output=True,
+ fuse_linear=False,
+):
+ """
+ Performs parallel matrix multiplication with tensor model parallelism support.
+
+ Args:
+ x (paddle.Tensor): Input tensor with shape [batch_size, seq_len, hidden_size]
+ y (Union[paddle.Tensor, EagerParamBase]): Weight matrix which can be:
+ - Regular tensor
+ - Distributed parameter in tensor parallel mode
+ bias (Optional[paddle.Tensor]): Optional bias tensor
+ transpose_y (bool): Whether to transpose the 'y' matrix before multiplication
+ tensor_parallel_degree (int): Degree of tensor model parallelism (default: 1)
+ tensor_parallel_output (bool): Whether to keep output in tensor parallel format
+ or gather across devices (default: True)
+ fuse_linear (bool): Whether to use fused linear operation for optimization
+
+ Returns:
+ paddle.Tensor
+
+ Raises:
+ AssertionError: If tensor parallel is enabled but weight is not distributed
+ AttributeError: If called without distributed.launch context
+ """
+ if tensor_parallel_degree > 1:
+ if isinstance(y, paddle.base.framework.EagerParamBase):
+ assert y.is_distributed
+ # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg'
+ pg = fleet.get_hybrid_communicate_group().get_model_parallel_group()
+ input_parallel = paddle.distributed.collective._c_identity(x, group=pg)
+
+ if transpose_y:
+ logits = paddle.matmul(input_parallel, y, transpose_y=True)
+ if bias is not None:
+ logits += bias
+ else:
+ if fuse_linear:
+ logits = paddle.incubate.nn.functional.fused_linear(
+ input_parallel, y, bias
+ )
+ else:
+ logits = F.linear(input_parallel, y, bias)
+
+ if tensor_parallel_output:
+ return logits
+
+ return paddle.distributed.collective._c_concat(logits, group=pg)
+
+ else:
+ if fuse_linear:
+ logits = paddle.incubate.nn.functional.fused_linear(
+ x, y, bias, transpose_weight=transpose_y
+ )
+ else:
+ logits = paddle.matmul(x, y, transpose_y=transpose_y)
+ if bias is not None:
+ logits += bias
+ return logits
+
+
+def scatter_axis(input, group=None, axis=0):
+ """
+    Uniformly splits the `input` along the given axis (default 0) across the model parallel group.
+ This API is not related to `distributed.scatter`.
+
+ Args:
+ input: Input tensor to be split
+ group: Communication group for parallel processing (default: model parallel group)
+ axis: Dimension along which to split (default: 0)
+
+ Returns:
+ A slice of the input tensor corresponding to this rank's portion
+ """
+ if group is None:
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ parallelism = group.nranks
+ if parallelism == 1:
+ return input.clone()
+ rank = group.rank
+ seq_len = input.shape[axis]
+ assert seq_len % parallelism == 0, (
+ f"Input sequence length {seq_len} can't be divided exactly"
+ f" by sequence parallelism {parallelism}"
+ )
+ interval = seq_len // parallelism
+ input = paddle.slice(
+ input, axes=[axis], starts=[interval * rank], ends=[interval * (rank + 1)]
+ )
+    # paddle.slice returns a strided view that keeps the whole input alive; assign copies the
+    # slice so the original tensor can be freed, which helps avoid OOM.
+ input = paddle.assign(input)
+ return input
+
+
+def mp_slice(x, indices=None, group=None, axis=0):
+ """
+ Slices tensor `x` along dimension 0 according to `indices` without communication.
+
+ Args:
+ x: Input tensor to be sliced
+ indices: List of indices defining how to slice the tensor
+ group: Communication group for parallel processing (default: model parallel group)
+ axis: Dimension along which to slice (default: 0)
+
+ Returns:
+ A slice of the input tensor corresponding to this rank's portion
+ """
+ if indices is None:
+ return scatter(x, group, axis)
+ if group is None:
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ parallelism = group.nranks
+ if parallelism == 1:
+ return x
+ rank = group.rank
+ assert len(indices) == parallelism, (len(indices), parallelism)
+ indices = F.pad(paddle.to_tensor(indices).cumsum(0), [1, 0])
+ input = paddle.slice(
+ x, axes=[axis], starts=[indices[rank]], ends=[indices[rank + 1]]
+ )
+ input = paddle.assign(input)
+ return input
+
+
+def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True):
+ """
+ Variable-length version of `all_gather` that behaves similarly to `distributed.all_gather`.
+
+ Args:
+ input: Local tensor to be gathered
+ indices: List of sizes from each rank indicating how much to gather from each
+ group: Communication group for parallel processing (default: model parallel group)
+ axis: Dimension along which to gather (only 0 is supported)
+ sync_op: Whether to synchronize the operation
+
+ Returns:
+ A concatenated tensor containing all gathered data
+ """
+ assert axis == 0, "only support axis=0"
+ if group is None:
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ parallelism = group.nranks
+ input_sizes = [len(input)] * parallelism
+ output_sizes = indices
+ out = paddle.empty([sum(indices)] + input.shape[1:], dtype=input.dtype)
+ task = dist.stream.alltoall_single(
+ out,
+ (
+ paddle.concat([input] * parallelism, 0) if len(input) else input
+ ), # TODO: check this
+ output_sizes, # input-size
+ input_sizes,
+ group=group,
+ sync_op=sync_op,
+ use_calc_stream=sync_op,
+ )
+ task.wait()
+ return out
+
+
+class ReduceScatterGroupOp(PyLayer):
+ """
+ Perform group reduce scatter.
+ """
+
+ @staticmethod
+ def forward(ctx, input, group=None):
+ """Forward pass: Reduce-Scatter operation
+ Args:
+ input (Tensor): Input tensor with shape [s, b, h].
+ The 's' dimension will be split across model parallel group.
+ group (ProcessGroup): Model parallel process group,
+ uses global group by default.
+ Returns:
+ Tensor: Output tensor after Reduce-Scatter with shape [s/n, b, h],
+ each device holds partial data of the original input.
+ """
+ ctx.group = group
+ return reduce_scatter_group(input, group=group)
+
+ @staticmethod
+ def backward(ctx, grad):
+ """Backward pass: All-Gather operation
+ Args:
+ grad (Tensor): Upstream gradient with shape [s/n, b, h]
+ Returns:
+ Tensor: Full gradient after All-Gather with restored shape [s, b, h],
+ aggregating gradients from all devices in model parallel group.
+ """
+ return all_gather_group(grad, group=ctx.group)
+
+
+class AllGatherGroupOp(PyLayer):
+ """
+ Perform group allgather.
+ """
+
+ @staticmethod
+ def forward(ctx, input, group=None):
+ """Forward pass: All-Gather operation
+ Args:
+ input (Tensor): Partitioned tensor with shape [s/n, b, h]
+ The 's' dimension is distributed across devices
+ group (ProcessGroup): Model parallel process group,
+ uses global group by default
+ Returns:
+ Tensor: Assembled tensor after All-Gather with shape [s, b, h],
+ containing full parameter from all devices
+ """
+ ctx.group = group
+ return all_gather_group(input, group=group)
+
+ @staticmethod
+ def backward(ctx, grad):
+ """Backward pass: Reduce-Scatter operation
+ Args:
+ grad (Tensor): Full gradient tensor with shape [s, b, h]
+ Returns:
+ Tensor: Scattered gradient with shape [s/n, b, h],
+ distributing reduced gradients to each device
+ """
+ return reduce_scatter_group(grad, group=ctx.group)
+
+
+class RRColumnSequenceParallelLinear(ColumnSequenceParallelLinear):
+ """
+ ColumnSequenceParallelLinear with refined recompute.
+ """
+
+ def __init__(
+ self,
+ in_features,
+ out_features,
+ weight_attr=None,
+ has_bias=None,
+ gather_output=True,
+ fuse_matmul_bias=False,
+ mp_group=None,
+ use_rr=False,
+ name=None,
+ ):
+ """
+        Initializes an RRColumnSequenceParallelLinear module.
+
+ Args:
+ in_features (int): The number of input features.
+ out_features (int): The number of output features.
+ weight_attr (ParamAttr, optional): The parameter attribute for the learnable
+ weight matrix. Default: None.
+            has_bias (bool, optional): Whether the layer uses a bias term. If True, a bias
+                vector is added; if False, no bias is used. Default: None.
+            gather_output (bool, optional): Whether to gather the outputs from all ranks during
+                the forward pass so that every rank holds the full output. If False, each rank
+                keeps only its own output partition, which saves communication. Default: True.
+ fuse_matmul_bias (bool, optional): Whether to fuse matmul and bias into one op. Default: False.
+ mp_group (paddle.distributed.Group, optional): The group for model parallel. Default: None.
+            use_rr (bool, optional): Whether to use refined recompute. Default: False.
+ name (str, optional): Name for the instance to use in tracebacks. Default: None.
+ """
+ super().__init__(
+ in_features=in_features,
+ out_features=out_features,
+ weight_attr=weight_attr,
+ has_bias=has_bias,
+ gather_output=gather_output,
+ fuse_matmul_bias=fuse_matmul_bias,
+ mp_group=mp_group,
+ name=name,
+ )
+
+ self._rr_column_ln = RefinedRecomputeFunction() if use_rr else None
+ if self.weight.is_distributed:
+ self.weight.split_axis = 1
+ if has_bias and self.bias.is_distributed:
+ self.bias.split_axis = 0
+
+ def forward(self, x):
+ """
+ Forward pass function that computes the product of the input tensor and model parameters.
+
+ Args:
+ x (paddle.Tensor): Input tensor with shape (batch_size, seq_len, hidden_size) or (batch_size, hidden_size).
+ If sequence parallel is True, the shape is (seq_len, batch_size, hidden_size).
+
+ Returns:
+ paddle.Tensor: Returns a tensor with shape (batch_size, seq_len, hidden_size) or (batch_size, hidden_size).
+ If sequence parallel is True, the shape is (seq_len, batch_size, hidden_size).
+ """
+ # sequence parallelism is same as model parallelism
+ # if sequence parallel is true, input shape is [s, b, h]
+ # else input shape is [b, s, h]
+ if self.is_mp:
+ input_parallel = AllGatherOp.apply(x)
+ else:
+ input_parallel = x
+
+ if (
+ self._rr_column_ln is not None and self.training
+ ): # in eval mode, do not use refined recompute
+ output = self._rr_column_ln(
+ self.linear,
+ x=input_parallel,
+ weight=self.weight,
+ bias=self.bias,
+ )
+ else:
+ output = self.linear(
+ input_parallel, self.weight, self.bias, name=self._name
+ )
+ return output
+
+
+class RRRowSequenceParallelLinear(RowSequenceParallelLinear):
+ """
+ RowSequenceParallelLinear with refined recompute.
+ """
+
+ def __init__(
+ self,
+ in_features,
+ out_features,
+ weight_attr=None,
+ has_bias=True,
+ input_is_parallel=False,
+ fuse_matmul_bias=False,
+ mp_group=None,
+ use_rr=False,
+ name=None,
+ ):
+ """
+ Args:
+ in_features (int): The number of input features.
+ out_features (int): The number of output features.
+ weight_attr (ParamAttr, optional): The parameter attribute for the learnable
+ weight matrix. Defaults to None. If it is None, the system will
+ generate a default Attribute object.
+ has_bias (bool, optional): Whether the layer uses a bias term. Defaults to True.
+ input_is_parallel (bool, optional): Whether the input is parallel. Defaults to False.
+ fuse_matmul_bias (bool, optional): Whether to fuse matmul and bias into one kernel. Defaults to False.
+ mp_group (Group, optional): Model parallel group. Defaults to None.
+            use_rr (bool, optional): Whether to use refined recompute. Defaults to False.
+ name (str, optional): Name of the layer. Defaults to None.
+ """
+ super().__init__(
+ in_features=in_features,
+ out_features=out_features,
+ weight_attr=weight_attr,
+ has_bias=has_bias,
+ input_is_parallel=input_is_parallel,
+ fuse_matmul_bias=fuse_matmul_bias,
+ mp_group=mp_group,
+ name=name,
+ )
+
+ self._rr_row_ln = RefinedRecomputeFunction() if use_rr else None
+
+ if self.weight.is_distributed:
+ self.weight.split_axis = 0
+
+ def forward(self, x):
+ """
+ Forward pass function that computes the product of the input tensor and model parameters.
+
+ Args:
+ x (paddle.Tensor): Input tensor with shape (batch_size, in_features).
+
+ Returns:
+ paddle.Tensor: Returns a tensor with shape (batch_size, out_features).
+ """
+ input_parallel = x
+ if self.is_mp:
+ if self.mp_scale is not None:
+ bias = self.mp_scale(self.bias, self.world_size)
+ else:
+ bias = None
+
+ def linear_reduce_scatter(input, weight, bias=None, name=None):
+ output = self.linear(input, weight=weight, bias=bias, name=name)
+ return ReduceScatterOp.apply(output)
+
+ if (
+ self._rr_row_ln is not None and self.training
+ ): # in eval mode, do not use refined recompute
+ output_ = self._rr_row_ln(
+ linear_reduce_scatter,
+ input_parallel,
+ self.weight,
+ bias=bias,
+ name=self._name,
+ )
+ else:
+ output_ = linear_reduce_scatter(
+ input_parallel, self.weight, bias=bias, name=self._name
+ )
+
+            # If self.bias is not None, sequence parallel will use
+            # register_hook to all_reduce self.bias.
+ if bias is None and self.bias is not None:
+ output = output_ + self.bias
+ else:
+ output = output_
+ else:
+ output = self.linear(
+ input_parallel, self.weight, self.bias, name=self._name
+ )
+ return output
+
+
+class AllGatherVarlenOp(PyLayer):
+ """
+ A custom PyLayer that performs variable-length allgather operation.
+
+ This operation handles tensors with different shapes across ranks by:
+ 1. Gathering shape information from all ranks
+ 2. Padding tensors to maximum size
+ 3. Performing allgather
+ 4. Reconstructing the original variable-length tensors
+ """
+
+ @staticmethod
+ def forward(ctx, input):
+ """Forward pass for variable-length allgather operation.
+
+ Args:
+ ctx: PyLayer context for saving state
+ input (Tensor): Input tensor to be gathered (may have different sizes across ranks)
+
+ Returns:
+ Tensor: Concatenated output from all ranks with original lengths
+ """
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+
+ shape0 = paddle.to_tensor([input.shape[0]])
+ shape0_all = paddle.empty(shape=[group.nranks], dtype=shape0.dtype)
+ dist.stream.all_gather(shape0_all, shape0, group=group, use_calc_stream=True)
+ shape0_all = shape0_all.numpy()
+ max_shape0 = shape0_all.max()
+
+ indices = []
+ for idx, s in enumerate(shape0_all):
+ offset = idx * max_shape0
+ indices.extend(list(range(offset, offset + s)))
+ indices = paddle.to_tensor(indices)
+
+ padding = max_shape0 - input.shape[0]
+
+ ctx.shape0 = input.shape[0]
+ ctx.max_shape0 = max_shape0
+ ctx.shape0_all = shape0_all
+ ctx.padding = padding
+ ctx.indices = indices
+
+ if padding > 0:
+ input_shape = input.shape
+ input_shape[0] = padding
+ padding_tensor = paddle.empty(shape=input_shape, dtype=input.dtype)
+ input = paddle.concat([input, padding_tensor], axis=0)
+ output = all_gather(input)
+ output = paddle.gather(output, indices, axis=0)
+
+ return output
+
+ @staticmethod
+ def backward(ctx, grad):
+ """Backward pass for variable-length allgather operation.
+
+ Args:
+ ctx: PyLayer context with saved state
+ grad (Tensor): Gradient flowing back through the graph
+
+ Returns:
+ Tensor: Scattered gradient with original variable lengths
+ """
+ input_shape = grad.shape
+ input_shape[0] = ctx.max_shape0 * ctx.shape0_all.shape[0]
+ output = paddle.zeros(shape=input_shape, dtype=grad.dtype)
+
+ # grad = paddle.put_along_axis(output, ctx.indices, grad, axis=0)
+ grad = paddle.scatter(output, ctx.indices, grad)
+ grad = scatter(grad)
+
+ if ctx.padding > 0:
+ grad = grad[: ctx.shape0]
+ return grad
+
+
+def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
+ """
+ Processes sparse labels in sequence parallel training by gathering non-ignored labels across all ranks.
+
+ This function handles the case where labels may contain ignored values (typically -100) by:
+ 1. Distributing labels across model parallel ranks
+ 2. Identifying and gathering only valid (non-ignored) labels
+ 3. Performing a variable-length allgather operation to collect all valid labels
+
+ Args:
+ labels (paddle.Tensor): The input label tensor which may contain ignore_label values.
+ Shape should be compatible with model parallel distribution.
+ ignore_label (int, optional): The value used to indicate labels that should be ignored.
+ Defaults to -100 (common convention in NLP tasks).
+
+ Returns:
+ tuple: Contains two elements:
+ - labels_all_gather (paddle.Tensor): Concatenated tensor of all non-ignored labels
+ from all model parallel ranks.
+ - tgt_index (paddle.Tensor): Indices of the non-ignored labels in the local rank's
+ portion of the original labels tensor.
+
+ Note:
+ - This function assumes sequence parallel training is being used.
+ - If a rank has no valid labels (all ignored), it will still contribute one dummy label
+ (index 0) to maintain consistency in the distributed computation.
+ - The returned tgt_index can be used to reconstruct the original label positions.
+ """
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ labels = labels.flatten()
+ labels_local = paddle.split(labels, group.nranks)[group.rank]
+
+ tgt_index = paddle.nonzero(labels_local != ignore_label).reshape([-1])
+ if tgt_index.numel() == 0:
+ tgt_index = paddle.to_tensor([0])
+
+ labels_local_gather = paddle.gather(labels_local, tgt_index, axis=0)
+ labels_all_gather = AllGatherVarlenOp.apply(labels_local_gather)
+ return labels_all_gather, tgt_index
+
+
+async_loader = None
+
+
+def get_async_loader():
+ """get_async_loader"""
+ global async_loader
+ if not hasattr(fleet.fleet, "_hcg"):
+ if async_loader is None:
+ async_loader = create_async_load()
+ return async_loader
+
+ hcg = get_hcg()
+ if not hasattr(hcg, "async_loader"):
+ hcg.async_loader = create_async_load()
+ return hcg.async_loader
+
+
+def hack_offload_wait(task):
+ """hack_offload_wait"""
+ task.cpu_wait()
+
+
+def hack_reload_wait(task):
+ """hack_offload_wait"""
+ task.cuda_wait()
+
+
+def all_gather_group(input, group=None, axis=0):
+ """Perform collective all-gather operation across a process group with axis control.
+
+ Functional Behavior:
+ - Aggregates input tensors from all processes in the specified group
+ - Supports concatenation along arbitrary dimensions (axis parameter)
+ - Optimizes for axis=0 via direct shape expansion to avoid concatenation overhead
+
+ Args:
+ input (Tensor): Local tensor to be gathered (shape: [..., D, ...])
+ group (ProcessGroup): Communication group (defaults to model parallel group)
+ axis (int): Concatenation dimension (default=0)
+
+ Returns:
+ Tensor: Concatenated tensor combining inputs from all processes:
+ - When axis=0: shape [D*N, ...] (N = group size)
+ - Otherwise: shape [..., D*N, ...] along specified axis
+ """
+ if group is None:
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ parallelism = group.nranks
+ if parallelism == 1:
+ return input.clone()
+ output_shape = input.shape
+ if axis == 0:
+ output_shape[axis] = output_shape[axis] * parallelism
+ output = paddle.empty(shape=output_shape, dtype=input.dtype)
+ dist.stream.all_gather(output, input, group=group, use_calc_stream=True)
+ return output
+ outputs = [
+ paddle.empty(output_shape, dtype=input.dtype) for _ in range(parallelism)
+ ]
+ dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True)
+ output = paddle.concat(outputs, axis=axis)
+ return output
+
+
+def reduce_scatter_group(input, group=None):
+ """Perform reduce-scatter collective operation across a process group.
+
+ Functional Behavior:
+ - Aggregates (sums) input tensors across all processes in the group
+ - Scatters the reduced result equally to all participants
+ - Operates along the first dimension (axis=0) of the input tensor
+
+ Args:
+ input (Tensor): Local tensor to reduce (shape: [N*K, ...] where N=group_size)
+ group (ProcessGroup): Communication group (defaults to model parallel group)
+
+ Returns:
+ Tensor: Scattered portion of reduced tensor with shape [K, ...]
+ """
+ if group is None:
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ parallelism = group.nranks
+ if parallelism == 1:
+ return input.clone()
+ output_shape = input.shape
+ assert (
+ input.shape[0] % parallelism == 0
+ ), f"Input sequence length {input.shape[0]} can't be divided exactly by sequence parallelism {parallelism}"
+ output_shape[0] = output_shape[0] // parallelism
+ output = paddle.empty(shape=output_shape, dtype=input.dtype)
+ dist.stream.reduce_scatter(
+ output, input, op=dist.ReduceOp.SUM, group=group, use_calc_stream=True
+ )
+ return output
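scatter_axis, all_gather_group, and reduce_scatter_group share one shape contract along the split axis: a full tensor of length D*N on each rank maps to a shard of length D, and vice versa, for a group of N ranks. A single-process numpy analogue of the slicing arithmetic in scatter_axis (no communication; rank and parallelism stand in for the group's rank and size):

import numpy as np


def scatter_axis_local(x: np.ndarray, rank: int, parallelism: int, axis: int = 0) -> np.ndarray:
    # Mirrors the even-split arithmetic of scatter_axis for a single rank.
    seq_len = x.shape[axis]
    assert seq_len % parallelism == 0, "sequence length must divide evenly across ranks"
    interval = seq_len // parallelism
    index = [slice(None)] * x.ndim
    index[axis] = slice(interval * rank, interval * (rank + 1))
    return x[tuple(index)].copy()


x = np.arange(12).reshape(6, 2)
print(scatter_axis_local(x, rank=1, parallelism=3).shape)  # (2, 2)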
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py
new file mode 100644
index 0000000000..7e43b3cff0
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_ernie.py
@@ -0,0 +1,2363 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Paddle Ernie model"""
+
+import contextlib
+import functools
+from functools import partial
+from typing import Optional, Tuple
+
+import numpy as np
+import paddle
+import paddle.distributed as dist
+import paddle.nn.functional as F
+from paddle import incubate, nn, tensor
+from paddle.autograd import PyLayer
+from paddle.distributed import fleet
+from paddle.distributed.fleet.layers.mpu import mp_ops
+from paddle.distributed.fleet.layers.mpu.mp_layers import (
+ ColumnParallelLinear,
+ RowParallelLinear,
+ VocabParallelEmbedding,
+)
+from paddle.distributed.fleet.meta_parallel import (
+ ParallelCrossEntropy,
+ get_rng_state_tracker,
+)
+from paddle.distributed.fleet.utils import recompute
+
+from ......utils import logging
+from ....common.vlm.transformers import PretrainedModel
+from ....common.vlm.transformers.model_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+)
+from ._config import PaddleOCRVLConfig
+from ._distributed import (
+ AllGatherVarlenOp,
+ ColumnParallelLinear,
+ ColumnSequenceParallelLinear,
+ GatherOp,
+ RowParallelLinear,
+ RowSequenceParallelLinear,
+ RRColumnSequenceParallelLinear,
+ RRRowSequenceParallelLinear,
+ mark_as_sequence_parallel_parameter,
+ parallel_matmul,
+ sequence_parallel_sparse_mask_labels,
+)
+from ._fusion_ops import (
+ Linear,
+ fused_rms_norm_ext,
+ fused_swiglu,
+ fusion_flash_attention,
+)
+from ._sequence_parallel_utils import ScatterOp
+
+
+def calc_lm_head_logits(
+ config, hidden_states, weight, bias, tensor_parallel_output=None, training=True
+):
+ """
+ Calculate language model head logits with support for various parallelization strategies.
+
+ This is the core function that computes the final output logits for a language model,
+ handling sequence parallelism and tensor parallelism configurations.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ hidden_states (Tensor): Hidden states from the transformer layers
+ weight (Tensor): Weight matrix for the language model head
+ bias (Tensor): Bias vector for the language model head
+ tensor_parallel_output (bool, optional): Override for tensor parallel output behavior.
+ If None, uses config.tensor_parallel_output.
+ Defaults to None.
+ training (bool, optional): Whether in training mode. Defaults to True.
+
+ Returns:
+ Tensor: The computed logits for language modeling.
+ """
+ if config.sequence_parallel:
+ if config.use_sparse_head_and_loss_fn:
+ pass # Nothing needs to be done.
+ else:
+ hidden_states = GatherOp.apply(hidden_states)
+ max_sequence_length = config.max_sequence_length
+ hidden_states = hidden_states.reshape(
+ [-1, max_sequence_length, hidden_states.shape[-1]]
+ )
+
+ if tensor_parallel_output is None:
+ tensor_parallel_output = config.tensor_parallel_output
+ logits = parallel_matmul(
+ hidden_states,
+ weight,
+ bias=bias,
+ transpose_y=config.tie_word_embeddings,
+ tensor_parallel_degree=config.tensor_parallel_degree,
+ tensor_parallel_output=tensor_parallel_output,
+ fuse_linear=config.fuse_linear,
+ training=training,
+ )
+
+ return logits
+
+
+def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_arg_idx={}):
+ """
+ Converts a function to one that applies to subbatch of an input dimension.
+ This is useful for processing large tensors in smaller chunks to reduce memory usage.
+
+ Args:
+ f (Callable): Original function to be converted to subbatch processing.
+ arg_idx ([int]): Indices of the inputs to be subbatched.
+ axis ([int]): Indices of the dimensions to be subbatched for each input.
+ bs (int): Subbatch size (number of elements to process at once).
+ out_idx (int): Index of the output dimension that needs stacking.
+ use_recompute (bool, optional): Whether to use recomputation for memory savings. Defaults to False.
+ same_arg_idx (dict, optional): Mapping of argument indices that share the same tensor.
+ e.g. {1: 0} means args[1] == args[0], avoiding duplicate slicing.
+
+ Returns:
+ Callable: Converted function that processes inputs in subbatches.
+ """
+
+ @functools.wraps(f)
+ def wrapper(*args, **kwargs):
+
+ assert len(arg_idx) == len(
+ axis
+ ), "Number of batching args and number of batching dims should match."
+
+ inps = [args[i] for i in arg_idx]
+ axis_width = [inp.shape[d] for inp, d in zip(inps, axis)]
+ assert len(set(axis_width)) == 1, "Batch sizes should be kept equal."
+
+ inp_axis = {inp: d for inp, d in zip(inps, axis)}
+
+ axis_width = axis_width[0]
+ if axis_width < bs:
+ return f(*args, **kwargs)
+
+ outs = []
+ for slice_at in np.arange(0, axis_width, bs):
+ _args = []
+ for i, inp in enumerate(args):
+ if i in same_arg_idx:
+ assert (
+ i > same_arg_idx[i]
+ ), f"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}"
+ _args.append(_args[same_arg_idx[i]])
+ elif i in arg_idx:
+ inp = inp.slice(
+ [inp_axis[inp]],
+ [slice_at],
+ [min(inp.shape[inp_axis[inp]], slice_at + bs)],
+ )
+ _args.append(inp)
+ else:
+ _args.append(inp)
+ if use_recompute:
+ out = paddle.distributed.fleet.utils.recompute(f, *_args, **kwargs)
+ else:
+ out = f(*_args, **kwargs)
+ outs.append(out)
+
+ return paddle.concat(outs, out_idx)
+
+ return wrapper
+
+
+def _rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return paddle.concat((-x2, x1), axis=-1)
+
+
+def _apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+ mrope_section = mrope_section * 2
+ cos = paddle.concat(
+ [m[i % 3] for i, m in enumerate(cos.split(mrope_section, axis=-1))], axis=-1
+ ).unsqueeze(unsqueeze_dim)
+ sin = paddle.concat(
+ [m[i % 3] for i, m in enumerate(sin.split(mrope_section, axis=-1))], axis=-1
+ ).unsqueeze(unsqueeze_dim)
+
+ q_embed = (q * cos) + (_rotate_half(q) * sin)
+ k_embed = (k * cos) + (_rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class FusedDropoutImpl(nn.Layer):
+ """
+ Fused dropout implementation with residual connection support.
+
+ This layer combines dropout and residual addition in a single operation for better performance,
+ particularly on GPU devices. The dropout is conditionally applied based on the probability.
+
+ Args:
+ prob (float): Dropout probability (between 0 and 1)
+ mode (str): Dropout mode, either 'upscale_in_train' or 'downscale_in_infer'
+
+ Attributes:
+ prob (float): Stores the dropout probability
+ mode (str): Stores the dropout mode
+ dropout (nn.Dropout): The actual dropout layer instance
+ """
+
+ def __init__(self, prob, mode):
+ """
+ Initialize the fused dropout layer.
+
+ Args:
+ prob (float): Dropout probability (0 means no dropout)
+ mode (str): Dropout mode ('upscale_in_train' or 'downscale_in_infer')
+ """
+ super().__init__()
+ self.prob = prob
+ self.mode = mode
+ self.dropout = nn.Dropout(p=prob, mode=mode)
+
+ def forward(self, x, y):
+ """
+ Forward pass of the fused dropout layer.
+
+ Args:
+ x (Tensor): Input tensor to potentially apply dropout on
+ y (Tensor): Residual tensor to add to the (possibly dropped out) x
+
+ Returns:
+ Tensor: Result of x (with optional dropout) + y
+ """
+ if self.prob > 0:
+ x = self.dropout(x)
+ output = x + y
+
+ return output
+
+
+class RMSNorm(nn.Layer):
+ """
+ Root Mean Square Layer Normalization (RMSNorm) implementation.
+
+ RMSNorm is a simplified version of LayerNorm that focuses on the root mean square of inputs,
+ omitting the mean-centering operation. This provides computational efficiency while maintaining
+ good performance.
+
+ """
+
+ def __init__(self, config):
+ """
+ Initialize RMSNorm layer.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ """
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.weight = paddle.create_parameter(
+ shape=[self.hidden_size],
+ dtype=paddle.get_default_dtype(),
+ default_initializer=nn.initializer.Constant(1.0),
+ )
+ self.variance_epsilon = config.rms_norm_eps
+ self.config = config
+
+ if config.sequence_parallel:
+ mark_as_sequence_parallel_parameter(self.weight)
+
+ def forward(self, hidden_states):
+ """
+ Apply RMS normalization to input hidden states.
+
+ Args:
+ hidden_states (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tensor: Normalized output tensor of same shape as input
+
+ Note:
+ - Uses fused kernel if config.fuse_rms_norm is True for better performance
+ - Otherwise computes RMSNorm manually:
+ 1. Compute variance of features
+ 2. Apply reciprocal square root normalization
+ 3. Scale by learned weight parameter
+ - Maintains original dtype for numerical stability during computation
+ """
+ if self.config.fuse_rms_norm:
+ return fused_rms_norm_ext(
+ hidden_states, self.weight, self.variance_epsilon
+ )[0].astype(self.weight.dtype)
+ with paddle.amp.auto_cast(False):
+ variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True)
+ hidden_states = (
+ paddle.rsqrt(variance + self.variance_epsilon) * hidden_states
+ )
+ return hidden_states.astype(self.weight.dtype) * self.weight
+
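+# Editorial note on RMSNorm above: the manual branch computes, in float32,
+#     y = x * rsqrt(mean(x ** 2, axis=-1) + eps) * weight
+# and the `fuse_rms_norm` branch is assumed to compute the same quantity via the fused
+# `fused_rms_norm_ext` kernel.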
+
+class LayerNorm(nn.LayerNorm):
+ """
+ Layer Normalization (LayerNorm) implementation with optional optimizations.
+
+ This extends PaddlePaddle's built-in LayerNorm with:
+ 1. Sequence parallelism support
+ 2. Fast fused kernel implementation option
+ 3. Configurable epsilon value
+
+ """
+
+ def __init__(self, config):
+ """
+ Initialize LayerNorm with configuration.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration contains normalization parameters and flags.
+ """
+ super().__init__(config.hidden_size, epsilon=config.rms_norm_eps)
+ self.config = config
+ if config.sequence_parallel:
+ mark_as_sequence_parallel_parameter(self.weight)
+ mark_as_sequence_parallel_parameter(self.bias)
+
+
+class KeyeRotaryEmbedding(nn.Layer):
+ def __init__(self, config: PaddleOCRVLConfig, device=None):
+ super().__init__()
+ self.rope_kwargs = {}
+        if config is None:
+            raise NotImplementedError
+
+        # BC: "rope_type" was originally "type"
+        if config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get(
+                "rope_type", config.rope_scaling.get("type")
+            )
+        else:
+            self.rope_type = "default"
+
+ self.config = config
+ if self.rope_type == "default":
+ dim = config.head_dim
+ inv_freq = 1.0 / (
+ config.rope_theta
+ ** (paddle.arange(0, dim, 2, dtype="int64").astype("float32") / dim)
+ )
+ self.attention_scaling = 1.0
+ else:
+ raise ValueError(f"Unsupported rope type: {self.rope_type}")
+
+ self.register_buffer("inv_freq", inv_freq, persistable=False)
+ self.original_inv_freq = self.inv_freq
+
+ @paddle.no_grad()
+ def forward(self, x, position_ids):
+ # Core RoPE block. In contrast to other models, Keye has different position ids for the grids
+ # So we expand the inv_freq to shape (3, ...)
+ inv_freq_expanded = (
+ self.inv_freq[None, None, :, None]
+ .cast("float32")
+ .expand((3, position_ids.shape[1], -1, 1))
+ )
+ position_ids_expanded = position_ids[:, :, None, :].cast(
+ "float32"
+ ) # shape (3, bs, 1, positions)
+ with paddle.amp.auto_cast(enable=False):
+ freqs = (
+ inv_freq_expanded.cast("float32")
+ @ position_ids_expanded.cast("float32")
+ ).transpose((0, 1, 3, 2))
+ emb = paddle.concat((freqs, freqs), axis=-1)
+ cos = emb.cos()
+ sin = emb.sin()
+
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
+ cos = cos * self.attention_scaling
+ sin = sin * self.attention_scaling
+
+ return cos.astype(x.dtype), sin.astype(x.dtype)
+
+
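+# Editorial note on KeyeRotaryEmbedding above: position_ids are expected with a leading axis
+# of size 3 (one set of positions per grid, per the comment in forward), so inv_freq is
+# expanded to (3, bs, dim/2, 1) and the returned cos/sin keep that leading axis for the
+# m-RoPE split performed in `_apply_multimodal_rotary_pos_emb`.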
+class Ernie4_5MLP(nn.Layer):
+ """
+ Ernie4_5MLP - Gated Multi-Layer Perceptron module used in Ernie model.
+ """
+
+ def __init__(self, config, layer_idx=0):
+ """
+ Initialize the MLP module with configuration options.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configurations.
+ layer_idx (int): Index of current layer (default: 0)
+ """
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
+
+ if config.tensor_parallel_degree > 1:
+ ColumnLN = (
+ ColumnSequenceParallelLinear
+ if config.sequence_parallel
+ else ColumnParallelLinear
+ )
+ RowLN = (
+ RowSequenceParallelLinear
+ if config.sequence_parallel
+ else RowParallelLinear
+ )
+
+ column_ln_configs = {}
+ if (
+ config.recompute
+ and config.sequence_parallel
+ and config.skip_recompute_ops[layer_idx].get("mlp_column_ln", False)
+ ):
+ ColumnLN = RRColumnSequenceParallelLinear
+ column_ln_configs = {"use_rr": True}
+ self.up_gate_proj = ColumnLN(
+ self.hidden_size,
+ self.intermediate_size * 2,
+ gather_output=False,
+ has_bias=config.use_bias,
+ fuse_matmul_bias=config.fuse_linear,
+ **column_ln_configs,
+ )
+ else:
+ LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear
+ self.up_gate_proj = LinearFN(
+ self.hidden_size, self.intermediate_size * 2, bias_attr=config.use_bias
+ )
+
+ if config.tensor_parallel_degree > 1:
+ row_ln_configs = {}
+ if (
+ config.recompute
+ and config.sequence_parallel
+ and config.skip_recompute_ops[layer_idx].get("mlp_row_ln", False)
+ ):
+ RowLN = RRRowSequenceParallelLinear
+ row_ln_configs = {"use_rr": True}
+ self.down_proj = RowLN(
+ self.intermediate_size,
+ self.hidden_size,
+ input_is_parallel=True,
+ has_bias=config.use_bias,
+ fuse_matmul_bias=config.fuse_linear,
+ **row_ln_configs,
+ )
+ else:
+ LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear
+ self.down_proj = LinearFN(
+ self.intermediate_size, self.hidden_size, bias_attr=config.use_bias
+ )
+
+ self.fuse_swiglu = config.fuse_swiglu
+ if self.fuse_swiglu:
+ assert fused_swiglu is not None, "fused_swiglu operator is not found."
+
+ def forward(self, x):
+ """
+ Forward pass through the MLP module.
+
+ Args:
+ x (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
+
+ Returns:
+ Tensor: Output tensor of shape [batch_size, seq_len, hidden_size]
+
+ Note:
+ Implements SwiGLU activation: swish(Wx) * (Vx) where W and V are
+ the first and second halves of up_gate_proj output respectively.
+ """
+ if self.fuse_swiglu:
+ x = self.up_gate_proj(x)
+ x = fused_swiglu(x)
+ else:
+ gate, x = self.up_gate_proj(x).chunk(2, axis=-1)
+ x = F.silu(gate) * x
+ return self.down_proj(x)
+
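+# Editorial sketch of the unfused SwiGLU branch above (shapes are illustrative):
+# up_gate_proj maps [..., hidden_size] to [..., 2 * intermediate_size]; chunking along the
+# last axis yields `gate` and `x`, the activation is F.silu(gate) * x, and down_proj maps the
+# result back to [..., hidden_size]. The fused branch is assumed to compute the same thing
+# via the `fused_swiglu` operator.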
+
+class Ernie4_5Attention(nn.Layer):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config, layer_idx=0):
+ """Initialize the attention layer.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ layer_idx (int, optional): Index in transformer stack. Defaults to 0.
+ """
+ super().__init__()
+ self.layer_idx = layer_idx
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.num_key_value_heads = config.num_key_value_heads
+ if getattr(config, "head_dim", None) is None:
+ self.head_dim = self.hidden_size // self.num_heads
+ else:
+ self.head_dim = config.head_dim
+ self.is_gqa = (
+ config.num_key_value_heads is not None
+ and config.num_key_value_heads != self.num_heads
+ )
+
+ self.rope_scaling = config.rope_scaling
+
+ self.freq_allocation = config.get("freq_allocation", 0)
+
+ if config.tensor_parallel_degree > 1:
+ assert (
+ self.num_heads % config.tensor_parallel_degree == 0
+ ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
+ self.num_heads = self.num_heads // config.tensor_parallel_degree
+ if self.is_gqa:
+ assert (
+ self.num_key_value_heads % config.tensor_parallel_degree == 0
+ ), f"num_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}"
+ self.num_key_value_heads = (
+ self.num_key_value_heads // config.tensor_parallel_degree
+ )
+ if self.is_gqa:
+ logging.info(
+ f"use GQA - num_heads: {self.num_heads}- num_key_value_heads: {self.num_key_value_heads}"
+ )
+ assert (
+ self.num_heads % self.num_key_value_heads == 0
+ ), f"num_heads: {self.num_heads}, num_key_value_heads: {self.num_key_value_heads}"
+ if getattr(config, "head_dim", None) is None:
+ kv_hidden_size = (
+ self.hidden_size // self.num_heads * self.num_key_value_heads
+ )
+ else:
+ kv_hidden_size = self.head_dim * config.num_key_value_heads
+ q_hidden_size = self.head_dim * config.num_attention_heads
+ else:
+ q_hidden_size = kv_hidden_size = self.head_dim * config.num_attention_heads
+
+ if config.tensor_parallel_degree > 1:
+ column_ln_configs = {}
+ ColumnLN = (
+ ColumnSequenceParallelLinear
+ if config.sequence_parallel
+ else ColumnParallelLinear
+ )
+ RowLN = (
+ RowSequenceParallelLinear
+ if config.sequence_parallel
+ else RowParallelLinear
+ )
+ if (
+ config.recompute
+ and config.sequence_parallel
+ and config.skip_recompute_ops[layer_idx].get(
+ "attention_column_ln", False
+ )
+ ):
+ ColumnLN = RRColumnSequenceParallelLinear
+ column_ln_configs = {"use_rr": True}
+
+ if getattr(config, "head_dim", None) is None:
+ qkv_hidden_size = (
+ self.hidden_size * 3
+ if not self.is_gqa
+ else self.hidden_size + kv_hidden_size * 2
+ )
+ else:
+ qkv_hidden_size = q_hidden_size + kv_hidden_size * 2
+ self.qkv_proj = ColumnLN(
+ self.hidden_size,
+ qkv_hidden_size,
+ has_bias=config.use_bias,
+ gather_output=False,
+ fuse_matmul_bias=config.fuse_linear,
+ **column_ln_configs,
+ )
+ else:
+ LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear
+ if getattr(config, "head_dim", None) is None:
+ qkv_hidden_size = (
+ self.hidden_size * 3
+ if not self.is_gqa
+ else self.hidden_size + kv_hidden_size * 2
+ )
+ else:
+ qkv_hidden_size = q_hidden_size + kv_hidden_size * 2
+ self.qkv_proj = LinearFN(
+ self.hidden_size,
+ qkv_hidden_size,
+ bias_attr=config.use_bias,
+ )
+
+ if config.tensor_parallel_degree > 1:
+ row_ln_configs = {}
+ if (
+ config.recompute
+ and config.sequence_parallel
+ and config.skip_recompute_ops[layer_idx].get("attention_row_ln", False)
+ ):
+ RowLN = RRRowSequenceParallelLinear
+ row_ln_configs = {"use_rr": True}
+
+ self.o_proj = RowLN(
+ (
+ self.hidden_size
+ if getattr(config, "head_dim", None) is None
+ else q_hidden_size
+ ),
+ self.hidden_size,
+ has_bias=config.use_bias,
+ input_is_parallel=True,
+ fuse_matmul_bias=config.fuse_linear,
+ **row_ln_configs,
+ )
+ else:
+ LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear
+ self.o_proj = LinearFN(
+ (
+ self.hidden_size
+ if getattr(config, "head_dim", None) is None
+ else q_hidden_size
+ ),
+ self.hidden_size,
+ bias_attr=config.use_bias,
+ )
+ self.config = config
+
+ self._rr_flash_attn = None
+ if config.recompute and config.skip_recompute_ops[layer_idx].get(
+ "flash_attn", False
+ ):
+ # TODO
+ raise NotImplementedError
+
+ self.set_attn_func()
+
+ def set_attn_func(self):
+ """Configure attention function based on settings.
+
+ Selects between flash/core attention.
+ """
+ config = self.config
+ if config.use_flash_attention:
+ self.attn_func = self._flash_attention_wrapper
+ else:
+ self.attn_func = self.core_attn
+
+ if config.cachekv_quant:
+ # TODO: Support `cachekv_quant`
+ raise NotImplementedError
+
+ def forward(
+ self,
+ hidden_states,
+ position_embeddings,
+ past_key_value: Optional[Tuple[paddle.Tensor]] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ attn_mask_start_row_indices: Optional[paddle.Tensor] = None,
+ position_ids: Optional[Tuple[paddle.Tensor]] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ token_type_ids: Optional[Tuple[paddle.Tensor]] = None, # MLLM
+ ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]:
+ """Compute attention outputs.
+
+ Args:
+ hidden_states (paddle.Tensor): Input tensor [bsz, seq_len, hidden_size]
+ position_embeddings (paddle.Tensor): Position embeddings
+ past_key_value (Optional[Tuple[paddle.Tensor, paddle.Tensor]]): Cached key/value states
+ attention_mask (Optional[paddle.Tensor]): Attention mask tensor
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length attention indices
+ position_ids (Optional[paddle.Tensor]): Position indices for RoPE
+ output_attentions (bool): Return attention weights if True
+            use_cache (bool): Cache key/value states if True
+            token_type_ids (Optional[paddle.Tensor]): Token type ids used by the multimodal model (MLLM)
+
+ Returns:
+ Tuple containing:
+ - attention_output: [bsz, seq_len, hidden_size]
+ - attention_weights: Optional attention probabilities
+ - updated_key_value_cache: Optional updated cache
+ """
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids[:, :-1]
+ if self.config.sequence_parallel:
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids.reshape([-1])
+ token_type_ids = ScatterOp.apply(token_type_ids)
+ token_type_ids.stop_gradient = True
+ max_sequence_length = self.config.max_sequence_length
+ bsz = (
+ hidden_states.shape[0]
+ * self.config.tensor_parallel_degree
+ // max_sequence_length
+ )
+ q_len = max_sequence_length
+ else:
+ bsz, q_len, _ = hidden_states.shape
+ query_states = key_states = value_states = mix_layer = None
+ mix_layer = self.qkv_proj(hidden_states)
+ if self.is_gqa:
+ query_states, key_states, value_states = paddle.split(
+ mix_layer.reshape([bsz, q_len, -1, self.head_dim]),
+ [self.num_heads, self.num_key_value_heads, self.num_key_value_heads],
+ axis=2,
+ )
+ mix_layer = None
+ else:
+ mix_layer = mix_layer.reshape(
+ [bsz, q_len, self.num_heads, 3 * self.head_dim]
+ )
+
+ if mix_layer is not None:
+ has_gradient = not mix_layer.stop_gradient
+ else:
+ has_gradient = not (
+ query_states.stop_gradient
+ and key_states.stop_gradient
+ and value_states.stop_gradient
+ )
+ if (
+ self.config.recompute
+ and self.config.recompute_granularity == "core_attn"
+ and has_gradient
+ ):
+ assert past_key_value is None, "do not use kv cache in recompute"
+ assert not use_cache
+ attn_output, attn_weights, past_key_value = recompute(
+ self.rope_attn,
+ mix_layer,
+ query_states,
+ key_states,
+ value_states,
+ position_embeddings,
+ attention_mask,
+ position_ids,
+ output_attentions,
+ past_key_value,
+ use_cache,
+ attn_mask_start_row_indices,
+ use_reentrant=self.config.recompute_use_reentrant,
+ )
+ else:
+ attn_output, attn_weights, past_key_value = self.rope_attn(
+ mix_layer=mix_layer,
+ query_states=query_states,
+ key_states=key_states,
+ value_states=value_states,
+ position_embeddings=position_embeddings,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ past_key_value=past_key_value,
+ use_cache=use_cache,
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
+ )
+ if self.config.sequence_parallel:
+ attn_output = attn_output.reshape([-1, attn_output.shape[-1]])
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_value
+
+ def _flash_attention_wrapper(
+ self,
+ q,
+ k,
+ v,
+ attention_mask=None,
+ attn_mask_start_row_indices=None,
+ seq_length=None,
+ ):
+ """Optimized flash attention implementation.
+
+ Args:
+ q (paddle.Tensor): Query tensor
+ k (paddle.Tensor): Key tensor
+ v (paddle.Tensor): Value tensor
+ attention_mask (Optional[paddle.Tensor]): Attention mask
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length indices
+ seq_length (Optional[int]): Sequence length
+
+ Returns:
+ paddle.Tensor: Attention output tensor
+ """
+ return fusion_flash_attention(
+ q,
+ k,
+ v,
+ self.training,
+ self.config.attention_probs_dropout_prob,
+ self.config.use_sparse_flash_attn,
+ attention_mask,
+ attn_mask_start_row_indices,
+ seq_length,
+ self.config.use_var_len_flash_attn,
+ self._rr_flash_attn if self.training else None,
+ )
+
+ def core_attn(
+ self,
+ q,
+ k,
+ v,
+ attention_mask=None,
+ attn_mask_start_row_indices=None,
+ seq_length=None,
+ ):
+ """Standard self-attention implementation.
+
+ Args:
+ q (paddle.Tensor): Query tensor
+ k (paddle.Tensor): Key tensor
+ v (paddle.Tensor): Value tensor
+ attention_mask (Optional[paddle.Tensor]): Attention mask
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length indices
+ seq_length (Optional[int]): Sequence length
+
+ Returns:
+ Tuple[paddle.Tensor, paddle.Tensor]: Attention output and weights
+ """
+ perm = [
+ 0,
+ 2,
+ 1,
+ 3,
+ ] # [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3]
+ origin_dtype = q.dtype
+
+ q = tensor.transpose(x=q, perm=perm)
+ k = tensor.transpose(x=k, perm=perm)
+ v = tensor.transpose(x=v, perm=perm)
+
+ replicate = self.config.num_attention_heads // self.config.num_key_value_heads
+ k = paddle.repeat_interleave(k, replicate, axis=1)
+ v = paddle.repeat_interleave(v, replicate, axis=1)
+
+ scale_qk_coeff = self.config.scale_qk_coeff * self.head_dim**0.5
+ product = paddle.matmul(x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True)
+
+ product = product.cast(paddle.float32)
+ if self.config.scale_qk_coeff != 1.0:
+ product = product.scale(self.config.scale_qk_coeff)
+
+ if attention_mask is not None:
+ attention_mask = attention_mask.cast(paddle.float32)
+ if self.config.fuse_softmax_mask:
+ weights = incubate.softmax_mask_fuse(product, attention_mask)
+ else:
+ product = product + attention_mask
+ weights = F.softmax(product)
+ else:
+ weights = incubate.softmax_mask_fuse_upper_triangle(product)
+
+ weights = weights.cast(origin_dtype)
+
+ if self.config.attention_probs_dropout_prob:
+ with get_rng_state_tracker().rng_state("local_seed"):
+ weights = F.dropout(
+ weights,
+ self.config.attention_probs_dropout_prob,
+ training=self.training,
+ mode="upscale_in_train",
+ )
+
+ out = paddle.matmul(weights, v)
+
+ # combine heads
+ out = tensor.transpose(out, perm=[0, 2, 1, 3])
+ # If sequence_parallel is true, out shape is [s, b, h] after reshape
+ # else out shape is [b, s, h]
+ out = tensor.reshape(x=out, shape=[0, 0, -1])
+
+ return out, weights
+
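+    # Editorial note on core_attn above: q is pre-scaled by 1 / (scale_qk_coeff * sqrt(head_dim))
+    # before the matmul and the float32 product is re-scaled by scale_qk_coeff, so the net
+    # scaling is the usual 1 / sqrt(head_dim) while keeping the low-precision q @ k^T matmul
+    # better conditioned; k and v are repeat_interleave'd so the GQA key/value heads match the
+    # query head count.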
+ def rope_attn(
+ self,
+ mix_layer,
+ query_states,
+ key_states,
+ value_states,
+ position_embeddings,
+ attention_mask,
+ position_ids,
+ output_attentions=False,
+ past_key_value=None,
+ use_cache=False,
+ attn_mask_start_row_indices=None,
+ ):
+ if mix_layer is not None:
+ query_states, key_states, value_states = paddle.split(mix_layer, 3, axis=-1)
+ query_states_dtype = query_states.dtype
+
+ kv_seq_len = position_ids.max() + 1
+ offset = 0
+ if past_key_value is not None:
+ # LLM
+ offset = past_key_value[0].shape[-3]
+ kv_seq_len += offset
+
+ query_states = query_states.astype(query_states_dtype)
+ key_states = key_states.astype(query_states_dtype)
+
+ if position_ids.dim() == 3 and position_ids.shape[0] > 1:
+ position_ids = position_ids[0:1]
+
+ cos, sin = position_embeddings
+ query_states, key_states = _apply_multimodal_rotary_pos_emb(
+ query_states, key_states, cos, sin, self.rope_scaling["mrope_section"], 2
+ )
+
+ if past_key_value is not None:
+ # reuse k, v, self_attention
+ key_states = paddle.concat([past_key_value[0], key_states], axis=1)
+ value_states = paddle.concat([past_key_value[1], value_states], axis=1)
+
+ # NOTE(for generation): use list instead of tuple to store the cache
+ # tensors, so that we can clear the cache tensors for memory efficiency.
+ past_key_value = [key_states, value_states] if use_cache else None
+ seq_length = query_states.shape[1]
+ attn_output, attn_weights = self.attn_func(
+ query_states,
+ key_states,
+ value_states,
+ attention_mask,
+ attn_mask_start_row_indices,
+ seq_length,
+ )
+ return attn_output, attn_weights, past_key_value
+
+
+class FusedHeadParallelCrossEntropy(PyLayer):
+ """Fused parallel cross-entropy loss computation for large sequence lengths.
+
+ Combines head projection and loss computation with optimized memory usage for long sequences,
+ supporting tensor parallel training.
+ """
+
+ @staticmethod
+ def forward(
+ ctx,
+ hidden_states,
+ weight,
+ bias,
+ labels,
+ tensor_parallel_degree,
+ mp_group=None,
+ ignore_index=-100,
+ seq_chunk_size=8192,
+ transpose_y=False,
+ fuse_linear=False,
+ training=True,
+ ):
+ """Forward pass for parallel cross-entropy computation.
+
+ Args:
+ ctx: Context object for saving tensors between forward/backward
+ hidden_states (paddle.Tensor): Input tensor of shape [batch_size*seq_len, hidden_size]
+ weight (paddle.Tensor): Weight matrix for projection
+ bias (Optional[paddle.Tensor]): Optional bias vector
+ labels (paddle.Tensor): Target labels tensor of shape [batch_size*seq_len]
+ tensor_parallel_degree (int): Degree of tensor parallelism
+ mp_group (Optional[dist.Group]): Model parallel group. Defaults to None (auto-detect)
+ ignore_index (int): Index to ignore in loss computation. Defaults to -100
+ seq_chunk_size (int): Chunk size for processing long sequences. Defaults to 8192
+ transpose_y (bool): Whether to transpose weight matrix. Defaults to False
+ fuse_linear (bool): Whether to use fused linear ops. Defaults to False
+ training (bool): Whether in training mode. Defaults to True
+
+ Returns:
+ Tuple[paddle.Tensor, paddle.Tensor]:
+ - loss: Computed loss tensor
+ - gathered_labels: Concatenated labels from all parallel groups
+ """
+
+ ctx.tensor_parallel_degree = tensor_parallel_degree
+ ctx.ignore_index = ignore_index
+ ctx.seq_chunk_size = seq_chunk_size
+ ctx.transpose_y = transpose_y
+ ctx.fuse_linear = fuse_linear
+ ctx.training = training
+
+ ctx.hidden_states_shape = hidden_states.shape
+
+ ctx.mp_group = (
+ fleet.get_hybrid_communicate_group().get_model_parallel_group()
+ if mp_group is None
+ else mp_group
+ )
+ ctx.rank = ctx.mp_group.rank
+ ctx.world_size = ctx.mp_group.nranks
+
+ loss_all = []
+ labels_all = []
+ with paddle.no_grad():
+ labels = labels.reshape_([-1])
+ hidden_states = hidden_states.reshape_([-1, hidden_states.shape[-1]])
+
+ num_tokens_per_rank = []
+ dist.stream.all_gather(
+ num_tokens_per_rank,
+ paddle.to_tensor(hidden_states.shape[0], dtype=paddle.int32),
+ group=ctx.mp_group,
+ )
+ ctx.num_tokens_per_rank = num_tokens_per_rank
+
+ for idx in range(ctx.world_size):
+ if idx == ctx.rank:
+ hidden_states_recv = hidden_states
+ labels_recv = labels
+ else:
+ hidden_states_recv = paddle.empty(
+ [ctx.num_tokens_per_rank[idx], hidden_states.shape[-1]],
+ dtype=hidden_states.dtype,
+ )
+ labels_recv = paddle.empty(
+ [ctx.num_tokens_per_rank[idx]], dtype=labels.dtype
+ )
+
+ dist.stream.broadcast(
+ hidden_states_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group
+ )
+ dist.stream.broadcast(
+ labels_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group
+ )
+
+ seq_len = hidden_states_recv.shape[0]
+ num_chunk = (seq_len + ctx.seq_chunk_size - 1) // ctx.seq_chunk_size
+
+ loss_chunk = []
+ for chunk_idx in range(num_chunk):
+ start = chunk_idx * ctx.seq_chunk_size
+ end = min(start + ctx.seq_chunk_size, seq_len)
+ hidden_states_chunk = hidden_states_recv._slice(start, end)
+ labels_chunk = labels_recv._slice(start, end)
+
+ logits = parallel_matmul(
+ hidden_states_chunk,
+ weight,
+ bias=bias,
+ transpose_y=ctx.transpose_y,
+ tensor_parallel_degree=ctx.tensor_parallel_degree,
+ tensor_parallel_output=True,
+ fuse_linear=ctx.fuse_linear,
+ training=ctx.training,
+ )
+
+ with paddle.amp.auto_cast(False):
+ loss = mp_ops._c_softmax_with_cross_entropy(
+ logits.cast("float32"),
+ labels_chunk.unsqueeze(-1),
+ group=ctx.mp_group,
+ ignore_index=ctx.ignore_index,
+ )
+ loss_chunk.append(loss)
+ loss_all.append(paddle.concat(loss_chunk, axis=0))
+ labels_all.append(labels_recv)
+
+ ctx.loss_concat_sections = [loss.shape[0] for loss in loss_all]
+ loss_all = paddle.concat(loss_all, axis=0)
+ labels_all = paddle.concat(labels_all, axis=0)
+
+ tensor_inputs = [hidden_states, weight, bias, labels]
+ ctx.save_for_backward(*tensor_inputs)
+
+ return loss_all, labels_all
+
+ @staticmethod
+ def backward(ctx, loss_all_grad, labels_all_grad):
+ """Backward pass for parallel cross-entropy computation.
+
+ Args:
+ ctx: Context object with saved tensors from forward
+ loss_all_grad (paddle.Tensor): Gradient of loss
+ labels_all_grad (paddle.Tensor): Gradient of labels (unused)
+
+ Returns:
+ Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[paddle.Tensor], None]:
+ - hidden_states_grad: Gradient for input hidden states
+ - weight_grad: Gradient for weight matrix (None if not trainable)
+ - bias_grad: Gradient for bias vector (None if not trainable or not provided)
+ - None: Placeholder for labels gradient
+ """
+
+ hidden_states, weight, bias, labels = ctx.saved_tensor()
+
+ loss_all_grad_list = paddle.split(
+ loss_all_grad, ctx.loss_concat_sections, axis=0
+ )
+
+ def detach_variable(inp):
+ if inp is None:
+ return None
+ x = inp.detach()
+ x.stop_gradient = inp.stop_gradient
+ return x
+
+ if weight.stop_gradient is False:
+ weight_main_grad = paddle.zeros(weight.shape, dtype=paddle.float32)
+ else:
+ weight_main_grad = None
+ if bias is not None and bias.stop_gradient is False:
+ bias_main_grad = paddle.zeros(bias.shape, dtype=paddle.float32)
+ else:
+ bias_main_grad = None
+
+ hidden_states = detach_variable(hidden_states)
+ weight = detach_variable(weight)
+ bias = detach_variable(bias)
+ labels = detach_variable(labels)
+
+ with paddle.base.dygraph.guard():
+ tracer = paddle.base.framework._dygraph_tracer()
+ tracer._has_grad = True
+
+ for idx in range(ctx.world_size):
+ if idx == ctx.rank:
+ hidden_states_recv = hidden_states
+ labels_recv = labels
+ else:
+ hidden_states_recv = paddle.empty(
+ [ctx.num_tokens_per_rank[idx], hidden_states.shape[-1]],
+ dtype=hidden_states.dtype,
+ )
+ labels_recv = paddle.empty(
+ [ctx.num_tokens_per_rank[idx]], dtype=labels.dtype
+ )
+
+ dist.stream.broadcast(
+ hidden_states_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group
+ )
+ dist.stream.broadcast(
+ labels_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group
+ )
+ hidden_states_recv.stop_gradient = False
+
+ seq_len = hidden_states_recv.shape[0]
+ num_chunk = (seq_len + ctx.seq_chunk_size - 1) // ctx.seq_chunk_size
+
+ for chunk_idx in range(num_chunk):
+ start = chunk_idx * ctx.seq_chunk_size
+ end = min(start + ctx.seq_chunk_size, seq_len)
+ hidden_states_chunk = hidden_states_recv.slice(
+ axes=[0], starts=[start], ends=[end]
+ )
+ labels_chunk = labels_recv._slice(start, end)
+ loss_grad_chunk = loss_all_grad_list[idx]._slice(start, end)
+
+ logits = parallel_matmul(
+ hidden_states_chunk,
+ weight,
+ bias=bias,
+ transpose_y=ctx.transpose_y,
+ tensor_parallel_degree=ctx.tensor_parallel_degree,
+ tensor_parallel_output=True,
+ fuse_linear=ctx.fuse_linear,
+ training=ctx.training,
+ )
+
+ with paddle.amp.auto_cast(False):
+ loss_chunk = mp_ops._c_softmax_with_cross_entropy(
+ logits.cast("float32"),
+ labels_chunk.unsqueeze(-1),
+ group=ctx.mp_group,
+ ignore_index=ctx.ignore_index,
+ )
+
+ with paddle.amp.auto_cast(enable=False):
+ paddle.autograd.backward(loss_chunk, loss_grad_chunk)
+
+ if weight_main_grad is not None:
+ weight_main_grad.add_(weight.grad.cast(paddle.float32))
+ weight.clear_gradient(True)
+ if bias_main_grad is not None:
+ bias_main_grad.add_(bias.grad.cast(paddle.float32))
+ bias.clear_gradient(True)
+
+ if idx == ctx.rank:
+ hidden_states_grad = hidden_states_recv.grad
+ hidden_states_grad = hidden_states_grad.reshape(
+ ctx.hidden_states_shape
+ )
+
+ if weight_main_grad is not None:
+ weight_main_grad = weight_main_grad.astype(weight.dtype)
+ if bias_main_grad is not None:
+ bias_main_grad = bias_main_grad.astype(bias.dtype)
+
+ return (
+ hidden_states_grad,
+ weight_main_grad,
+ bias_main_grad,
+ None,
+ )
+
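+# Editorial sketch of FusedHeadParallelCrossEntropy above (a summary, not a specification):
+# in forward, every tensor-parallel rank broadcasts its own tokens and labels to the group,
+# each rank projects the tokens chunk by chunk (`seq_chunk_size`) with `parallel_matmul` and
+# evaluates `_c_softmax_with_cross_entropy` over its vocab shard; backward replays the same
+# chunks, accumulating weight/bias gradients in float32 and returning the hidden-state
+# gradient only for the rank's own tokens.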
+
+class ErniePretrainingCriterion(paddle.nn.Layer):
+ """Criterion for ERNIE pretraining task."""
+
+ def __init__(self, config, return_tuple=True):
+ """Initialize the pretraining criterion.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ return_tuple (bool): Whether to return loss as tuple (loss, loss_sum). Defaults to True.
+ """
+ super(ErniePretrainingCriterion, self).__init__()
+ self.ignored_index = getattr(config, "ignored_index", -100)
+ self.config = config
+ self.return_tuple = return_tuple
+ self.enable_parallel_cross_entropy = (
+ config.tensor_parallel_degree > 1 and config.tensor_parallel_output
+ )
+
+ if (
+ self.enable_parallel_cross_entropy
+ ): # and False: # and lm_head is distributed
+ logging.info("using parallel cross entroy, take care")
+ self.loss_func = ParallelCrossEntropy()
+ else:
+ self.loss_func = paddle.nn.CrossEntropyLoss(
+ reduction="none",
+ )
+ self.token_balance_loss = config.token_balance_loss
+
+ def forward(self, prediction_scores, masked_lm_labels, loss_mask=None):
+ """Compute the pretraining loss.
+
+ Args:
+ prediction_scores (Union[paddle.Tensor, Tuple[paddle.Tensor, ...]]):
+ Either:
+ - Direct logits tensor [batch_size, seq_len, vocab_size]
+ - Tuple of (hidden_states, weight, bias) for sparse head computation
+ masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+ loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens. Defaults to None.
+
+ Returns:
+ Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
+ - If return_tuple=False: Single loss tensor
+ - If return_tuple=True: Tuple of (normalized_loss, sum_loss)
+ """
+
+ if self.config.use_sparse_head_and_loss_fn:
+ hidden_states, outlinear_weight, outlinear_bias = prediction_scores[:3]
+
+ if self.config.sequence_parallel:
+ masked_lm_labels, sparse_label_idx = (
+ sequence_parallel_sparse_mask_labels(
+ masked_lm_labels, self.ignored_index
+ )
+ )
+ sparse_label_idx = sparse_label_idx.reshape([-1, 1])
+ hidden_states = paddle.gather(hidden_states, sparse_label_idx, axis=0)
+ hidden_states = AllGatherVarlenOp.apply(hidden_states)
+ else:
+ masked_lm_labels = masked_lm_labels.flatten()
+ sparse_label_idx = paddle.nonzero(
+ masked_lm_labels != self.ignored_index
+ ).flatten()
+ masked_lm_labels = paddle.take_along_axis(
+ masked_lm_labels, sparse_label_idx, axis=0
+ )
+
+ hidden_states = hidden_states.reshape([-1, hidden_states.shape[-1]])
+ hidden_states = paddle.take_along_axis(
+ hidden_states, sparse_label_idx.reshape([-1, 1]), axis=0
+ )
+
+            # `loss_mask` must be reset to None and recalculated in ErnieBotPretrainingCriterion
+            # when use_sparse_head_and_loss_fn is used.
+ loss_mask = None
+ if self.config.use_recompute_loss_fn:
+ offload_kwargs = {}
+ if self.config.get("offload_lm_head", False):
+ offload_kwargs["offload_indices"] = [1]
+ res = recompute(
+ self.forward_impl_with_calc_logits,
+ masked_lm_labels,
+ loss_mask,
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ **offload_kwargs,
+ )
+ else:
+ logits = calc_lm_head_logits(
+ self.config,
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ training=self.training,
+ )
+ res = self.forward_impl(logits, masked_lm_labels, loss_mask)
+ elif self.config.use_recompute_loss_fn:
+ if self.config.use_fused_head_and_loss_fn:
+ res = self.forward_impl_with_fused_head_loss_fn(
+ masked_lm_labels, loss_mask, *prediction_scores
+ )
+ else:
+ assert isinstance(prediction_scores, tuple) and len(
+ prediction_scores
+ ) in [3, 4], prediction_scores
+ res = recompute(
+ self.forward_impl_with_calc_logits,
+ masked_lm_labels,
+ loss_mask,
+ *prediction_scores,
+ )
+ else:
+ res = self.forward_impl(prediction_scores, masked_lm_labels, loss_mask)
+
+ return res
+
+ def forward_impl_with_fused_head_loss_fn(
+ self,
+ masked_lm_labels,
+ loss_mask,
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ ):
+ """Compute loss with fused head and parallel cross-entropy.
+
+ Args:
+ masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+ loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens
+ hidden_states (paddle.Tensor): Hidden states from transformer [batch_size, seq_len, hidden_size]
+ outlinear_weight (paddle.Tensor): Weight matrix for output projection
+ outlinear_bias (Optional[paddle.Tensor]): Optional bias for output projection
+
+ Returns:
+ Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
+ Same return format as forward()
+ """
+ assert (
+ self.config.tensor_parallel_degree > 0
+ ), "use_fused_head_and_loss_fn require tensor_parallel_degree > 0"
+ masked_lm_loss, masked_lm_labels_all = FusedHeadParallelCrossEntropy.apply(
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ masked_lm_labels,
+ self.config.tensor_parallel_degree,
+ ignore_index=self.ignored_index,
+ seq_chunk_size=self.config.get("loss_subbatch_seqlen", 32768),
+ transpose_y=self.config.tie_word_embeddings,
+ fuse_linear=self.config.fuse_linear,
+ training=self.training,
+ )
+ if loss_mask is None:
+ loss_mask = masked_lm_labels_all != self.ignored_index
+ if (~loss_mask).all(): # empty span
+ logging.warning(
+ f"encounter empty span when calculate loss, ignored_index={self.ignored_index}"
+ )
+ loss = paddle.mean(masked_lm_loss) * 0.0
+ loss_sum = masked_lm_loss.sum().detach()
+ else:
+ loss_mask = loss_mask.reshape([-1]).cast(paddle.float32)
+            # element-wise alignment, full-precision aggregation
+ masked_lm_loss = paddle.sum(
+ masked_lm_loss.cast(paddle.float32).reshape([-1]) * loss_mask
+ )
+ loss = masked_lm_loss / loss_mask.sum()
+ if self.token_balance_loss:
+ _loss = masked_lm_loss / self.config.token_balance_seqlen
+                loss = _loss - _loss.detach() + loss.detach()  # keep the loss curve aligned
+ loss_sum = masked_lm_loss.sum().detach()
+ if not self.return_tuple: # only used in pp
+ if self.training:
+ return loss
+ return loss_sum
+ return loss, loss_sum
+
+ def forward_impl_with_calc_logits(
+ self,
+ masked_lm_labels,
+ loss_mask,
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ ):
+ """Compute logits then calculate loss.
+
+ Args:
+ Same as forward_impl_with_fused_head_loss_fn()
+
+ Returns:
+ Same return format as forward()
+ """
+
+ logits = calc_lm_head_logits(
+ self.config,
+ hidden_states,
+ outlinear_weight,
+ outlinear_bias,
+ training=self.training,
+ )
+
+ return self.forward_impl(logits, masked_lm_labels, loss_mask)
+
+ def loss_impl(self, prediction_scores, masked_lm_labels):
+ """Core loss computation without reduction.
+
+ Args:
+ prediction_scores (paddle.Tensor): Logits tensor [batch_size, seq_len, vocab_size]
+ masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+
+ Returns:
+ paddle.Tensor: Unreduced loss tensor
+ """
+ prediction_scores = prediction_scores.cast("float32")
+ masked_lm_loss = self.loss_func(
+ prediction_scores, masked_lm_labels.unsqueeze(-1)
+ )
+ return masked_lm_loss
+
+ def forward_impl(self, prediction_scores, masked_lm_labels, loss_mask=None):
+ """Standard loss computation with reduction and masking.
+
+ Args:
+ prediction_scores (paddle.Tensor): Logits tensor [batch_size, seq_len, vocab_size]
+ masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+ loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens
+
+ Returns:
+ Same return format as forward()
+ """
+ if self.enable_parallel_cross_entropy:
+ assert prediction_scores.shape[-1] != self.config.vocab_size, (
+ f"enable_parallel_cross_entropy, the vocab_size should be splited:"
+ f" {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+ )
+
+ with paddle.amp.auto_cast(False):
+ prediction_scores_dims = len(prediction_scores.shape)
+ if prediction_scores_dims == 2 and prediction_scores.shape[
+ 0
+ ] > self.config.get("loss_subbatch_seqlen", 32768):
+ sb_loss_func = subbatch(
+ self.loss_impl,
+ [0, 1],
+ [0, 0],
+ self.config.get("loss_subbatch_seqlen", 32768),
+ 0,
+ )
+ masked_lm_loss = sb_loss_func(prediction_scores, masked_lm_labels)
+ elif prediction_scores_dims == 3 and prediction_scores.shape[
+ 1
+ ] > self.config.get("loss_subbatch_seqlen", 32768):
+ sb_loss_func = subbatch(
+ self.loss_impl,
+ [0, 1],
+ [1, 1],
+ self.config.get("loss_subbatch_seqlen", 32768),
+ 1,
+ )
+ masked_lm_loss = sb_loss_func(prediction_scores, masked_lm_labels)
+ else:
+ masked_lm_loss = self.loss_impl(prediction_scores, masked_lm_labels)
+
+ if loss_mask is None:
+ loss_mask = masked_lm_labels != self.ignored_index
+
+ lossmask = masked_lm_labels != self.ignored_index
+ if (~lossmask).all(): # empty span
+ logging.warning(
+ f"encounter empty span when calculate loss, ignored_index={self.ignored_index}"
+ )
+ loss = paddle.mean(masked_lm_loss) * 0.0
+ loss_sum = masked_lm_loss.sum().detach()
+ else:
+ loss_mask = loss_mask.reshape([-1]).cast(paddle.float32)
+            # element-wise alignment, full-precision aggregation
+ masked_lm_loss = paddle.sum(
+ masked_lm_loss.cast(paddle.float32).reshape([-1]) * loss_mask
+ )
+ loss = masked_lm_loss / loss_mask.sum()
+ if self.token_balance_loss:
+ _loss = masked_lm_loss / self.config.token_balance_seqlen
+                loss = _loss - _loss.detach() + loss.detach()  # keep the loss curve aligned
+ loss_sum = masked_lm_loss.sum().detach()
+ if not self.return_tuple: # only used in pp
+ if self.training:
+ return loss
+ return loss_sum
+ return loss, loss_sum
+
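+# Editorial note on forward_impl above: the `subbatch` fallback is taken only when the token
+# count exceeds `loss_subbatch_seqlen` (default 32768 in the code above); it chunks the
+# logits and labels along the sequence axis so the float32 cast and softmax cross-entropy in
+# `loss_impl` are computed per chunk rather than over all tokens at once.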
+
+class Ernie4_5LMHead(nn.Layer):
+ """Language model head for ERNIE with support for tensor parallelism."""
+
+ def __init__(self, config):
+ """Initialize the language model head.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration containing:
+ - vocab_size: Size of vocabulary
+ - hidden_size: Dimension of hidden states
+ - tensor_parallel_degree: Degree of tensor parallelism
+ - tie_word_embeddings: Whether to tie input/output embeddings
+ - weight_share_add_bias: Whether to add bias when weight sharing
+ - use_bias: Whether to use bias term
+ - use_recompute_loss_fn: Whether to defer logits computation to loss function
+ - use_sparse_head_and_loss_fn: Whether to use sparse head computation
+ """
+
+ super(Ernie4_5LMHead, self).__init__()
+ self.config = config
+ if config.tensor_parallel_degree > 1:
+ vocab_size = config.vocab_size // config.tensor_parallel_degree
+ else:
+ vocab_size = config.vocab_size
+
+ self.weight = self.create_parameter(
+ shape=(
+ [vocab_size, config.hidden_size]
+ if config.tie_word_embeddings
+ else [config.hidden_size, vocab_size]
+ ),
+ dtype=paddle.get_default_dtype(),
+ )
+ logging.info(
+ f"output-weight:{self.weight.shape} config.tie_word_embeddings={config.tie_word_embeddings}"
+ )
+ if config.weight_share_add_bias and config.use_bias:
+ self.bias = self.create_parameter(
+ shape=[vocab_size],
+ dtype=paddle.get_default_dtype(),
+ attr=paddle.ParamAttr(
+ initializer=paddle.nn.initializer.constant.Constant(0.0)
+ ),
+ )
+ else:
+ self.bias = None
+
+        # Must set distributed attr for Tensor Parallel!
+        self.weight.is_distributed = vocab_size != config.vocab_size
+        if config.weight_share_add_bias and config.use_bias:
+            self.bias.is_distributed = vocab_size != config.vocab_size
+
+ if self.weight.is_distributed:
+ self.weight.split_axis = 1
+ if (
+ config.weight_share_add_bias
+ and config.use_bias
+ and self.bias.is_distributed
+ ):
+ self.bias.split_axis = 0
+
+ if self.config.use_recompute_loss_fn:
+ logging.info(
+ "Using recompute_loss_fn, the calculation of logits will be moved into "
+ "loss_fn for memory optimization"
+ )
+
+ def forward(self, hidden_states, tensor_parallel_output=None):
+ """Project hidden states to vocabulary logits.
+
+ Args:
+ hidden_states (paddle.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size]
+ tensor_parallel_output (Optional[bool]): Whether to output parallel results. Defaults to None.
+
+        Returns:
+            Union[Tuple[paddle.Tensor, paddle.Tensor, Optional[paddle.Tensor], bool], paddle.Tensor]:
+                - When use_recompute_loss_fn or use_sparse_head_and_loss_fn: a tuple of
+                  (hidden_states, weight, bias, tie_word_embeddings), deferring the logits
+                  computation to the loss function.
+                - Otherwise: logits tensor of shape [batch_size, seq_len, vocab_size].
+ """
+ # will enter this branch when:
+ # 1. use_recompute_loss_fn or use_sparse_head_and_loss_fn
+ # 2. dpo training
+ if self.config.use_recompute_loss_fn or self.config.use_sparse_head_and_loss_fn:
+ return (
+ hidden_states,
+ self.weight,
+ self.bias,
+ self.config.tie_word_embeddings,
+ )
+
+ return calc_lm_head_logits(
+ self.config,
+ hidden_states,
+ self.weight,
+ self.bias,
+ tensor_parallel_output,
+ training=self.training,
+ )
+
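+# Editorial note: ErniePretrainingCriterion above consumes the deferred-head tuple returned by
+# this LMHead when use_recompute_loss_fn / use_sparse_head_and_loss_fn is set, either
+# recomputing the logits inside the loss (optionally offloading the weight when
+# `offload_lm_head` is set) or routing into FusedHeadParallelCrossEntropy, so the full
+# [tokens, vocab_size] logits never have to be stored alongside the activations.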
+
+class Ernie4_5DecoderLayer(nn.Layer):
+ """A single transformer decoder layer in ERNIE model.
+
+ Contains self-attention and feed-forward components,
+    with residual connections and layer normalization.
+ """
+
+ def __init__(self, config, layer_idx):
+ """Initialize the decoder layer.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ layer_idx (int): Index of this layer in the transformer stack
+ """
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.layer_idx = layer_idx
+ self.config = config
+
+ self.self_attn = Ernie4_5Attention(config, layer_idx)
+ self.mlp = Ernie4_5MLP(config)
+
+ Norm = RMSNorm if config.use_rmsnorm else LayerNorm
+
+ self.input_layernorm = Norm(config)
+ self.post_attention_layernorm = Norm(config)
+
+ self.residual_add1 = FusedDropoutImpl(
+ config.hidden_dropout_prob, mode="upscale_in_train"
+ )
+ self.residual_add2 = FusedDropoutImpl(
+ config.hidden_dropout_prob, mode="upscale_in_train"
+ )
+
+ if config.sequence_parallel:
+ mark_as_sequence_parallel_parameter(self.post_attention_layernorm.weight)
+ if not hasattr(config, "disable_ffn_model_parallel"):
+ mark_as_sequence_parallel_parameter(self.input_layernorm.weight)
+ if config.use_bias:
+ mark_as_sequence_parallel_parameter(self.self_attn.o_proj.bias)
+ mark_as_sequence_parallel_parameter(self.mlp.down_proj.bias)
+
+ if not config.use_rmsnorm and config.use_bias:
+ mark_as_sequence_parallel_parameter(self.post_attention_layernorm.bias)
+ mark_as_sequence_parallel_parameter(self.input_layernorm.bias)
+
+ def forward(
+ self,
+ hidden_states: paddle.Tensor,
+ position_embeddings: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ attn_mask_start_row_indices: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ token_type_ids: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ past_key_value: Optional[Tuple[paddle.Tensor]] = None,
+ use_cache: Optional[bool] = False,
+ ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]:
+ """Forward pass through the decoder layer.
+
+ Args:
+ hidden_states (paddle.Tensor): Input tensor [batch_size, seq_len, hidden_size]
+ position_embeddings (paddle.Tensor): Position embeddings
+ attention_mask (Optional[paddle.Tensor]): Attention mask tensor
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Indices for variable length attention
+ position_ids (Optional[paddle.Tensor]): Position indices for rotary embeddings
+ output_attentions (Optional[bool]): Whether to return attention weights
+ past_key_value (Optional[Tuple[paddle.Tensor]]): Cached key/value states
+ use_cache (Optional[bool]): Whether to cache key/value states
+
+ Returns:
+ Union: Various output combinations depending on arguments:
+ - Base case: Hidden states tensor
+ - With attention: Tuple of (hidden_states, attention_weights)
+ - With cache: Tuple of (hidden_states, cached_key_value)
+ """
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+ # Self Attention
+ has_gradient = not hidden_states.stop_gradient
+ if (
+ self.config.recompute
+ and self.config.recompute_granularity == "full_attn"
+ and has_gradient
+ ):
+ hidden_states, self_attn_weights, present_key_value = recompute(
+ self.self_attn,
+ hidden_states,
+ position_embeddings,
+ past_key_value,
+ attention_mask,
+ attn_mask_start_row_indices,
+ position_ids,
+ output_attentions,
+ use_cache,
+ use_reentrant=self.config.recompute_use_reentrant,
+ )
+ else:
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ position_embeddings=position_embeddings,
+ past_key_value=past_key_value,
+ attention_mask=attention_mask,
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
+ position_ids=position_ids,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ token_type_ids=token_type_ids,
+ )
+
+ with self.model_parallel_dropout():
+ hidden_states = self.residual_add1(hidden_states, residual)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+
+ with self.model_parallel_dropout():
+ hidden_states = self.residual_add2(hidden_states, residual)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ # remove empty tuple for pipeline parallel
+ if type(outputs) is tuple and len(outputs) == 1:
+ outputs = outputs[0]
+ return outputs
+
+ def model_parallel_dropout(self):
+ """Get context manager for model-parallel dropout with proper seed control.
+
+ Returns:
+ Context manager for dropout operation
+ """
+ if (
+ self.config.tensor_parallel_degree > 1
+ and self.config.hidden_dropout_prob > 0.0
+ ):
+ current_seed = (
+ "local_seed" if self.config.sequence_parallel else "global_seed"
+ )
+ return get_rng_state_tracker().rng_state(current_seed)
+ return contextlib.nullcontext()
+
+
+class Ernie4_5PretrainedModel(PretrainedModel):
+ """Base class for ERNIE pretrained models."""
+
+ config_class = PaddleOCRVLConfig
+ base_model_prefix = "ernie"
+
+ @classmethod
+ def _get_tensor_parallel_mappings(cls, config, is_split=True):
+ """Generate tensor parallel mappings for model conversion.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ is_split (bool): Whether to generate split mappings (True)
+ or merge mappings (False). Defaults to True.
+
+ Returns:
+ Dict[str, Callable[[Any], Any]]: Dictionary mapping parameter names
+ to their corresponding split/merge functions for tensor parallelism.
+ """
+
+ from ..conversion_utils import split_or_merge_func
+
+ fn = split_or_merge_func(
+ is_split=is_split,
+ tensor_parallel_degree=config.tensor_parallel_degree,
+ tensor_parallel_rank=config.tensor_parallel_rank,
+ num_attention_heads=config.num_attention_heads,
+ )
+
+ def gqa_qkv_split_func(
+ weight,
+ tensor_parallel_degree,
+ tensor_parallel_rank,
+ num_attention_heads,
+ num_key_value_heads,
+ head_dim,
+ is_quant=False,
+ is_split=True,
+ ):
+ if is_quant:
+ weight = weight.T
+
+ def get_shape(tensor):
+ return (
+ tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape
+ )
+
+ def slice_tensor(tensor, start, end):
+ shape = get_shape(tensor)
+ if len(shape) == 1:
+ return tensor[start:end]
+ else:
+ return tensor[..., start:end]
+
+ q_end = num_attention_heads * head_dim
+ k_end = q_end + num_key_value_heads * head_dim
+ v_end = k_end + num_key_value_heads * head_dim
+
+ q = slice_tensor(weight, 0, q_end)
+ k = slice_tensor(weight, q_end, k_end)
+ v = slice_tensor(weight, k_end, v_end)
+
+ def split_tensor(tensor, degree):
+ shape = get_shape(tensor)
+ size = shape[-1]
+ block_size = size // degree
+ if hasattr(tensor, "get_shape"):
+ return [
+ slice_tensor(tensor, i * block_size, (i + 1) * block_size)
+ for i in range(degree)
+ ]
+ else:
+ return np.split(tensor, degree, axis=-1)
+
+ q_list = split_tensor(q, tensor_parallel_degree)
+ k_list = split_tensor(k, tensor_parallel_degree)
+ v_list = split_tensor(v, tensor_parallel_degree)
+
+ if tensor_parallel_rank is None:
+ out = [
+ np.concatenate([q_i, k_i, v_i], axis=-1)
+ for q_i, k_i, v_i in zip(q_list, k_list, v_list)
+ ]
+ else:
+ out = np.concatenate(
+ [
+ q_list[tensor_parallel_rank],
+ k_list[tensor_parallel_rank],
+ v_list[tensor_parallel_rank],
+ ],
+ axis=-1,
+ )
+ if is_quant:
+ out = out.T
+ return out
+
+ def gqa_qkv_merge_func(
+ weight_list,
+ num_attention_heads,
+ num_key_value_heads,
+ head_dim,
+ is_quant=False,
+ is_split=False,
+ ):
+ tensor_parallel_degree = len(weight_list)
+ num_attention_heads = num_attention_heads // tensor_parallel_degree
+ num_key_value_heads = num_key_value_heads // tensor_parallel_degree
+
+ is_paddle_tensor = not isinstance(weight_list[0], np.ndarray)
+
+ def get_shape(tensor):
+ return (
+ tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape
+ )
+
+ def slice_tensor(tensor, start, end):
+ if len(get_shape(tensor)) == 1:
+ return tensor[start:end]
+ else:
+ return tensor[..., start:end]
+
+ q_list, k_list, v_list = [], [], []
+
+ for weight in weight_list:
+ if is_quant:
+ weight = weight.T
+ q_end = num_attention_heads * head_dim
+ k_end = q_end + num_key_value_heads * head_dim
+ v_end = k_end + num_key_value_heads * head_dim
+
+ q = slice_tensor(weight, 0, q_end)
+ k = slice_tensor(weight, q_end, k_end)
+ v = slice_tensor(weight, k_end, v_end)
+
+ q_list.append(q)
+ k_list.append(k)
+ v_list.append(v)
+
+ merged = q_list + k_list + v_list
+
+ if is_paddle_tensor:
+ tensor = paddle.concat(merged, axis=-1)
+ if tensor.place.is_gpu_place():
+ tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False)
+
+ else:
+ tensor = np.concatenate(merged, axis=-1)
+ if is_quant:
+ tensor = tensor.T
+ return tensor
+
+ if (
+ config.num_key_value_heads is not None
+ and config.num_key_value_heads != config.num_attention_heads
+ ):
+ if is_split:
+ qkv_fn = partial(
+ gqa_qkv_split_func,
+ tensor_parallel_degree=config.tensor_parallel_degree,
+ tensor_parallel_rank=config.tensor_parallel_rank,
+ num_attention_heads=config.num_attention_heads,
+ num_key_value_heads=config.num_key_value_heads,
+ head_dim=(
+ config.hidden_size // config.num_attention_heads
+ if config.head_dim is None
+ else config.head_dim
+ ),
+ is_quant=False,
+ is_split=True,
+ )
+ else:
+ qkv_fn = partial(
+ gqa_qkv_merge_func,
+ num_attention_heads=config.num_attention_heads,
+ num_key_value_heads=config.num_key_value_heads,
+ head_dim=(
+ config.hidden_size // config.num_attention_heads
+ if config.head_dim is None
+ else config.head_dim
+ ),
+ is_quant=False,
+ is_split=False,
+ )
+ else:
+ qkv_fn = partial(fn, is_column=True)
+
+ def get_tensor_parallel_split_mappings(num_hidden_layers):
+ final_actions = {}
+
+ base_actions = {
+ # Column Linear
+ "layers.0.self_attn.qkv_proj.weight": qkv_fn,
+ "layers.0.mlp.up_gate_proj.weight": partial(
+ fn, is_column=True, is_naive_2fuse=True
+ ),
+ "lm_head.weight": partial(fn, is_column=not config.tie_word_embeddings),
+ # Row Linear
+ "embed_tokens.weight": partial(fn, is_column=False),
+ "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False),
+ "layers.0.mlp.down_proj.weight": partial(fn, is_column=False),
+ }
+
+ if config.use_bias:
+ base_actions.update(
+ {
+ # Column Linear
+ "layers.0.self_attn.qkv_proj.bias": qkv_fn,
+ "layers.0.mlp.up_gate_proj.bias": partial(
+ fn, is_column=True, is_naive_2fuse=True
+ ),
+ "layers.0.mlp.down_proj.bias": lambda x: x[
+ :
+ ], # convert PySafeSlice to ndarray.
+ "lm_head.bias": partial(fn, is_column=True),
+ }
+ )
+
+ for key, action in base_actions.items():
+ if "layers.0." in key:
+ for i in range(num_hidden_layers):
+ final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
+ else:
+ final_actions[key] = action
+ return final_actions
+
+ mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
+ return mappings
+
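+# Editorial example of the GQA qkv split above (numbers are illustrative, not from a real
+# config): with num_attention_heads=8, num_key_value_heads=2, head_dim=64 and
+# tensor_parallel_degree=2, the fused qkv weight's last axis of (8 + 2 + 2) * 64 columns is
+# cut into the q/k/v segments, each segment is split into 2 equal column blocks, and rank r
+# keeps the concatenation [q_r, k_r, v_r]; gqa_qkv_merge_func reverses this layout.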
+
+class Ernie4_5Model(Ernie4_5PretrainedModel):
+ """The core ERNIE transformer model"""
+
+ def __init__(self, config: PaddleOCRVLConfig):
+ """Initialize the ERNIE model architecture.
+
+ Args:
+ config (PaddleOCRVLConfig): Model configuration.
+ """
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.hidden_size = config.hidden_size
+ self.config = config
+
+ if config.tensor_parallel_degree > 1:
+ self.embed_tokens = VocabParallelEmbedding(
+ self.vocab_size,
+ self.hidden_size,
+ )
+ else:
+ self.embed_tokens = nn.Embedding(
+ self.vocab_size,
+ self.hidden_size,
+ )
+
+ self.layers = nn.LayerList(
+ [Ernie4_5DecoderLayer(config, i) for i in range(config.num_hidden_layers)]
+ )
+ Norm = RMSNorm if config.use_rmsnorm else LayerNorm
+ self.norm = Norm(config)
+ self.rotary_emb = KeyeRotaryEmbedding(config=config)
+
+ self.gradient_checkpointing = False
+
+ def get_input_embeddings(self):
+ """Get the input embedding layer.
+
+ Returns:
+ nn.Embedding: The embedding layer for input tokens
+ """
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ """Set new input embeddings.
+
+ Args:
+ value (nn.Embedding): New embedding layer to use
+ """
+ self.embed_tokens = value
+
+ @paddle.jit.not_to_static
+ def recompute_training(
+ self,
+ layer_module,
+ hidden_states,
+ position_embeddings,
+ attention_mask,
+ attn_mask_start_row_indices,
+ position_ids,
+ token_type_ids,
+ output_attentions,
+ past_key_value,
+ use_cache,
+ ):
+ """Perform gradient checkpointing for memory-efficient training.
+
+ Args:
+ layer_module (nn.Layer): Transformer layer to recompute
+ hidden_states (paddle.Tensor): Input hidden states
+ position_embeddings (paddle.Tensor): Position embeddings
+ attention_mask (paddle.Tensor): Attention mask
+ attn_mask_start_row_indices (paddle.Tensor): Variable length indices
+ position_ids (paddle.Tensor): Position indices
+ output_attentions (bool): Whether to output attention weights
+ past_key_value (Optional[Tuple[paddle.Tensor]]): Cached key/value states
+ use_cache (bool): Whether to cache key/value states
+
+ Returns:
+ paddle.Tensor: Output hidden states after recomputation
+ """
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+                # Ernie4_5DecoderLayer.forward does not accept `output_gate_logits`,
+                # so the inputs are forwarded unchanged.
+                return module(*inputs)
+
+ return custom_forward
+
+ hidden_states = recompute(
+ create_custom_forward(layer_module),
+ hidden_states,
+ position_embeddings,
+ attention_mask,
+ attn_mask_start_row_indices,
+ position_ids,
+ token_type_ids,
+ output_attentions,
+ past_key_value,
+ use_cache,
+ )
+ return hidden_states
+
+ def forward(
+ self,
+ input_ids=None,
+ position_ids=None,
+ token_type_ids=None,
+ attention_mask=None,
+ attn_mask_start_row_indices=None,
+ inputs_embeds=None,
+ use_cache=None,
+ past_key_values=None,
+ output_attentions=False,
+ output_hidden_states=None,
+ return_dict=False,
+ ):
+ """Forward pass through the ERNIE model.
+
+ Args:
+ input_ids (Optional[paddle.Tensor]): Input token IDs
+            position_ids (Optional[paddle.Tensor]): Position indices
+            token_type_ids (Optional[paddle.Tensor]): Token type ids for multimodal inputs
+ attention_mask (Optional[paddle.Tensor]): Attention mask
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length attention indices
+ inputs_embeds (Optional[paddle.Tensor]): Precomputed embeddings
+ use_cache (Optional[bool]): Whether to cache key/value states
+ past_key_values (Optional[Tuple[Tuple[paddle.Tensor]]]): Cached key/value states
+ output_attentions (Optional[bool]): Whether to output attention weights
+ output_hidden_states (Optional[bool]): Whether to output all hidden states
+ return_dict (Optional[bool]): Whether to return dict or tuple
+
+ Returns:
+ Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+ Various outputs depending on configuration, including:
+ - last_hidden_state: Final layer hidden states
+ - past_key_values: Cached key/value states if use_cache=True
+ - hidden_states: All hidden states if output_hidden_states=True
+ - attentions: Attention weights if output_attentions=True
+ """
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
+ )
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError(
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
+ )
+
+ if batch_size != 1:
+ raise NotImplementedError
+
+ layers = self.layers[: self.config.num_hidden_layers]
+
+ if past_key_values is None:
+ past_key_values = tuple([None] * len(layers))
+ kv_seq_len = 0
+ else:
+ kv_seq_len = past_key_values[0][0].shape[1]
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ inputs_embeds = inputs_embeds.astype(self.embed_tokens.weight.dtype)
+
+ if self.config.sequence_parallel:
+ inputs_embeds = inputs_embeds.reshape([-1, inputs_embeds.shape[-1]])
+ inputs_embeds = ScatterOp.apply(inputs_embeds)
+
+ hidden_states = inputs_embeds
+
+ if position_ids is None or position_ids.dim() == 2:
+ raise NotImplementedError
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+ if attention_mask is None:
+ raise NotImplementedError
+ causal_mask = self._update_causal_mask(
+ attention_mask.astype("int64"),
+ inputs_embeds,
+ past_key_values,
+ output_attentions,
+ )
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ past_key_value = (
+ past_key_values[idx] if past_key_values is not None else None
+ )
+ has_gradient = not hidden_states.stop_gradient
+ if (
+ self.config.recompute
+ and self.config.recompute_granularity == "full"
+ and has_gradient
+ ):
+ layer_outputs = self.recompute_training(
+ decoder_layer,
+ hidden_states,
+ position_embeddings,
+ causal_mask,
+ attn_mask_start_row_indices,
+ position_ids,
+ token_type_ids,
+ output_attentions,
+ past_key_value,
+ use_cache,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings,
+ causal_mask,
+ attn_mask_start_row_indices,
+ position_ids,
+ token_type_ids,
+ output_attentions,
+ past_key_value,
+ use_cache,
+ )
+
+ if isinstance(layer_outputs, (tuple, list)):
+ hidden_states = layer_outputs[0]
+ else:
+ hidden_states = layer_outputs
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_cache,
+ all_hidden_states,
+ all_self_attns,
+ ]
+ if v is not None
+ )
+
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=None,
+ )
+
+ def _update_causal_mask(
+ self,
+ attention_mask: paddle.Tensor,
+ input_tensor: paddle.Tensor,
+ past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]],
+ output_attentions: bool = False,
+ ):
+ past_seen_tokens = (
+ past_key_values[0][0].shape[1]
+ if past_key_values is not None and past_key_values[0] is not None
+ else 0
+ )
+
+ dtype = input_tensor.dtype
+ min_dtype = paddle.finfo(dtype).min
+ sequence_length = input_tensor.shape[1]
+ target_length = (
+ attention_mask.shape[-1]
+ if isinstance(attention_mask, paddle.Tensor)
+ else past_seen_tokens + sequence_length + 1
+ )
+ cache_position = paddle.arange(
+ past_seen_tokens, past_seen_tokens + sequence_length
+ )
+
+        # In case the provided `attention_mask` is 2-D, we generate a 4-D causal mask here.
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length=sequence_length,
+ target_length=target_length,
+ dtype=dtype,
+ cache_position=cache_position,
+ batch_size=input_tensor.shape[0],
+ )
+
+ return causal_mask
+
+ @staticmethod
+ def _prepare_4d_causal_attention_mask_with_cache_position(
+ attention_mask,
+ sequence_length: int,
+ target_length: int,
+ dtype,
+ cache_position,
+ batch_size: int,
+ ):
+ if attention_mask is not None and attention_mask.dim() == 4:
+ # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+ causal_mask = attention_mask
+ else:
+ min_dtype = paddle.finfo(dtype).min
+ causal_mask = paddle.full(
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype
+ )
+ diagonal_attend_mask = paddle.arange(
+ target_length
+ ) > cache_position.reshape((-1, 1))
+ diagonal_attend_mask = diagonal_attend_mask.astype(causal_mask.dtype)
+ causal_mask *= diagonal_attend_mask
+ causal_mask = causal_mask[None, None, :, :].expand((batch_size, 1, -1, -1))
+ if attention_mask is not None:
+ causal_mask = (
+ causal_mask.clone()
+ ) # copy to contiguous memory for in-place edit
+ if attention_mask.shape[-1] > target_length:
+ attention_mask = attention_mask[:, :target_length]
+ mask_length = attention_mask.shape[-1]
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[
+ :, None, None, :
+ ].astype(causal_mask.dtype)
+ padding_mask = padding_mask == 0
+ causal_mask[:, :, :, :mask_length] = causal_mask[
+ :, :, :, :mask_length
+ ].masked_fill(padding_mask, min_dtype)
+ return causal_mask
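+
+
+# NOTE: The following is a minimal illustrative sketch (toy sizes, assumed values) of the
+# 4-D causal-mask construction performed above; it is not part of the model API.
+if __name__ == "__main__":
+    seq_len, target_len, batch_size = 3, 5, 1
+    min_val = paddle.finfo(paddle.float32).min
+    cache_position = paddle.arange(2, 2 + seq_len)  # assume 2 tokens are already cached
+    mask = paddle.full((seq_len, target_len), fill_value=min_val, dtype="float32")
+    mask *= (paddle.arange(target_len) > cache_position.reshape((-1, 1))).astype(mask.dtype)
+    mask = mask[None, None, :, :].expand((batch_size, 1, -1, -1))
+    # Row i (the query at cache position 2 + i) is 0 for keys it may attend to and a large
+    # negative value for future keys.
+    print(mask)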
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/__init__.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/__init__.py
new file mode 100644
index 0000000000..65e5c58ac8
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/__init__.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Fusion operators
+"""
+import paddle
+from paddle.incubate.nn.functional import fused_rms_norm_ext
+from paddle.incubate.nn.functional import fused_rotary_position_embedding as fused_rope
+from paddle.incubate.nn.functional import swiglu as fused_swiglu
+
+from .common_fusion_ops import Linear, matmul
+
+if paddle.device.is_compiled_with_custom_device("npu"):
+ from .npu_fusion_ops import npu_cal_aux_loss_func as cal_aux_loss
+else:
+ from paddle.incubate.nn.functional import cal_aux_loss
+
+__all__ = [
+ "fused_rope",
+ "fused_swiglu",
+ "fused_rms_norm_ext",
+ "Linear",
+ "matmul",
+ "cal_aux_loss",
+]
+
+
+def fusion_flash_attention(
+ q,
+ k,
+ v,
+ training_mode,
+ attention_probs_dropout_prob,
+ use_sparse_flash_attn,
+ attention_mask=None,
+ attn_mask_start_row_indices=None,
+ seq_length=None,
+ use_var_len_flash_attn=False,
+ rr_flash_attn=None,
+):
+ """
+ Args:
+ q (Tensor): Query tensor.
+ k (Tensor): Key tensor.
+ v (Tensor): Value tensor.
+ training_mode (bool): Whether in training mode.
+ attention_probs_dropout_prob (float): Dropout probability for attention probabilities.
+ use_sparse_flash_attn (bool): Whether to use sparse flash attention.
+ attention_mask (Tensor, optional): Attention mask. Defaults to None.
+ attn_mask_start_row_indices (Tensor, optional): Start row indices for attention mask. Defaults to None.
+ seq_length (int, optional): Sequence length. Defaults to None.
+ use_var_len_flash_attn (bool, optional): Whether to use variable length flash attention. Defaults to False.
+        rr_flash_attn (Callable, optional): Refined-recompute wrapper applied to the flash attention call. Defaults to None.
+
+ Returns:
+ Tensor: Output tensor after applying fusion flash attention.
+ """
+ from .common_fusion_ops import _fusion_flash_attention
+
+ return _fusion_flash_attention(
+ q,
+ k,
+ v,
+ training_mode=training_mode,
+ attention_probs_dropout_prob=attention_probs_dropout_prob,
+ use_sparse_flash_attn=use_sparse_flash_attn,
+ attention_mask=attention_mask,
+ attn_mask_start_row_indices=attn_mask_start_row_indices,
+ rr_flash_attn=rr_flash_attn,
+ )
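+
+
+# NOTE: Minimal illustrative usage sketch (assumed shapes; requires a device/dtype supported by
+# Paddle's flash attention, e.g. a GPU with bfloat16). Not part of the public API.
+if __name__ == "__main__":
+    bsz, seq_len, num_heads, head_dim = 1, 128, 8, 64
+    q = paddle.randn([bsz, seq_len, num_heads, head_dim]).astype("bfloat16")
+    k = paddle.randn([bsz, seq_len, num_heads, head_dim]).astype("bfloat16")
+    v = paddle.randn([bsz, seq_len, num_heads, head_dim]).astype("bfloat16")
+    out, _ = fusion_flash_attention(
+        q,
+        k,
+        v,
+        training_mode=False,
+        attention_probs_dropout_prob=0.0,
+        use_sparse_flash_attn=True,
+    )
+    print(out.shape)  # [1, 128, 512] -> [batch, seq_len, num_heads * head_dim]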
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/common_fusion_ops.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/common_fusion_ops.py
new file mode 100644
index 0000000000..fb70acfae3
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/common_fusion_ops.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Common fusion operators.
+"""
+
+# TODO: Support XPU
+
+import paddle
+import paddle.nn.functional as F
+from paddle import matmul, tensor
+from paddle.nn import Linear
+from paddle.nn.functional.flash_attention import flashmask_attention
+
+__all__ = [
+ "matmul",
+ "Linear",
+]
+
+
+def _fusion_flash_attention(
+ q,
+ k,
+ v,
+ training_mode,
+ attention_probs_dropout_prob,
+ use_sparse_flash_attn,
+ attention_mask=None,
+ attn_mask_start_row_indices=None,
+ rr_flash_attn=None,
+):
+ """
+ Performs fused flash attention with multiple implementation variants.
+
+ Args:
+        q (paddle.Tensor): Query tensor with shape [batch, seq_len, num_heads, head_dim]
+        k (paddle.Tensor): Key tensor with shape [batch, seq_len, num_heads, head_dim]
+        v (paddle.Tensor): Value tensor with shape [batch, seq_len, num_heads, head_dim]
+ training_mode (bool): Whether in training mode (affects dropout)
+ attention_probs_dropout_prob (float): Dropout probability for attention weights
+ use_sparse_flash_attn (bool): Whether to use sparse flash attention optimization
+ attention_mask (Optional[paddle.Tensor]): Dense attention mask (default: None)
+ attn_mask_start_row_indices (Optional[paddle.Tensor]): Sparse mask indices (default: None)
+ rr_flash_attn (Optional[Callable]): Recomputation wrapper for flash attention (default: None)
+
+ Returns:
+ Tuple[paddle.Tensor, Optional[paddle.Tensor]]:
+ - Output tensor with shape [batch, seq_len, heads*dim_head]
+ - Attention weights (None for flash attention implementations)
+ """
+
+ if attn_mask_start_row_indices is not None:
+ if use_sparse_flash_attn:
+ if rr_flash_attn is None:
+ out = flashmask_attention(
+ q,
+ k,
+ v,
+ startend_row_indices=attn_mask_start_row_indices.unsqueeze(-1),
+ causal=True,
+ )
+ else:
+ out = rr_flash_attn(
+ flashmask_attention,
+ q,
+ k,
+ v,
+ startend_row_indices=attn_mask_start_row_indices.unsqueeze(-1),
+ causal=True,
+ )
+ else:
+ attention_mask = _gen_from_sparse_attn_mask_indices(
+ attn_mask_start_row_indices, q.dtype
+ )
+ if rr_flash_attn is None:
+ out = F.scaled_dot_product_attention(
+ q,
+ k,
+ v,
+ attn_mask=attention_mask,
+ is_causal=False,
+ )
+ else:
+ out = rr_flash_attn(
+ F.scaled_dot_product_attention,
+ q,
+ k,
+ v,
+ attn_mask=attention_mask,
+ is_causal=False,
+ )
+ weights = None
+ else:
+ if rr_flash_attn is None:
+ out = F.scaled_dot_product_attention(
+ q,
+ k,
+ v,
+ attn_mask=attention_mask,
+ is_causal=attention_mask is None and q.shape[1] != 1,
+ )
+ weights = None
+ else:
+ out = rr_flash_attn(
+ F.scaled_dot_product_attention,
+ q,
+ k,
+ v,
+ attn_mask=attention_mask,
+ is_causal=attention_mask is None and q.shape[1] != 1,
+ )
+ weights = None
+
+ out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])
+ return out, weights
+
+
+def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype):
+ """
+ Recover 4-D attention_mask from attn_mask_start_row_indices.
+
+ Args:
+ attn_mask_start_row_indices (paddle.Tensor): The start row indices for the attention mask.
+ dtype (str): The data type of the tensor.
+
+ Returns:
+ paddle.Tensor: The dense attention mask recovered from attn_mask_start_row_indices.
+ """
+ batch_size, _, max_seq_len = attn_mask_start_row_indices.shape
+ base = (
+ paddle.arange(max_seq_len, dtype="int32")
+ .unsqueeze(1)
+ .expand([batch_size, -1, max_seq_len])
+ .unsqueeze(1)
+ )
+ mask_indices = attn_mask_start_row_indices.unsqueeze(1)
+
+ tril = paddle.tril(
+ paddle.ones([max_seq_len, max_seq_len], dtype="bool").expand(
+ [batch_size, 1, max_seq_len, max_seq_len]
+ )
+ )
+ attention_mask = paddle.logical_and(base < mask_indices, tril)
+ attention_mask = paddle.scale(
+ x=attention_mask.astype(dtype),
+ scale=1000000.0,
+ bias=-1.0,
+ bias_after_scale=False,
+ )
+
+ return attention_mask
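+
+
+# NOTE: Minimal illustrative sketch (toy sizes) of how the sparse start-row indices map to a
+# dense mask: token i may attend to token j only if j <= i (causal) and i < start_row[j].
+if __name__ == "__main__":
+    start_rows = paddle.to_tensor([[[2, 3, 4, 4]]], dtype="int32")  # [batch=1, 1, seq_len=4]
+    dense_mask = _gen_from_sparse_attn_mask_indices(start_rows, "float32")
+    print(dense_mask[0, 0])  # 0 where attention is allowed, -1e6 where it is masked out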
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/npu_fusion_ops.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/npu_fusion_ops.py
new file mode 100644
index 0000000000..0547e2d344
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_fusion_ops/npu_fusion_ops.py
@@ -0,0 +1,89 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+npu fusion operators.
+
+"""
+import paddle
+import paddle.distributed as dist
+import paddle.nn.functional as F
+
+
+def npu_combining(x, combine_weights, scatter_index, hard_gate=False):
+ """
+ Args:
+ x: Tensor[seq, dim]
+ combine_weights: [seq, k]
+        scatter_index: [seq, k]
+    Returns:
+        y: Tensor[seq, dim]
+ """
+ x_gatherd = F.embedding(scatter_index, x) # [s,k,dim]
+ if hard_gate:
+ return x_gatherd.squeeze(-2)
+ y = (combine_weights.unsqueeze(-1) * x_gatherd).sum(1)
+ return y
+
+
+def npu_cal_aux_loss_func(
+ gate_prob,
+ dispatch_mask,
+ tokens_mask,
+ dispatch_tokens_mask,
+ num_experts,
+ use_group,
+ moe_k,
+ global_aux_loss=False,
+ rank=None,
+ group=None,
+ clip_min=1e-6,
+):
+ """cal_aux_loss_func"""
+ if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype:
+ tokens_mask = tokens_mask.astype(gate_prob.dtype)
+
+ scale = None
+ if dispatch_tokens_mask is not None:
+ seqlen_float = dispatch_tokens_mask.astype(gate_prob.dtype).sum()
+ if (
+ tokens_mask is not None
+ and gate_prob.shape[0] != dispatch_tokens_mask.shape[0]
+ ):
+ scale = seqlen_float / paddle.clip(tokens_mask.sum(), min=1e-6)
+ elif tokens_mask is not None:
+ seqlen_float = tokens_mask.sum()
+ else:
+ seqlen_float = gate_prob.numel().astype(gate_prob.dtype) / num_experts
+ seqlen_float = paddle.clip(seqlen_float, min=1e-6)
+ if len(dispatch_mask.shape) == 2:
+ dispatch_mask = dispatch_mask.sum(0)
+ ce = dispatch_mask.astype(gate_prob.dtype).detach() / seqlen_float
+ me = paddle.sum(gate_prob, axis=0) / seqlen_float
+
+ if global_aux_loss:
+ me_list, ce_list = [], []
+ dist.all_gather(me_list, me, group=group)
+ dist.all_gather(ce_list, ce, group=group)
+ me_list[rank] = me
+ ce_list[rank] = ce
+ me = paddle.stack(me_list).mean(0)
+ ce = paddle.stack(ce_list).mean(0)
+
+ l_aux = paddle.sum(me * ce) * num_experts
+ if use_group:
+ l_aux = l_aux / moe_k
+ if scale is not None:
+ l_aux = l_aux + (scale - 1) * l_aux.detach()
+ return l_aux, None, None
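+
+
+# NOTE: Minimal illustrative sketch (toy sizes, random data) of `npu_combining`, which gathers the
+# top-k expert outputs for each token and combines them with the routing weights.
+if __name__ == "__main__":
+    seq_len, hidden_dim, top_k = 4, 8, 2
+    expert_out = paddle.randn([seq_len, hidden_dim])
+    scatter_index = paddle.randint(0, seq_len, [seq_len, top_k])  # which rows each token reads
+    combine_weights = paddle.rand([seq_len, top_k])
+    y = npu_combining(expert_out, combine_weights, scatter_index)
+    print(y.shape)  # [4, 8]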
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py
new file mode 100644
index 0000000000..b8385c8f75
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_paddleocr_vl.py
@@ -0,0 +1,846 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py
+# Original header:
+# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from contextvars import ContextVar
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+
+from ....common.vlm.generation import GenerationMixin
+from ....common.vlm.transformers.model_outputs import (
+ CausalLMOutputWithCrossAttentions,
+ ModelOutput,
+)
+from ._config import PaddleOCRVLConfig
+from ._ernie import Ernie4_5Model, Ernie4_5PretrainedModel
+from ._projector import Projector
+from ._siglip import SiglipVisionModel
+
+
+@dataclass
+class PaddleOCRVLCausalLMOutputWithPast(ModelOutput):
+ loss: Optional[paddle.Tensor] = None
+ logits: paddle.Tensor = None
+ past_key_values: Optional[List[paddle.Tensor]] = None
+ hidden_states: Optional[Tuple[paddle.Tensor]] = None
+ attentions: Optional[Tuple[paddle.Tensor]] = None
+ rope_deltas: Optional[paddle.Tensor] = None
+
+
+class PaddleOCRVLForConditionalGeneration(Ernie4_5PretrainedModel, GenerationMixin):
+ _tied_weights_keys = ["lm_head.weight"]
+ config_class = PaddleOCRVLConfig
+ _no_split_modules = ["Ernie4_5DecoderLayer", "SiglipEncoderLayer"]
+
+ base_model_prefix = ""
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.mlp_AR = Projector(config, config.vision_config)
+ self.visual = SiglipVisionModel(config.vision_config)
+ self.model = Ernie4_5Model(config)
+ self.vocab_size = config.vocab_size
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False)
+ self.rope_deltas_var = ContextVar("rope_deltas", default=None)
+
+ def get_input_embeddings(self):
+ return self.model.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.embed_tokens = value
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def set_decoder(self, decoder):
+ self.model = decoder
+
+ def get_decoder(self):
+ return self.model
+
+ def get_rope_index(
+ self,
+ input_ids: Optional[paddle.Tensor] = None,
+ image_grid_thw: Optional[paddle.Tensor] = None,
+ video_grid_thw: Optional[paddle.Tensor] = None,
+ second_per_grid_ts: Optional[paddle.Tensor] = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """
+ Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+
+ Explanation:
+            Each embedding sequence contains both vision and text embeddings, or text embeddings only.
+
+            For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
+ Examples:
+ input_ids: [T T T T T], here T is for text.
+ temporal position_ids: [0, 1, 2, 3, 4]
+ height position_ids: [0, 1, 2, 3, 4]
+ width position_ids: [0, 1, 2, 3, 4]
+
+ For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+ and 1D rotary position embedding for text part.
+ Examples:
+ Temporal (Time): 3 patches, representing different segments of the video in time.
+ Height: 2 patches, dividing each frame vertically.
+ Width: 2 patches, dividing each frame horizontally.
+ We also have some important parameters:
+ fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second.
+ tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity.
+ temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+ input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+ vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+ vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+ vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+ text temporal position_ids: [101, 102, 103, 104, 105]
+ text height position_ids: [101, 102, 103, 104, 105]
+ text width position_ids: [101, 102, 103, 104, 105]
+ Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+ Args:
+ input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+ image_grid_thw (`paddle.Tensor` of shape `(num_images, 3)`, *optional*):
+ The temporal, height and width of feature shape of each image in LLM.
+ video_grid_thw (`paddle.Tensor` of shape `(num_videos, 3)`, *optional*):
+ The temporal, height and width of feature shape of each video in LLM.
+ second_per_grid_ts (`paddle.Tensor` of shape `(num_videos)`, *optional*):
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+ attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ Returns:
+ position_ids (`paddle.Tensor` of shape `(3, batch_size, sequence_length)`)
+ mrope_position_deltas (`paddle.Tensor` of shape `(batch_size)`)
+ """
+ spatial_merge_size = self.config.vision_config.spatial_merge_size
+ image_token_id = self.config.image_token_id
+ video_token_id = self.config.video_token_id
+ vision_start_token_id = self.config.vision_start_token_id
+ mrope_position_deltas = []
+ if input_ids is not None and (
+ image_grid_thw is not None or video_grid_thw is not None
+ ):
+ total_input_ids = input_ids
+ if attention_mask is None:
+ attention_mask = paddle.ones_like(total_input_ids)
+ position_ids = paddle.ones(
+ [3, input_ids.shape[0], input_ids.shape[1]],
+ dtype=input_ids.dtype,
+ )
+ image_index, video_index = 0, 0
+ for i, input_ids in enumerate(total_input_ids):
+ input_ids = input_ids[attention_mask[i] == 1]
+ image_nums, video_nums = 0, 0
+ vision_start_indices = paddle.nonzero(
+ input_ids == vision_start_token_id
+ ).squeeze(1)
+ vision_tokens = input_ids[vision_start_indices + 1]
+ image_nums = (vision_tokens == image_token_id).sum()
+ video_nums = (vision_tokens == video_token_id).sum()
+ input_tokens = input_ids.tolist()
+ llm_pos_ids_list: list = []
+ st = 0
+ remain_images, remain_videos = image_nums, video_nums
+ for _ in range(image_nums + video_nums):
+ if image_token_id in input_tokens and remain_images > 0:
+ ed_image = input_tokens.index(image_token_id, st)
+ else:
+ ed_image = len(input_tokens) + 1
+ if video_token_id in input_tokens and remain_videos > 0:
+ ed_video = input_tokens.index(video_token_id, st)
+ else:
+ ed_video = len(input_tokens) + 1
+ if ed_image < ed_video:
+ t, h, w = (
+ image_grid_thw[image_index][0],
+ image_grid_thw[image_index][1],
+ image_grid_thw[image_index][2],
+ )
+ second_per_grid_t = 0
+ image_index += 1
+ remain_images -= 1
+ ed = ed_image
+
+ else:
+ t, h, w = (
+ video_grid_thw[video_index][0],
+ video_grid_thw[video_index][1],
+ video_grid_thw[video_index][2],
+ )
+ if second_per_grid_ts is not None:
+ second_per_grid_t = second_per_grid_ts[video_index]
+ else:
+ second_per_grid_t = 1.0
+ video_index += 1
+ remain_videos -= 1
+ ed = ed_video
+ llm_grid_t, llm_grid_h, llm_grid_w = (
+ t.item(),
+ h.item() // spatial_merge_size,
+ w.item() // spatial_merge_size,
+ )
+ text_len = ed - st
+
+ st_idx = (
+ llm_pos_ids_list[-1].max() + 1
+ if len(llm_pos_ids_list) > 0
+ else 0
+ )
+ llm_pos_ids_list.append(
+ paddle.arange(text_len).reshape((1, -1)).expand((3, -1))
+ + st_idx
+ )
+
+ if paddle.is_tensor(second_per_grid_t):
+ second_per_grid_t = second_per_grid_t.detach().item()
+ range_tensor = paddle.arange(llm_grid_t).reshape((-1, 1))
+ expanded_range = range_tensor.expand((-1, llm_grid_h * llm_grid_w))
+
+ time_tensor = (
+ expanded_range
+ * second_per_grid_t
+ * self.config.vision_config.tokens_per_second
+ )
+
+ time_tensor_long = time_tensor.astype("int64")
+ t_index = time_tensor_long.flatten()
+
+ h_index = (
+ paddle.arange(llm_grid_h)
+ .reshape((1, -1, 1))
+ .expand((llm_grid_t, -1, llm_grid_w))
+ .flatten()
+ )
+ w_index = (
+ paddle.arange(llm_grid_w)
+ .reshape((1, 1, -1))
+ .expand((llm_grid_t, llm_grid_h, -1))
+ .flatten()
+ )
+ llm_pos_ids_list.append(
+ paddle.stack([t_index, h_index, w_index]) + text_len + st_idx
+ )
+ st = ed + llm_grid_t * llm_grid_h * llm_grid_w
+
+ if st < len(input_tokens):
+ st_idx = (
+ llm_pos_ids_list[-1].max() + 1
+ if len(llm_pos_ids_list) > 0
+ else 0
+ )
+ text_len = len(input_tokens) - st
+ llm_pos_ids_list.append(
+ paddle.arange(text_len).reshape((1, -1)).expand((3, -1))
+ + st_idx
+ )
+
+ llm_positions = paddle.concat(llm_pos_ids_list, axis=1).reshape((3, -1))
+ position_ids[..., i, attention_mask[i] == 1] = llm_positions
+ mrope_position_deltas.append(
+ llm_positions.max() + 1 - len(total_input_ids[i])
+ )
+ mrope_position_deltas = paddle.to_tensor(mrope_position_deltas).unsqueeze(1)
+ return position_ids, mrope_position_deltas
+ else:
+ if attention_mask is not None:
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ position_ids = position_ids.unsqueeze(0).expand((3, -1, -1))
+ max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+ -1, keepdim=True
+ )[0]
+ mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+ else:
+ position_ids = (
+ paddle.arange(input_ids.shape[1])
+ .reshape((1, 1, -1))
+ .expand((3, input_ids.shape[0], -1))
+ )
+ mrope_position_deltas = paddle.zeros(
+ [input_ids.shape[0], 1],
+ dtype=input_ids.dtype,
+ )
+
+ return position_ids, mrope_position_deltas
+
+ def prepare_attention_mask_for_generation(
+ self, input_ids, pad_token_id, eos_token_id
+ ):
+ """Avoid using attention_mask with flash_attn on generation."""
+ if self.config.use_flash_attention:
+ return None
+ return super().prepare_attention_mask_for_generation(
+ input_ids, pad_token_id, eos_token_id
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ use_cache=False,
+ past_key_values=None,
+ inputs_embeds=None,
+ pixel_values=None,
+ pixel_values_videos=None,
+ position_ids=None,
+ **kwargs,
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+ pixel_values = None
+ pixel_values_videos = None
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "past_key_values": past_key_values,
+ "use_cache": use_cache,
+ "pixel_values": pixel_values,
+ "pixel_values_videos": pixel_values_videos,
+ "position_ids": None,
+ **kwargs,
+ }
+ )
+
+ return model_inputs
+
+ def update_model_kwargs_for_generation(
+ self, outputs, model_kwargs, is_encoder_decoder=False
+ ):
+ """
+ Updates model kwargs for generation.
+
+ Args:
+ outputs (Any): Model outputs.
+ model_kwargs (dict): Current model kwargs.
+ is_encoder_decoder (bool): Whether using encoder-decoder architecture.
+
+ Returns:
+ dict: Updated model kwargs.
+ """
+ # update cache
+ if (
+ isinstance(outputs, tuple)
+ and len(outputs) > 1
+ and not isinstance(outputs[1], paddle.Tensor)
+ ):
+ model_kwargs["past_key_values"] = outputs[1]
+
+ if (
+ isinstance(outputs, CausalLMOutputWithCrossAttentions)
+ and "past_key_values" in outputs
+ ):
+ model_kwargs["past_key_values"] = outputs.past_key_values
+
+ if (
+ not is_encoder_decoder
+ and model_kwargs.get("attention_mask", None) is not None
+ ):
+ # update attention mask
+ attention_mask = model_kwargs["attention_mask"]
+ model_kwargs["attention_mask"] = paddle.concat(
+ [
+ attention_mask,
+ paddle.ones(
+ [attention_mask.shape[0], 1], dtype=attention_mask.dtype
+ ),
+ ],
+ axis=-1,
+ )
+
+ return model_kwargs
+
+ def get_transpose_weight_keys(self):
+ t_layers = [
+ "out_proj",
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "lm_head",
+ "gate_proj",
+ "up_proj",
+ "down_proj",
+ "o_proj",
+ "lm_head",
+ "linear_1",
+ "linear_2",
+ "fc",
+ "in_proj",
+ ]
+ keys = []
+ for key, _ in self.get_hf_state_dict().items():
+ for t_layer in t_layers:
+ if t_layer in key and key.endswith("weight"):
+ keys.append(key)
+ return keys
+
+ def get_hf_state_dict(self, *args, **kwargs):
+ def _merge_attention_weights(
+ q_weight=None,
+ k_weight=None,
+ v_weight=None,
+ q_bias=None,
+ k_bias=None,
+ v_bias=None,
+ ):
+ if q_weight is not None and k_weight is not None and v_weight is not None:
+ return paddle.concat([q_weight, k_weight, v_weight], axis=1)
+ elif q_bias is not None and k_bias is not None and v_bias is not None:
+ return paddle.concat([q_bias, k_bias, v_bias], axis=0)
+ else:
+ raise ValueError
+
+ def _convert_to_hf_state_dict(current_state_dict):
+ hf_state_dict = {}
+
+ for key in list(current_state_dict.keys()):
+ if "up_gate_proj" in key:
+ combined_weights = current_state_dict[key]
+ split_size = combined_weights.shape[-1] // 2
+ gate_proj = combined_weights[..., :split_size]
+ up_proj = combined_weights[..., split_size:]
+
+ hf_state_dict[key.replace("up_gate_proj", "gate_proj")] = gate_proj
+ hf_state_dict[key.replace("up_gate_proj", "up_proj")] = up_proj
+ continue
+
+ if "qkv_proj" in key and ("weight" in key or "bias" in key):
+ combined_weights = current_state_dict[key]
+ if getattr(self.config, "head_dim", None) is None:
+                        head_dim = self.config.hidden_size // self.config.num_attention_heads
+ else:
+ head_dim = self.config.head_dim
+ num_heads = self.config.num_attention_heads
+ num_kv_heads = self.config.num_key_value_heads
+ q_proj, k_proj, v_proj = paddle.split(
+ combined_weights,
+ [
+ num_heads * head_dim,
+ num_kv_heads * head_dim,
+ num_kv_heads * head_dim,
+ ],
+ axis=-1,
+ )
+
+ if "weight" in key:
+ hf_state_dict[
+ key.replace("qkv_proj.weight", "q_proj.weight")
+ ] = q_proj
+ hf_state_dict[
+ key.replace("qkv_proj.weight", "k_proj.weight")
+ ] = k_proj
+ hf_state_dict[
+ key.replace("qkv_proj.weight", "v_proj.weight")
+ ] = v_proj
+ else: # bias
+ hf_state_dict[key.replace("qkv_proj.bias", "q_proj.bias")] = (
+ q_proj
+ )
+ hf_state_dict[key.replace("qkv_proj.bias", "k_proj.bias")] = (
+ k_proj
+ )
+ hf_state_dict[key.replace("qkv_proj.bias", "v_proj.bias")] = (
+ v_proj
+ )
+ continue
+
+ if "up_gate_proj" not in key and "qkv_proj" not in key:
+ hf_state_dict[key] = current_state_dict[key]
+
+ new_hf_state_dict = {}
+ keys_to_remove = set()
+
+ for key, value in hf_state_dict.items():
+ if "head.attention" in key and "out_proj" not in key:
+ if "weight" in key:
+ q_key = key
+ k_key = key.replace("q_proj", "k_proj")
+ v_key = key.replace("q_proj", "v_proj")
+
+ if (
+ q_key in hf_state_dict
+ and k_key in hf_state_dict
+ and v_key in hf_state_dict
+ ):
+ merged_weights = _merge_attention_weights(
+ q_weight=hf_state_dict[q_key],
+ k_weight=hf_state_dict[k_key],
+ v_weight=hf_state_dict[v_key],
+ )
+ new_key = key.replace("q_proj.weight", "in_proj_weight")
+ new_hf_state_dict[new_key] = merged_weights
+ keys_to_remove.update([q_key, k_key, v_key])
+
+ elif "bias" in key:
+ q_key = key
+ k_key = key.replace("q_proj", "k_proj")
+ v_key = key.replace("q_proj", "v_proj")
+
+ if (
+ q_key in hf_state_dict
+ and k_key in hf_state_dict
+ and v_key in hf_state_dict
+ ):
+ merged_bias = _merge_attention_weights(
+ q_bias=hf_state_dict[q_key],
+ k_bias=hf_state_dict[k_key],
+ v_bias=hf_state_dict[v_key],
+ )
+ new_key = key.replace("q_proj.bias", "in_proj_bias")
+ new_hf_state_dict[new_key] = merged_bias
+ keys_to_remove.update([q_key, k_key, v_key])
+ else:
+ new_hf_state_dict[key] = value
+
+ for key in keys_to_remove:
+ if key in new_hf_state_dict:
+ del new_hf_state_dict[key]
+
+ return new_hf_state_dict
+
+ current_state_dict = self.state_dict(*args, **kwargs)
+
+ hf_state_dict = _convert_to_hf_state_dict(current_state_dict)
+
+ return hf_state_dict
+
+ def set_hf_state_dict(self, state_dict, *args, **kwargs):
+ def _split_attention_weights(weight=None, bias=None):
+ if weight is not None:
+ split_size = weight.shape[1] // 3
+ q_weight = weight[:, :split_size]
+ k_weight = weight[:, split_size : 2 * split_size]
+ v_weight = weight[:, 2 * split_size :]
+ return q_weight, k_weight, v_weight
+ elif bias is not None:
+ split_size = bias.shape[0] // 3
+ q_bias = bias[:split_size]
+ k_bias = bias[split_size : 2 * split_size]
+ v_bias = bias[2 * split_size :]
+ return q_bias, k_bias, v_bias
+
+ def _convert_state_dict(old_state_dict):
+ new_state_dict = {}
+ for key, value in old_state_dict.items():
+ if "head.attention.in_proj" in key:
+ if key.endswith("weight"):
+ q_w, k_w, v_w = _split_attention_weights(weight=value)
+ new_state_dict[
+ key.replace("in_proj_weight", "q_proj.weight")
+ ] = q_w
+ new_state_dict[
+ key.replace("in_proj_weight", "k_proj.weight")
+ ] = k_w
+ new_state_dict[
+ key.replace("in_proj_weight", "v_proj.weight")
+ ] = v_w
+ elif key.endswith("bias"):
+ q_b, k_b, v_b = _split_attention_weights(bias=value)
+ new_state_dict[key.replace("in_proj_bias", "q_proj.bias")] = q_b
+ new_state_dict[key.replace("in_proj_bias", "k_proj.bias")] = k_b
+ new_state_dict[key.replace("in_proj_bias", "v_proj.bias")] = v_b
+ else:
+ raise ValueError(f"Unexpected key: {key}")
+ else:
+ new_state_dict[key] = value
+
+ for key in list(new_state_dict.keys()):
+ if key.startswith("model."):
+ if "mlp.gate_proj." in key:
+ gate_proj = new_state_dict.pop(key)
+ up_proj = new_state_dict.pop(
+ key.replace("gate_proj", "up_proj")
+ )
+ new_state_dict[key.replace("gate_proj", "up_gate_proj")] = (
+ paddle.concat([gate_proj, up_proj], axis=-1)
+ )
+
+ if "self_attn.q_proj" in key:
+ q_proj = new_state_dict.pop(key)
+ k_proj = new_state_dict.pop(key.replace("q_proj", "k_proj"))
+ v_proj = new_state_dict.pop(key.replace("q_proj", "v_proj"))
+ new_state_dict[key.replace("q_proj", "qkv_proj")] = (
+ paddle.concat([q_proj, k_proj, v_proj], axis=-1)
+ )
+
+ return new_state_dict
+
+ state_dict = _convert_state_dict(state_dict)
+
+ std_state_dict = self.state_dict()
+ assert std_state_dict.keys() == state_dict.keys()
+ for key in std_state_dict:
+ v1 = std_state_dict[key]
+ state_dict[key] = state_dict[key].to(v1.place)
+
+ return self.set_state_dict(state_dict, *args, **kwargs)
+
+ def forward(
+ self,
+ input_ids: paddle.Tensor = None,
+ attention_mask: Optional[paddle.Tensor] = None,
+ position_ids: Optional[paddle.Tensor] = None,
+ past_key_values: Optional[List[paddle.Tensor]] = None,
+ inputs_embeds: Optional[paddle.Tensor] = None,
+ labels: Optional[paddle.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ pixel_values: Optional[paddle.Tensor] = None,
+ pixel_values_videos: Optional[paddle.Tensor] = None,
+ image_grid_thw: Optional[paddle.Tensor] = None,
+ video_grid_thw: Optional[paddle.Tensor] = None,
+ rope_deltas: Optional[paddle.Tensor] = None,
+ second_per_grid_ts: Optional[paddle.Tensor] = None,
+ **kwargs,
+ ) -> Union[Tuple, PaddleOCRVLCausalLMOutputWithPast]:
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ return_dict = (
+ return_dict if return_dict is not None else self.config.use_return_dict
+ )
+
+ curr_rope_deltas = self.rope_deltas_var.get()
+
+ if inputs_embeds is None:
+ if input_ids.shape[0] != 1:
+ raise NotImplementedError
+ inputs_embeds = self.model.embed_tokens(input_ids)
+ if pixel_values is not None:
+ pixel_values = pixel_values.astype(inputs_embeds.dtype)
+ pixel_values = pixel_values.unsqueeze(0)
+ siglip_position_ids = list()
+ image_grid_hws = list()
+ sample_indices = list()
+ cu_seqlens = [0]
+
+ for idx, thw in enumerate(image_grid_thw):
+ thw_tuple = tuple(thw.detach().cpu().numpy().tolist())
+ numel = np.prod(thw_tuple)
+ image_grid_hws.append(thw_tuple)
+ image_position_ids = paddle.arange(numel) % np.prod(thw_tuple[1:])
+ siglip_position_ids.append(image_position_ids)
+ sample_indices.append(
+ paddle.full((numel,), idx, dtype=paddle.int64)
+ )
+ cu_seqlens.append(cu_seqlens[-1] + numel)
+
+ siglip_position_ids = paddle.concat(siglip_position_ids, axis=0)
+ cu_seqlens = paddle.to_tensor(cu_seqlens, dtype=paddle.int32)
+ sample_indices = paddle.concat(sample_indices, axis=0)
+
+ vision_outputs = self.visual(
+ pixel_values=pixel_values,
+ image_grid_thw=image_grid_hws,
+ position_ids=siglip_position_ids,
+ vision_return_embed_list=True,
+ interpolate_pos_encoding=True,
+ sample_indices=sample_indices,
+ cu_seqlens=cu_seqlens,
+ return_pooler_output=False,
+ use_rope=True,
+ window_size=-1,
+ )
+ image_embeds = vision_outputs.last_hidden_state
+
+ image_embeds = self.mlp_AR(image_embeds, image_grid_thw)
+
+ n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
+ image_embeds = paddle.concat(image_embeds, axis=0)
+ n_image_features = image_embeds.shape[0]
+ if n_image_tokens != n_image_features:
+ raise ValueError(
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+ )
+
+ mask = input_ids == self.config.image_token_id
+ mask_unsqueezed = mask.unsqueeze(-1)
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
+ image_mask = mask_expanded
+
+ image_embeds = image_embeds.astype(inputs_embeds.dtype)
+
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+ else:
+ if inputs_embeds.shape[0] != 1:
+ raise NotImplementedError
+
+ if attention_mask is not None and attention_mask.dtype != paddle.bool:
+ attention_mask = paddle.cast(attention_mask, paddle.bool)
+
+        # If we get a 4-D attention mask we cannot calculate rope deltas anymore (known upstream limitation).
+ if position_ids is None and (
+ attention_mask is None or attention_mask.ndim == 2
+ ):
+ # calculate RoPE index once per generation in the pre-fill stage only
+ if curr_rope_deltas is None or (
+ past_key_values is None or past_key_values[0] is None
+ ):
+ position_ids, rope_deltas = self.get_rope_index(
+ input_ids,
+ image_grid_thw,
+ video_grid_thw,
+ second_per_grid_ts,
+ attention_mask,
+ )
+ self.rope_deltas_var.set(rope_deltas)
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
+ else:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ delta = (
+ (past_key_values[0][0].shape[1] + curr_rope_deltas)
+ if past_key_values is not None and past_key_values[0] is not None
+ else 0
+ )
+ position_ids = paddle.arange(seq_length)
+ position_ids = position_ids.reshape((1, -1)).expand((batch_size, -1))
+ if (
+ past_key_values is not None and past_key_values[0] is not None
+ ): # otherwise `deltas` is an int `0`
+ delta = delta.repeat_interleave(
+ batch_size // delta.shape[0], axis=0
+ )
+ position_ids = position_ids.add(delta)
+ position_ids = position_ids.unsqueeze(0).expand((3, -1, -1))
+
+ outputs = self.model(
+ input_ids=None,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ **kwargs,
+ )
+
+ hidden_states = outputs[0]
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
+ logits = logits.astype("float32")
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
+ # Flatten the tokens
+ loss_fct = paddle.nn.CrossEntropyLoss()
+ shift_logits = shift_logits.reshape((-1, self.config.vocab_size))
+ shift_labels = shift_labels.reshape((-1,))
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return PaddleOCRVLCausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ rope_deltas=curr_rope_deltas,
+ )
+
+ def generate(self, inputs, **kwargs):
+ gen_kwargs = {
+ "max_new_tokens": kwargs.get("max_new_tokens", 8192),
+ "use_cache": kwargs.get("use_cache", True),
+ }
+ gen_kwargs = {**inputs, **gen_kwargs}
+ with paddle.no_grad():
+ generated_ids = super().generate(**gen_kwargs)
+ return generated_ids
+
+ def _get_image_nums_and_video_nums(
+ self,
+ input_ids: Optional[paddle.Tensor],
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """
+ Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
+ These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.
+
+ Args:
+ input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Returns:
+ image_nums (`paddle.Tensor` of shape `(batch_size, num_images_sample)`)
+ video_nums (`paddle.Tensor` of shape `(batch_size, num_videos_sample)`)
+ """
+ image_token_id = self.config.image_token_id
+ video_token_id = self.config.video_token_id
+ vision_start_token_id = self.config.vision_start_token_id
+
+ vision_start_mask = input_ids == vision_start_token_id
+ vision_first_mask = paddle.roll(vision_start_mask, shifts=1, axis=1)
+ image_mask = input_ids == image_token_id
+ video_mask = input_ids == video_token_id
+ image_nums = paddle.sum(vision_first_mask & image_mask, axis=1)
+ video_nums = paddle.sum(vision_first_mask & video_mask, axis=1)
+
+ return image_nums, video_nums
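+
+
+# NOTE: Illustrative sketch (toy grid, assumed timing values) of the 3-D vision position ids that
+# `get_rope_index` builds for a video grid of (t, h, w) = (3, 2, 2); not part of the model API.
+if __name__ == "__main__":
+    llm_grid_t, llm_grid_h, llm_grid_w = 3, 2, 2
+    tokens_per_second, second_per_grid_t = 25, 2.0
+    t_index = (
+        (
+            paddle.arange(llm_grid_t)
+            .reshape((-1, 1))
+            .expand((-1, llm_grid_h * llm_grid_w))
+            * second_per_grid_t
+            * tokens_per_second
+        )
+        .astype("int64")
+        .flatten()
+    )
+    h_index = (
+        paddle.arange(llm_grid_h)
+        .reshape((1, -1, 1))
+        .expand((llm_grid_t, -1, llm_grid_w))
+        .flatten()
+    )
+    w_index = (
+        paddle.arange(llm_grid_w)
+        .reshape((1, 1, -1))
+        .expand((llm_grid_t, llm_grid_h, -1))
+        .flatten()
+    )
+    print(t_index.tolist())  # [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+    print(h_index.tolist())  # [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+    print(w_index.tolist())  # [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]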
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_projector.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_projector.py
new file mode 100644
index 0000000000..a33cb0ca71
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_projector.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py
+# Original header:
+# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+
+
+class GELUActivation(nn.Layer):
+ """
+    Original implementation of the GELU activation function from the Google BERT repo. For
+    reference, OpenAI GPT's GELU is slightly different (and gives slightly different results):
+    0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3))).
+    Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+
+ def __init__(self, use_gelu_python: bool = False):
+ super().__init__()
+ if use_gelu_python:
+ self.act = self._gelu_python
+ else:
+ self.act = nn.functional.gelu
+
+ def _gelu_python(self, input):
+ return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0)))
+
+ def forward(self, input):
+ return self.act(input)
+
+
+class Projector(nn.Layer):
+    """Merges each 2x2 neighborhood of vision tokens and projects it into the text embedding space."""
+
+ def __init__(self, text_config, vision_config):
+ super().__init__()
+ self.text_config = text_config
+ self.vision_config = vision_config
+ self.merge_kernel_size = (2, 2)
+
+ self.hidden_size = (
+ self.vision_config.hidden_size
+ * self.merge_kernel_size[0]
+ * self.merge_kernel_size[1]
+ )
+
+ self.pre_norm = nn.LayerNorm(self.vision_config.hidden_size, epsilon=1e-05)
+ self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size)
+ self.act = GELUActivation()
+ self.linear_2 = nn.Linear(self.hidden_size, self.text_config.hidden_size)
+
+ def forward(self, image_features, image_grid_thw):
+ m1, m2 = self.merge_kernel_size
+        if isinstance(image_features, (list, tuple)):
+            # Lazy import so that einops is only required when this code path is used.
+            from einops import rearrange
+
+            processed_features = list()
+            for image_feature, image_grid in zip(image_features, image_grid_thw):
+                image_feature = self.pre_norm(image_feature)  # shape: (T*H*W, D)
+                t, h, w = image_grid
+ image_feature = rearrange(
+ image_feature,
+ "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+ t=int(t),
+ h=int(h // m1),
+ p1=int(m1),
+ w=int(w // m2),
+ p2=int(m2),
+ )
+ hidden_states = self.linear_1(image_feature)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+ processed_features.append(hidden_states)
+
+ return processed_features
+
+ dims = image_features.shape[:-1]
+ dim = image_features.shape[-1]
+ image_features = paddle.reshape(image_features, [-1, dim])
+ hidden_states = self.pre_norm(image_features)
+ hidden_states = paddle.reshape(hidden_states, [-1, self.hidden_size])
+ hidden_states = self.linear_1(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.linear_2(hidden_states)
+ return paddle.reshape(hidden_states, [*dims, -1])
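+
+
+# NOTE: Minimal illustrative sketch (toy sizes) of the 2x2 token-merging rearrange used by
+# `Projector.forward`; not part of the model API.
+if __name__ == "__main__":
+    from einops import rearrange
+
+    t, h, w, m1, m2, d = 1, 4, 6, 2, 2, 8  # a 4x6 patch grid with hidden size 8
+    tokens = paddle.randn([t * h * w, d])  # flattened (T*H*W, D) vision tokens
+    merged = rearrange(
+        tokens,
+        "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
+        t=t,
+        h=h // m1,
+        p1=m1,
+        w=w // m2,
+        p2=m2,
+    )
+    print(merged.shape)  # [6, 32]: each 2x2 neighborhood becomes a single, wider token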
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/__init__.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/__init__.py
new file mode 100644
index 0000000000..b64cf01fdc
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/utils.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/utils.py
new file mode 100644
index 0000000000..e1e32169cd
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_refined_recompute/utils.py
@@ -0,0 +1,346 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""refined recompute"""
+
+import inspect
+import queue
+from collections import defaultdict
+
+import paddle
+from paddle import framework
+from paddle.base import core
+
+__all__ = [
+ "RefinedRcomputeQueue",
+ "global_rr_queue_log",
+ "RefinedRecomputeFunction",
+ "create_skip_config_for_refined_recompute",
+]
+
+
+_is_second_fwd = False
+
+
+def is_second_fwd():
+ """
+ Determine if it is the second forward propagation
+ """
+ global _is_second_fwd
+ return _is_second_fwd
+
+
+def set_second_fwd(value=True):
+ """
+ Set whether to perform the second forward propagation based on the value
+ """
+ global _is_second_fwd
+ _is_second_fwd = value
+
+
+class CustomSavedTensorsHooks:
+ """
+ Customize saved_tensors_hooks, add logic for switching
+ variables related to the second forward propagation
+ """
+
+ def __init__(self, pack_hook, unpack_hook) -> None:
+ """
+ initialize the CustomSavedTensorsHooks object
+ """
+ self.pack_hook = pack_hook
+ self.unpack_hook = unpack_hook
+
+ self._prev = is_second_fwd()
+ pack_hook_name = f"{pack_hook.__module__}.{pack_hook.__name__}"
+ unpack_hook_name = f"{unpack_hook.__module__}.{unpack_hook.__name__}"
+ self._is_second_fwd = (
+ pack_hook_name == "paddle.distributed.fleet.recompute.recompute.inner_pack"
+ and unpack_hook_name
+ == "paddle.distributed.fleet.recompute.recompute.inner_unpack"
+ )
+
+ def __enter__(self) -> None:
+ """
+ enter the context of CustomSavedTensorsHooks
+ """
+ set_second_fwd(self._is_second_fwd)
+ core.eager.register_saved_tensors_hooks(self.pack_hook, self.unpack_hook)
+
+ def __exit__(self, *args: object) -> None:
+ """
+ exit the context of CustomSavedTensorsHooks
+ """
+ set_second_fwd(self._prev)
+ core.eager.reset_saved_tensors_hooks()
+
+
+# hack saved_tensors_hooks add set_second_fwd decorator
+paddle.autograd.saved_tensors_hooks = CustomSavedTensorsHooks
+
+
+def create_skip_config_for_refined_recompute(layer_idx, config):
+ """
+ Creates a configuration for skipping recomputation based on the configuration file,
+ effective only at the specified layer index.
+
+ Args:
+ layer_idx (int): The layer index used to check whether recomputation should be skipped.
+        config: The model configuration object.
+
+    Returns:
+        The updated configuration object. Its `skip_recompute_ops` entry maps the layer index to a
+        dictionary from operation name to a boolean indicating whether recomputation is skipped for
+        that operation. If `recompute` is disabled, the original configuration is returned unchanged.
+
+ """
+ if not config.recompute:
+ return config
+ skip_config = dict()
+
+ if len(config.refined_recompute) > 0 and config.recompute_granularity != "full":
+ raise ValueError(
+ "Selective recompute only support full recompute now, "
+ "please set recompute_granularity to `full`."
+ )
+
+ for op_name, skip_num in config.refined_recompute.items():
+        if skip_num == 0:  # 0: recompute this op in every layer (skip nothing)
+            skip_config[op_name] = False
+        elif skip_num < 0:  # negative: skip recompute for this op in every layer
+            skip_config[op_name] = True
+        else:
+            if layer_idx < skip_num:  # skip recompute for the first `skip_num` layers
+                skip_config[op_name] = True
+            else:
+                skip_config[op_name] = False
+
+ config.skip_recompute_ops[layer_idx] = skip_config
+ return config
+
+
+class RefinedRcomputeQueue:
+ """
+ Thread-safe queue management system for recomputation operations.
+
+ Provides a mechanism to track and validate multiple recomputation queues
+ with automatic naming and existence checking capabilities.
+ """
+
+ def __init__(self):
+ """
+ Initializes an empty queue registry.
+ """
+ self.rr_queue = defaultdict(queue.Queue)
+
+ def update(self, queue: queue.Queue, queue_name="unknown"):
+ """
+ Registers a new queue in the management system.
+
+ Args:
+ queue (queue.Queue): The queue object to register
+ queue_name (str): Base identifier for the queue (default: "unknown")
+ Note: Automatically appends the queue's memory address for uniqueness
+
+ Raises:
+ ValueError: If a queue with the generated name already exists
+ """
+ queue_name = f"{queue_name}_{id(queue)}"
+ if queue_name in self.rr_queue:
+ raise ValueError(f"Queue name '{queue_name}' already exists.")
+ self.rr_queue[queue_name] = queue
+
+ def check(self):
+ """
+ Validates all registered queues are empty.
+
+ Raises:
+ ValueError: If any registered queue contains pending items
+ Reports all non-empty queue names in the error message
+ """
+ non_empty_queues = [
+ name for name, queue in self.rr_queue.items() if queue.qsize() != 0
+ ]
+ if non_empty_queues:
+ raise ValueError(f"Queues {', '.join(non_empty_queues)} are not empty.")
+
+
+global_rr_queue_log = RefinedRcomputeQueue()
+
+
+class _NoopSaveInputs(paddle.autograd.PyLayer):
+ """
+ This layer does nothing but save all input tensors.
+ This is used to prevent the gradients of the inputs being computed.
+ """
+
+ @staticmethod
+ def forward(ctx, *args):
+ """This function does nothing but save all input tensors."""
+ tensors = [o.detach() for o in args if isinstance(o, paddle.Tensor)]
+ ctx.save_for_backward(*tensors)
+ # Return a dummy tensor which will be automatically released by the framework.
+ return paddle.empty((0,), dtype=tensors[0].dtype)
+
+ @staticmethod
+ def backward(ctx, *args):
+ """Should not be called since we don't support backward on this graph."""
+ raise AssertionError("Did not expect to backward on this graph")
+
+
+class RefinedRecomputeFunction:
+ """refined recompute for function"""
+
+ def __init__(self):
+ """
+ initialize the RefinedRecomputeFunction object.
+ """
+ self.is_init = False
+
+ def post_init(self, function, function_name=None):
+ """
+ post init the RefinedRecomputeFunction object.
+ """
+ if not self.is_init:
+ if function_name is None:
+ function_name = f"{function.__module__}.{function.__name__}"
+ self._hold_tensors_queue = queue.Queue()
+ global_rr_queue_log.update(self._hold_tensors_queue, function_name)
+ self.function = function
+ self.function_name = function_name
+ self.is_init = True
+
+ def __call__(self, function, *args, **kwargs):
+ """
+ call the RefinedRecomputeFunction object.
+ """
+ # in paddle.no_grad(), return the original output
+ if not framework._dygraph_tracer()._has_grad:
+ return function(*args, **kwargs)
+ self.post_init(function)
+ return self.forward(*args, **kwargs)
+
+ def forward(self, *args, **kwargs):
+ """Refined Recompute Forward"""
+ if is_second_fwd():
+ output = self._second_fwd(*args, **kwargs)
+ else:
+ output = self._first_fwd(*args, **kwargs)
+ return output
+
+ def _first_fwd(self, *args, **kwargs):
+ """
+ do the first forward
+ """
+ input_args = self.parse_to_args(*args, **kwargs)
+
+        # Dispatch on the wrapped function
+ if self.function_name in [
+ "paddle.nn.functional.linear",
+ "paddle.nn.functional.common.linear",
+ "paddle.incubate.nn.functional.fused_linear",
+ "paddle.incubate.nn.functional.fused_matmul_bias.fused_linear",
+ ] or self.function_name.endswith("linear_reduce_scatter"):
+ # is linear function
+ outputs = self.function(*input_args)
+ self._hold_tensors_queue.put([outputs])
+ return outputs
+ else:
+ if (
+ self.function_name
+ == "paddle.nn.functional.flash_attention.flashmask_attention"
+ ):
+ kwargs["return_softmax_lse"] = True
+ kwargs["return_seed_offset"] = True
+ outputs = self.function(
+ *args, **kwargs
+ ) # outputs is [out, result_softmax_lse, result_seed_offset]
+ elif (
+ self.function_name
+ == "paddle.nn.functional.flash_attention.flash_attention_with_sparse_mask"
+ ):
+ kwargs["return_softmax"] = False
+ kwargs["return_softmax_lse"] = True
+ kwargs["return_seed_offset"] = True
+ outputs = self.function(
+ *args, **kwargs
+ ) # outputs is [out, result_softmax_lse, result_seed_offset]
+ elif self.function_name in [
+ "paddle.nn.functional.scaled_dot_product_attention",
+ "paddle.nn.functional.flash_attention.scaled_dot_product_attention",
+ ]:
+ fixed_seed_offset = (None,)
+ return_softmax = False
+ rng_name = ""
+ outputs = list(
+ paddle._C_ops.flash_attn(
+ *input_args[:3],
+ fixed_seed_offset,
+ *input_args[3:6],
+ return_softmax,
+ not input_args[6],
+ rng_name,
+ )
+ )
+ outputs.pop(
+ 1
+ ) # outputs is [out, result_softmax_lse, result_seed_offset]
+ else:
+ raise ValueError(
+ f"Unknown function: {self.function_name}, please implement it first!"
+ )
+ self._hold_tensors_queue.put(outputs)
+ return outputs[0]
+
+ def _second_fwd(self, *args, **kwargs):
+ """
+        Run the second (recomputation) forward pass using the cached outputs.
+ """
+ assert not self._hold_tensors_queue.empty(), "queue should not be empty"
+ input_args = self.parse_to_args(*args, **kwargs)
+ hold_tensors = self._hold_tensors_queue.get()
+ if len(hold_tensors) == 1: # is linear function
+ _NoopSaveInputs.apply(*input_args[:2])
+ else: # is flash function
+ _NoopSaveInputs.apply(*input_args, *hold_tensors)
+ return hold_tensors[0]
+
+ def parse_to_args(self, *args, **kwargs):
+ """
+ parse the input arguments and keywords to a list of arguments.
+ """
+ input_args = []
+ dyfunc_sig = inspect.signature(self.function)
+ bound_args = dyfunc_sig.bind(*args, **kwargs)
+ bound_args.apply_defaults()
+
+ for arg, param in zip(
+ bound_args.arguments.values(), dyfunc_sig.parameters.values()
+ ):
+ if param.kind == param.VAR_POSITIONAL:
+ input_args.extend(arg)
+ elif param.kind in (
+ param.POSITIONAL_ONLY,
+ param.POSITIONAL_OR_KEYWORD,
+ ):
+ input_args.append(arg)
+ elif param.kind == param.VAR_KEYWORD:
+ input_args.extend(arg.values())
+ elif param.kind == param.KEYWORD_ONLY:
+ input_args.append(arg)
+ else:
+ raise ValueError("Unknown parameter kind.")
+ return input_args
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_sequence_parallel_utils.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_sequence_parallel_utils.py
new file mode 100644
index 0000000000..19f062e415
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_sequence_parallel_utils.py
@@ -0,0 +1,339 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import hashlib
+
+import numpy as np
+import paddle
+from paddle import distributed as dist
+from paddle.autograd import PyLayer
+from paddle.distributed import fleet
+
+from ._distributed.common_dist_utils import (
+ all_gather_group,
+ all_gather_varlen,
+ mp_slice,
+ reduce_scatter_group,
+ scatter_axis,
+)
+
+if not hasattr(paddle.Tensor, "contiguous"):
+
+ def contiguous(self):
+ """
+ Make the tensor contiguous.
+ """
+ return self
+
+ paddle.Tensor.contiguous = contiguous
+
+
+if not hasattr(paddle.Tensor, "_md5sum"):
+
+ def _md5sum(self):
+ """
+ Calculate the md5sum of the Tensor.
+ """
+ numpy_array = np.array(self)
+ array_bytes = numpy_array.tobytes()
+ return hashlib.md5(array_bytes).hexdigest()
+
+ paddle.Tensor._md5sum = _md5sum
+
+
+class _AllToAll(paddle.autograd.PyLayer):
+ @staticmethod
+ def forward(
+ ctx,
+ input,
+ group,
+ output_split_sizes=None,
+ input_split_sizes=None,
+ ):
+ """
+ All-to-all communication in the group
+
+ Args:
+ ctx (Any): Context object.
+ input (Tensor): Input tensor.
+ group (Group): The group object.
+
+ Returns:
+ Tensor: Output tensor.
+ """
+
+ ctx.group = group
+ ctx.input_split_sizes = input_split_sizes
+ ctx.output_split_sizes = output_split_sizes
+ # return input
+ if dist.get_world_size(group) <= 1:
+ return input
+ if input_split_sizes is None and output_split_sizes is None:
+ output = paddle.empty_like(input)
+ task = dist.stream.alltoall_single(
+ output, input, None, None, group, True, True
+ )
+ task.wait()
+ else:
+ out_sizes = [sum(output_split_sizes)]
+ out_sizes.extend(input.shape[1:])
+ output = paddle.empty(out_sizes, dtype=input.dtype)
+ task = dist.stream.alltoall_single(
+ output,
+ input,
+ output_split_sizes,
+ input_split_sizes,
+ group,
+ sync_op=False,
+ )
+ task.wait()
+ return output
+
+ @staticmethod
+ def backward(ctx, *grad_output):
+ """
+ all-to-all backward
+
+ """
+ # return grad_output
+ if ctx.input_split_sizes is None and ctx.output_split_sizes is None:
+ return _AllToAll.apply(*grad_output, ctx.group)
+ else:
+ return _AllToAll.apply(
+ *grad_output, ctx.group, ctx.input_split_sizes, ctx.output_split_sizes
+ )
+
+
+class AllGatherVarlenOpV2(PyLayer):
+ """
+ Custom PyLayer for variable-length all-gather operation with autograd support.
+ """
+
+ @staticmethod
+ def forward(ctx, input, indices, axis=0, group=None):
+ """forward"""
+ ctx.axis = axis
+ ctx.group = group
+ ctx.indices = indices
+ return all_gather_varlen(input, indices, axis=axis, group=group)
+
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+ return mp_slice(grad, ctx.indices, axis=ctx.axis, group=ctx.group)
+
+
+class SliceVarlenOp(PyLayer):
+ """
+ Each rank slices a variable-length portion from the **same** sequence.
+ During backward pass, gradients from all ranks are aggregated to restore
+ the mp (model parallelism) synchronization state.
+
+ This is the variable-length version of `ScatterOp`. The inverse operation is `VarlenGatherOp`.
+
+ Args:
+ input: Tensor [S,*]
+        indices: Slice lengths for each rank
+        group: Communication group (optional)
+ Returns:
+ Sliced Tensor
+ """
+
+ @staticmethod
+ def forward(
+ ctx,
+ input,
+ indices,
+ group=None,
+ ):
+ """forward"""
+ ctx.indices = indices
+ ctx.group = group
+ ret = mp_slice(input, indices, group=ctx.group)
+ return ret
+
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+        return all_gather_varlen(grad, ctx.indices, group=ctx.group)
+
+
+class ScatterOp(PyLayer):
+ """
+ Each rank slices its own portion from the **same** sequence (uniformly split).
+ During backward pass, gradients from all ranks are aggregated to restore
+ the mp (model parallelism) synchronization state.
+ The inverse operation is `GatherOp`.
+
+ input: Tensor [S,*]
+
+ Note: Not related to `distributed.scatter`.
+ """
+
+ @staticmethod
+ def forward(ctx, input, axis=0, group=None):
+ """forward"""
+ ctx.axis = axis
+ ctx.group = group
+ return scatter_axis(input, axis=axis, group=ctx.group)
+
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+ return all_gather_group(grad, axis=ctx.axis, group=ctx.group)
+
+
+SliceOp = ScatterOp  # `ScatterOp` behaves like a slice along the sequence axis
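+
+# Illustrative shapes (assuming mp degree n = 4 and an [S, B, H] activation):
+#   ScatterOp.apply(x)       : [S, B, H]   -> [S/4, B, H]  (each rank keeps its slice)
+#   GatherOp.apply(x_local)  : [S/4, B, H] -> [S, B, H]    (all-gather along axis 0)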
+
+
+class GatherOp(PyLayer):
+ """
+ input shape: [s/n, b, h], n is mp parallelism
+ after forward shape: [s, b, h]
+    Behavior is similar to `AllGather`, but gradients are not aggregated in backward
+    (each rank only takes back its own slice); the output moves from the MP-asynchronous
+    state to the MP-synchronous state.
+ """
+
+ @staticmethod
+ def forward(ctx, input, axis=0, group=None):
+ """forward"""
+ ctx.axis = axis
+ ctx.group = group
+ return all_gather_group(input, axis=axis, group=group)
+
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+ return scatter_axis(grad, axis=ctx.axis, group=ctx.group)
+
+
+class AllGatherOp(PyLayer):
+ """
+ input shape: [s/n, b, h], n is mp parallelism
+ after forward shape: [s, b, h]
+    Behavior is similar to `AllGather`, and gradients are aggregated in backward via
+    reduce-scatter; after the all-gather the tensor is still in the MP-asynchronous state.
+ """
+
+ @staticmethod
+ def forward(ctx, input, group=None):
+ """forward"""
+ ctx.group = group
+ return all_gather_group(input, group=group)
+
+ # grad shape: [s, b, h], n is mp parallelism
+    # after backward shape: [s/n, b, h]
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+ return reduce_scatter_group(grad, group=ctx.group)
+
+
+class AllGatherVarlenOp(PyLayer):
+ """the shape of allgather can be not same for each rank"""
+
+ @staticmethod
+ def forward(ctx, input, group=None):
+ """forward"""
+ hcg = fleet.get_hybrid_communicate_group()
+ if group is None:
+ group = hcg.get_model_parallel_group()
+
+ shape0 = paddle.to_tensor([input.shape[0]])
+ shape0_all = paddle.empty(shape=[group.nranks], dtype=shape0.dtype)
+ dist.stream.all_gather(shape0_all, shape0, group=group, use_calc_stream=True)
+ shape0_all = shape0_all.numpy()
+ max_shape0 = shape0_all.max()
+
+ indices = []
+ for idx, s in enumerate(shape0_all):
+ offset = idx * max_shape0
+ indices.append(list(range(offset, offset + s)))
+ indices = np.concatenate(indices, axis=0)
+ indices = indices.reshape([-1] + [1] * (len(input.shape) - 1))
+ indices = paddle.to_tensor(indices, dtype=paddle.int32)
+
+ padding = max_shape0 - input.shape[0]
+
+ ctx.shape0 = input.shape[0]
+ ctx.max_shape0 = max_shape0
+ ctx.shape0_all = shape0_all
+ ctx.padding = padding
+ ctx.indices = indices
+ ctx.group = group
+
+ if padding > 0:
+ input_shape = input.shape
+ input_shape[0] = padding
+ padding_tensor = paddle.empty(shape=input_shape, dtype=input.dtype)
+ input = paddle.concat([input, padding_tensor], axis=0)
+ output = all_gather_group(input, group)
+ output = paddle.take_along_axis(output, indices, axis=0)
+
+ return output
+
+ @staticmethod
+ def backward(ctx, grad):
+ """backward"""
+ input_shape = grad.shape
+ input_shape[0] = ctx.max_shape0 * ctx.shape0_all.shape[0]
+ output = paddle.zeros(shape=input_shape, dtype=grad.dtype)
+
+ grad = paddle.scatter(output, ctx.indices, grad)
+
+ grad = scatter_axis(grad, ctx.group)
+
+ if ctx.padding > 0:
+ grad = grad[: ctx.shape0]
+ return grad
+
+
+def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100):
+ """allgather sparse label and return sparse idx"""
+ hcg = fleet.get_hybrid_communicate_group()
+ group = hcg.get_model_parallel_group()
+ # parallelism = group.nranks
+ labels = labels.flatten()
+ labels_local = paddle.split(labels, group.nranks)[group.rank]
+
+ tgt_index = paddle.nonzero(labels_local != ignore_label).squeeze()
+ if tgt_index.numel() == 0:
+ tgt_index = paddle.to_tensor([0])
+
+ tgt_index = tgt_index.reshape([-1]).astype(paddle.int32)
+ labels_local_gather = paddle.take_along_axis(labels_local, tgt_index, axis=0)
+ labels_all_gather = AllGatherVarlenOp.apply(labels_local_gather)
+ return labels_all_gather, tgt_index.reshape([-1, 1])
+
+
+###################################################
+# #
+# Modified Parallel Linear Operator #
+# #
+###################################################
+
+
+def mark_as_sequence_parallel_parameter(parameter):
+ parameter.sequence_parallel = True
+
+
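+# MPScale scales activations by 1 / mp_degree in the forward pass while passing
+# gradients through unchanged; it is typically paired with a collective that sums
+# partial results across model-parallel ranks.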
+class MPScale(PyLayer):
+ @staticmethod
+ def forward(ctx, x, mp_degree):
+ """forward"""
+ out = paddle.scale(x, 1.0 / mp_degree)
+ return out
+
+ @staticmethod
+ def backward(ctx, dout):
+ """backward"""
+ return dout
diff --git a/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py
new file mode 100644
index 0000000000..a4a3c4a0c1
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/modeling/paddleocr_vl/_siglip.py
@@ -0,0 +1,860 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py
+# Original header:
+# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# TODO: Weight initialization
+
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ....common.vlm.activations import ACT2FN
+from ....common.vlm.transformers import PretrainedModel
+from ....common.vlm.transformers.model_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPooling,
+)
+from ._config import PaddleOCRVisionConfig, PaddleOCRVLConfig
+
+
+def rotate_half(x):
+ Dh = x.shape[-1]
+ x1 = x[..., : Dh // 2]
+ x2 = x[..., Dh // 2 :]
+ return paddle.concat([-x2, x1], axis=-1)
+
+
+def _ensure_cos_sin_dim(cos, sin, dim_needed):
+ last = cos.shape[-1]
+ if last == dim_needed:
+ return cos, sin
+ elif last * 2 == dim_needed:
+ cos = paddle.concat([cos, cos], axis=-1)
+ sin = paddle.concat([sin, sin], axis=-1)
+ return cos, sin
+ else:
+ raise ValueError(
+ f"Unexpected cos/sin last-dim: {last}, expected {dim_needed} or {dim_needed//2}"
+ )
+
+
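+# Standard rotate-half RoPE: q' = q * cos + rotate_half(q) * sin, where rotate_half
+# maps the two halves [x1, x2] of the head dimension to [-x2, x1]; the same rotation
+# is applied to k.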
+def apply_rotary_pos_emb_vision(q, k, cos, sin):
+ orig_q_dtype, orig_k_dtype = q.dtype, k.dtype
+ q = q.astype("float32")
+ k = k.astype("float32")
+
+ Dh = q.shape[-1]
+ cos = cos.astype("float32")
+ sin = sin.astype("float32")
+ cos, sin = _ensure_cos_sin_dim(cos, sin, Dh)
+
+ cos = cos.unsqueeze(-2)
+ sin = sin.unsqueeze(-2)
+
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed.astype(orig_q_dtype), k_embed.astype(orig_k_dtype)
+
+
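+# Plain scaled dot-product attention: softmax(Q K^T * scaling + mask) V, with the
+# softmax computed in float32 and the result cast back to the query dtype.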
+def eager_attention_forward(
+ module,
+ query,
+ key,
+ value,
+ attention_mask,
+ scaling: float,
+ dropout: float = 0.0,
+ **kwargs,
+):
+ attn_weights = paddle.matmul(query, key.transpose((0, 1, 3, 2))) * scaling
+ if attention_mask is not None:
+ attn_weights = attn_weights + attention_mask
+
+ attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query.dtype)
+ attn_weights = F.dropout(attn_weights, p=dropout, training=module.training)
+
+ attn_output = paddle.matmul(attn_weights, value)
+ attn_output = attn_output.transpose((0, 2, 1, 3)).contiguous()
+
+ return attn_output, attn_weights
+
+
+class SiglipAttention(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.embed_dim // self.num_heads
+ assert self.head_dim * self.num_heads == self.embed_dim
+ self.scale = self.head_dim**-0.5
+ self.dropout = getattr(config, "attention_dropout", 0.0)
+ self.is_causal = False
+
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: paddle.Tensor, # [B, L, D]
+ attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ cu_seqlens: Optional[List[paddle.Tensor]] = None,
+ rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None, # (cos, sin)
+ ):
+ B, L, D = hidden_states.shape
+
+ q = self.q_proj(hidden_states)
+ k = self.k_proj(hidden_states)
+ v = self.v_proj(hidden_states)
+
+ # [B, L, H, Dh]
+
+ q = q.reshape([B, L, self.num_heads, self.head_dim])
+ k = k.reshape([B, L, self.num_heads, self.head_dim])
+ v = v.reshape([B, L, self.num_heads, self.head_dim])
+ if rope_emb is not None:
+ cos, sin = rope_emb
+ q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
+
+ # → [B, H, L, Dh]
+ q = q.transpose([0, 2, 1, 3])
+ k = k.transpose([0, 2, 1, 3])
+ v = v.transpose([0, 2, 1, 3])
+
+ attn_output, attn_weights = eager_attention_forward(
+ self,
+ q,
+ k,
+ v,
+ attention_mask,
+ is_causal=self.is_causal,
+ scaling=self.scale,
+ dropout=0.0 if not self.training else self.dropout,
+ )
+ attn_output = attn_output.reshape([B, L, D]).contiguous()
+
+ attn_output = self.out_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights
+
+
+class SiglipVisionEmbeddings(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.embed_dim = config.hidden_size # 1152
+ self.image_size = config.image_size # 384
+ self.patch_size = config.patch_size # 14
+
+        # NOTE: Paddle expects "VALID" (or 0) here for no padding
+ self.patch_embedding = nn.Conv2D(
+ in_channels=config.num_channels,
+ out_channels=self.embed_dim,
+ kernel_size=self.patch_size,
+ stride=self.patch_size,
+ padding="VALID",
+ )
+
+ self.num_patches = (self.image_size // self.patch_size) ** 2 # 729
+ self.num_positions = self.num_patches
+ self.cache_position_embedding = dict()
+ self.cache_position_count = dict()
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+ self.packing_position_embedding = nn.Embedding(32768, self.embed_dim)
+
+ self.register_buffer(
+ "position_ids",
+ paddle.arange(self.num_positions).unsqueeze(0),
+ persistable=False,
+ )
+
+ def interpolate_pos_encoding(
+ self, embeddings, height: int, width: int, is_after_patchify: bool = False
+ ):
+
+ num_positions = self.position_embedding.weight.shape[0]
+
+ patch_pos_embed = self.position_embedding.weight.unsqueeze(0)
+
+ dim = embeddings.shape[-1]
+
+ if is_after_patchify:
+ new_height = height
+ new_width = width
+ else:
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = paddle.to_tensor(num_positions**0.5, dtype=paddle.int64)
+ patch_pos_embed = patch_pos_embed.reshape(
+ (1, sqrt_num_positions, sqrt_num_positions, dim)
+ )
+ patch_pos_embed = patch_pos_embed.transpose((0, 3, 1, 2))
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bilinear",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.transpose((0, 2, 3, 1)).reshape((1, -1, dim))
+ return patch_pos_embed
+
+ @staticmethod
+ def flatten_list(image_grid_thw):
+ tmp_image_grid_thw = list()
+ for image_grid in image_grid_thw:
+ if isinstance(image_grid, list):
+ tmp_image_grid_thw.extend(image_grid)
+ else:
+ tmp_image_grid_thw.append(image_grid)
+ return tmp_image_grid_thw
+
+ def fetch_position_embedding_lfu_cache(self, embeddings, h, w, max_cache=20):
+ grid = (h, w)
+ if grid in self.cache_position_embedding:
+ self.cache_position_count[grid] += 1
+ return self.cache_position_embedding[grid]
+
+ if len(self.cache_position_embedding) >= max_cache:
+ min_hit_grid = min(
+ self.cache_position_count, key=self.cache_position_count.get
+ )
+ self.cache_position_count.pop(min_hit_grid)
+ self.cache_position_embedding.pop(min_hit_grid)
+
+ position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True)
+ self.cache_position_count[grid] = 1
+ self.cache_position_embedding[grid] = position_embedding
+ return position_embedding
+
+ def forward(
+ self,
+ pixel_values: paddle.Tensor, # [B, L, C, H, W]
+ position_ids: Optional[paddle.Tensor] = None, # [B or 1, S]
+ image_grid_thw: Optional[
+ List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
+ ] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> paddle.Tensor:
+ if pixel_values.dim() == 5:
+ assert position_ids is not None
+ from einops import rearrange
+
+            batch_size, sequence_len, channel, height, width = pixel_values.shape
+ target_dtype = self.patch_embedding.weight.dtype
+ pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w")
+ patch_embeds = self.patch_embedding(
+ pixel_values.to(dtype=target_dtype)
+ ) # shape = [*, width, grid, grid]
+ embeddings = patch_embeds.flatten(-2).squeeze(-1)
+ embeddings = rearrange(
+ embeddings, "(b l) d -> b l d", b=batch_size, l=squence_len
+ )
+
+            # TODO: this branch has not been debugged yet
+ if interpolate_pos_encoding and image_grid_thw is not None:
+ flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+ assert batch_size == 1
+ start = 0
+ image_embedding_list = list()
+
+ assert (
+ sum([np.prod(x) for x in flatten_image_grid_thw])
+ == embeddings.shape[1]
+ ), (flatten_image_grid_thw, embeddings.shape)
+ embeddings = embeddings.squeeze(0)
+ tmp_embeddings = list()
+ for image_grid in image_grid_thw:
+ t, h, w = image_grid
+ end = start + t * h * w
+ image_embeddings = embeddings[int(start) : int(end), :]
+ position_embedding = (
+ self.interpolate_pos_encoding(image_embeddings, h, w, True)
+ .squeeze(0)
+ .tile((t, 1))
+ )
+ image_embeddings = image_embeddings + position_embedding
+ tmp_embeddings.append(image_embeddings)
+ start = end
+ embeddings = paddle.concat(tmp_embeddings, axis=0).unsqueeze(0)
+ else:
+ embeddings = embeddings + self.packing_position_embedding(position_ids)
+ return embeddings
+ else:
+ raise NotImplementedError(str(pixel_values.shape))
+
+
+class SiglipMLP(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.activation_fn = ACT2FN[config.hidden_act]
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+ def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
+ hidden_states = self.fc1(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states = self.fc2(hidden_states)
+ return hidden_states
+
+
+class SiglipEncoderLayer(paddle.nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.layer_norm1 = paddle.nn.LayerNorm(
+ self.embed_dim, epsilon=config.layer_norm_eps
+ )
+ self.self_attn = SiglipAttention(config)
+ self.layer_norm2 = paddle.nn.LayerNorm(
+ self.embed_dim, epsilon=config.layer_norm_eps
+ )
+ self.mlp = SiglipMLP(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ output_attentions=False,
+ cu_seqlens=None,
+ rope_emb=None,
+ ):
+
+ residual = hidden_states
+ ############################
+ ln1_out = self.layer_norm1(hidden_states)
+
+ x, attn_w = self.self_attn(
+ hidden_states=ln1_out,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ cu_seqlens=cu_seqlens,
+ rope_emb=rope_emb,
+ )
+
+ hs_post_attn = residual + x
+
+ residual = hs_post_attn
+ ln2_out = self.layer_norm2(residual)
+
+ mlp_out = self.mlp(ln2_out)
+
+ hidden_states_out = residual + mlp_out
+
+ outputs = (hidden_states_out,)
+ if output_attentions:
+ outputs += (attn_w,)
+ return outputs
+
+
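+# Precomputes rotary angles: for position p and channel pair i, the angle is
+# p * inv_freq[i] with inv_freq[i] = theta ** (-2i / dim); forward(seqlen) returns
+# the [seqlen, dim // 2] table of angles that the encoder turns into cos/sin tables.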
+class SigLIPRotaryEmbedding(nn.Layer):
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
+ super().__init__()
+ self.dim = dim
+ self.theta = theta
+ self.rope_init()
+
+ def rope_init(self):
+ arange = paddle.arange(0, self.dim, 2, dtype="float32")
+ inv_freq = 1.0 / (self.theta ** (arange / self.dim))
+ self.register_buffer("inv_freq", inv_freq, persistable=False)
+
+ def forward(self, seqlen: int) -> paddle.Tensor:
+ seq = paddle.arange(seqlen, dtype=self.inv_freq.dtype)
+ freqs = paddle.outer(seq, self.inv_freq)
+ return freqs
+
+
+class SiglipEncoder(nn.Layer):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+ num_heads = config.num_attention_heads
+ head_dim = embed_dim // num_heads
+ self.layers = nn.LayerList(
+ [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)]
+ )
+ self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2)
+ self.gradient_checkpointing = False
+
+ @staticmethod
+ def flatten_list(image_grid_thw):
+ tmp_image_grid_thw = list()
+ for image_grid in image_grid_thw:
+ if isinstance(image_grid, list):
+ tmp_image_grid_thw.extend(image_grid)
+ else:
+ tmp_image_grid_thw.append(image_grid)
+ return tmp_image_grid_thw
+
+ def build_window_index(self, image_grid, window_size):
+ """
+        Returns:
+            window_indices: int64 [sum(t*h*w_valid)]
+            cu_seqlens_within_windows: int32 [num_windows_total*t], prefix sums with a leading 0
+ """
+ from einops import rearrange
+
+ window_indices = list()
+ pad_values = -100
+ start_window_index = 0
+ cu_seqlens_within_windows = list()
+
+        for grid in image_grid:
+            t, h, w = map(int, grid)
+ window_index = paddle.arange(t * h * w).reshape((t, h, w))
+ pad_h = (-h) % window_size
+ pad_w = (-w) % window_size
+ assert pad_h >= 0 and pad_w >= 0, (pad_h, pad_w)
+ window_index = F.pad(window_index, (0, pad_w, 0, pad_h), value=pad_values)
+ window_index = rearrange(
+ window_index,
+ "t (h p1) (w p2) -> t (h w) (p1 p2)",
+ p1=window_size,
+ p2=window_size,
+ )
+ window_seqlens = (window_index != pad_values).long().sum(-1).reshape(-1)
+ window_index = window_index.reshape(-1)
+ window_index = window_index[window_index != pad_values]
+ window_indices.append(window_index + start_window_index)
+ cu_seqlens_within_windows.append(
+ window_seqlens.cumsum(0) + start_window_index
+ )
+ start_window_index += t * h * w
+ window_indices = paddle.concat(window_indices, axis=0)
+ cu_seqlens_within_windows = paddle.concat(cu_seqlens_within_windows, axis=0)
+ cu_seqlens_within_windows = F.pad(
+ cu_seqlens_within_windows, (1, 0), value=0
+ ).astype("int32")
+ return window_indices, cu_seqlens_within_windows
+
+ def forward(
+ self,
+ inputs_embeds: paddle.Tensor,
+ attention_mask: Optional[paddle.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ cu_seqlens: Optional[paddle.Tensor] = None,
+ image_grid_thw: Optional[
+ List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
+ ] = None,
+ height_position_ids: Optional[paddle.Tensor] = None,
+ width_position_ids: Optional[paddle.Tensor] = None,
+ use_rope: Optional[bool] = False,
+ window_size: Optional[int] = -1,
+ vision_or_text: str = "vision",
+ ):
+
+ vision_or_text = "vision"
+ assert vision_or_text in ["vision", "text"]
+ use_window_attn = window_size > 0 and vision_or_text == "vision"
+ use_rope = (use_rope is True) and (vision_or_text == "vision")
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ hidden_states = inputs_embeds
+ attention_mask = (
+ attention_mask.to(inputs_embeds.dtype)
+ if attention_mask is not None
+ else None
+ )
+
+ if use_rope is True:
+ flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+ assert (
+ sum([np.prod(x) for x in flatten_image_grid_thw])
+ == hidden_states.shape[1]
+ ), (flatten_image_grid_thw, hidden_states.shape)
+
+ if width_position_ids is None or height_position_ids is None:
+ split_hids = list()
+ split_wids = list()
+ for t, h, w in flatten_image_grid_thw:
+ t, h, w = map(int, (t, h, w))
+ image_pids = paddle.arange(t * h * w) % (h * w)
+ sample_hids = image_pids // w
+ sample_wids = image_pids % w
+ split_hids.append(sample_hids)
+ split_wids.append(sample_wids)
+ width_position_ids = paddle.concat(split_wids, axis=0)
+ height_position_ids = paddle.concat(split_hids, axis=0)
+
+ window_indices, cu_seqlens_within_windows = None, None
+
+ if use_window_attn:
+ window_indices, cu_seqlens_within_windows = self.build_window_index(
+ flatten_image_grid_thw, window_size
+ )
+ reversed_window_indices = window_indices.argsort()
+ height_position_ids = height_position_ids[window_indices]
+ width_position_ids = width_position_ids[window_indices]
+
+ pids = paddle.stack(
+ [height_position_ids, width_position_ids], axis=-1
+ ).astype(paddle.int64)
+ max_grid_size = pids.max() + 1
+ rope_emb_max_grid = self.rotary_pos_emb(max_grid_size)
+
+ rope_emb = rope_emb_max_grid[pids].flatten(1)
+
+ rope_emb = rope_emb.tile((1, 2))
+ rope_emb = (rope_emb.cos(), rope_emb.sin())
+
+ else:
+ rope_emb = None
+
+ window_indices, cu_seqlens_within_windows = None, None
+
+ if use_window_attn:
+ flatten_image_grid_thw = self.flatten_list(image_grid_thw)
+ assert (
+ sum(
+ [
+ np.prod(x.astype("float32").cpu().numpy())
+ for x in flatten_image_grid_thw
+ ]
+ )
+ == hidden_states.shape[1]
+ ), (flatten_image_grid_thw, hidden_states.shape)
+
+ window_indices, cu_seqlens_within_windows = self.build_window_index(
+ flatten_image_grid_thw, window_size
+ )
+ reversed_window_indices = window_indices.argsort()
+
+ if use_window_attn:
+ assert cu_seqlens_within_windows is not None
+ attn_cu_seqlens = cu_seqlens_within_windows
+ hidden_states = hidden_states[:, window_indices, :]
+ else:
+ attn_cu_seqlens = cu_seqlens
+
+ for encoder_layer in self.layers:
+ if output_hidden_states:
+ encoder_states = encoder_states + (
+ (hidden_states[:, reversed_window_indices, :],)
+ if use_window_attn
+ else (hidden_states,)
+ )
+
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ output_attentions=output_attentions,
+ cu_seqlens=attn_cu_seqlens,
+ rope_emb=rope_emb,
+ )
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if use_window_attn:
+ hidden_states = hidden_states[:, reversed_window_indices, :]
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=encoder_states,
+ attentions=all_attentions,
+ )
+
+
+class SiglipMultiheadAttentionPoolingHead(nn.Layer):
+ """Multihead Attention Pooling."""
+
+ def __init__(self, config: PaddleOCRVisionConfig):
+ super().__init__()
+
+ self.probe = self.create_parameter(
+ shape=(1, 1, config.hidden_size),
+ default_initializer=paddle.nn.initializer.Normal(),
+ )
+ self.attention = nn.MultiHeadAttention(
+ config.hidden_size, config.num_attention_heads
+ )
+ self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps)
+ self.mlp = SiglipMLP(config)
+
+ def forward(self, hidden_state, key_padding_mask=None):
+ batch_size = hidden_state.shape[0]
+ probe = self.probe.tile((batch_size, 1, 1))
+
+ hidden_state = self.attention(
+ probe, hidden_state, hidden_state, key_padding_mask=key_padding_mask
+ )[0]
+
+ residual = hidden_state
+ hidden_state = self.layernorm(hidden_state)
+ hidden_state = residual + self.mlp(hidden_state)
+
+ return hidden_state[:, 0]
+
+
+class SiglipVisionTransformer(nn.Layer):
+ def __init__(self, config: PaddleOCRVisionConfig):
+ super().__init__()
+ self.config = config
+ embed_dim = config.hidden_size
+
+ self.embeddings = SiglipVisionEmbeddings(config)
+ self.encoder = SiglipEncoder(config)
+ self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps)
+ self.use_head = (
+ True if not hasattr(config, "vision_use_head") else config.vision_use_head
+ )
+ if self.use_head:
+ self.head = SiglipMultiheadAttentionPoolingHead(config)
+
+ def forward(
+ self,
+ pixel_values,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: Optional[bool] = False,
+ attention_mask=None,
+ sample_indices=None,
+ image_indices=None,
+ position_ids=None,
+ height_position_ids=None,
+ width_position_ids=None,
+ cu_seqlens=None,
+ padding_mask=None,
+ vision_return_embed_list: Optional[bool] = False,
+ image_grid_thw: Optional[
+ List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
+ ] = None,
+ return_pooler_output: Optional[bool] = True,
+ use_rope: Optional[bool] = False,
+ window_size: Optional[bool] = -1,
+ ) -> BaseModelOutputWithPooling:
+ output_attentions = (
+ output_attentions
+ if output_attentions is not None
+ else self.config.output_attentions
+ )
+ output_hidden_states = (
+ output_hidden_states
+ if output_hidden_states is not None
+ else self.config.output_hidden_states
+ )
+ hidden_states = self.embeddings(
+ pixel_values,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ image_grid_thw=image_grid_thw,
+ )
+
+ encoder_outputs: BaseModelOutput = self.encoder(
+ inputs_embeds=hidden_states,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ attention_mask=attention_mask,
+ cu_seqlens=cu_seqlens,
+ image_grid_thw=image_grid_thw,
+ use_rope=use_rope,
+ height_position_ids=height_position_ids,
+ width_position_ids=width_position_ids,
+ window_size=window_size,
+ vision_or_text="vision",
+ )
+
+ last_hidden_state = encoder_outputs.last_hidden_state
+ last_hidden_state = self.post_layernorm(last_hidden_state)
+
+ if return_pooler_output is True:
+ if sample_indices is not None:
+ assert self.use_head is True
+ dim = last_hidden_state.shape[-1]
+ sample_hidden_state_list = list()
+
+ hidden_state = last_hidden_state.squeeze(0)
+ sample_index = sample_indices
+ unique_sample_index = (
+ paddle.unique(sample_index).sort().values.unbind(0)
+ )
+ unique_sample_index = list(unique_sample_index)
+ if len(unique_sample_index) > 0 and unique_sample_index[0] == -1:
+ unique_sample_index = unique_sample_index[1:]
+ for sample_idx in unique_sample_index:
+ token_indices = (sample_index == sample_idx).nonzero().flatten()
+ sample_hidden_state = hidden_state[token_indices]
+ sample_hidden_state_list.append(sample_hidden_state)
+
+ if not vision_return_embed_list:
+ max_length = max(
+ [_state.shape[0] for _state in sample_hidden_state_list]
+ )
+ tmp_sample_hidden_state_list = list()
+ padding_mask = list()
+ for idx, _state in enumerate(sample_hidden_state_list):
+ padding_length = max_length - _state.shape[0]
+ mask = _state.new_zeros(size=(max_length,), dtype=paddle.int64)
+ mask[-padding_length:] = 1
+ padding_mask.append(mask)
+ padding = _state.new_zeros(size=(padding_length, dim))
+ new_state = paddle.concat([_state, padding], axis=0)
+ tmp_sample_hidden_state_list.append(new_state)
+ sample_hidden_state = paddle.stack(
+ tmp_sample_hidden_state_list, axis=0
+ )
+ padding_mask = (
+ paddle.stack(padding_mask, axis=0)
+ .astype("float32")
+ .to(last_hidden_state.dtype)
+ )
+ pooler_output = self.head(
+ sample_hidden_state, key_padding_mask=padding_mask
+ )
+ else:
+ pooler_output = list()
+ for state in sample_hidden_state_list:
+ sample_pooler_output = self.head(state.unsqueeze(0))
+ pooler_output.append(sample_pooler_output)
+ pooler_output = paddle.concat(pooler_output, axis=0)
+ sample_hidden_state = sample_hidden_state_list
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sample_hidden_state,
+ pooler_output=pooler_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+ else:
+ pooler_output = self.head(last_hidden_state) if self.use_head else None
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooler_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ sample_hidden_state = list()
+ assert cu_seqlens is not None
+ for i in range(cu_seqlens.shape[0] - 1):
+ start = cu_seqlens[i]
+ end = cu_seqlens[i + 1]
+ tensor = last_hidden_state[:, start:end, :].squeeze(0)
+ sample_hidden_state.append(tensor)
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sample_hidden_state,
+ pooler_output=None,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class SiglipPreTrainedModel(PretrainedModel):
+ config_class = PaddleOCRVLConfig
+ base_model_prefix = "siglip"
+ supports_gradient_checkpointing = True
+
+ _no_split_modules = [
+ "SiglipTextEmbeddings",
+ "SiglipEncoderLayer",
+ "SiglipVisionEmbeddings",
+ "SiglipMultiheadAttentionPoolingHead",
+ ]
+ _supports_flash_attn_2 = True
+ _supports_sdpa = True
+
+
+class SiglipVisionModel(SiglipPreTrainedModel):
+ config_class = PaddleOCRVisionConfig
+ main_input_name = "pixel_values"
+
+ def __init__(self, config: PaddleOCRVisionConfig):
+ super().__init__(config)
+
+ self.vision_model = SiglipVisionTransformer(config)
+
+ def get_input_embeddings(self) -> nn.Layer:
+ return self.vision_model.embeddings.patch_embedding
+
+ def forward(
+ self,
+ pixel_values,
+ sample_indices=None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: bool = False,
+ position_ids=None,
+ vision_return_embed_list: Optional[bool] = False,
+ image_grid_thw: Optional[
+ List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]]
+ ] = None,
+ cu_seqlens=None,
+ return_pooler_output: Optional[bool] = True,
+ use_rope: Optional[bool] = False,
+ window_size: Optional[bool] = -1,
+ ) -> BaseModelOutputWithPooling:
+ return self.vision_model(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ position_ids=position_ids,
+ vision_return_embed_list=vision_return_embed_list,
+ image_grid_thw=image_grid_thw,
+ sample_indices=sample_indices,
+ cu_seqlens=cu_seqlens,
+ return_pooler_output=return_pooler_output,
+ use_rope=use_rope,
+ window_size=window_size,
+ )
diff --git a/paddlex/inference/models/doc_vlm/predictor.py b/paddlex/inference/models/doc_vlm/predictor.py
index c7fb3a0c77..9d252b454d 100644
--- a/paddlex/inference/models/doc_vlm/predictor.py
+++ b/paddlex/inference/models/doc_vlm/predictor.py
@@ -12,12 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import base64
import copy
+import io
import os
import warnings
-from typing import List
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from threading import Lock
+from typing import List, Optional
+
+import numpy as np
from ....modules.doc_vlm.model_list import MODELS
+from ....utils import logging
+from ....utils.deps import require_genai_client_plugin
from ....utils.device import TemporaryDeviceChanger
from ....utils.env import get_device_type
from ...common.batch_sampler import DocVLMBatchSampler
@@ -32,6 +41,7 @@ class DocVLMPredictor(BasePredictor):
"PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"},
"PP-DocBee2": {"PP-DocBee2-3B"},
"PP-Chart2Table": {"PP-Chart2Table"},
+ "PaddleOCR-VL": {"PaddleOCR-VL-0.9B"},
}
def __init__(self, *args, **kwargs):
@@ -40,18 +50,34 @@ def __init__(self, *args, **kwargs):
*args: Arbitrary positional arguments passed to the superclass.
**kwargs: Arbitrary keyword arguments passed to the superclass.
"""
- import paddle
-
super().__init__(*args, **kwargs)
- self.device = kwargs.get("device", None)
- self.dtype = (
- "bfloat16"
- if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
- and (self.device is None or "cpu" not in self.device)
- else "float32"
- )
- self.infer, self.processor = self._build(**kwargs)
+ if self._use_local_model:
+ import paddle
+
+ self.device = kwargs.get("device", None)
+ self.dtype = (
+ "bfloat16"
+ if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported())
+ and (self.device is None or "cpu" not in self.device)
+ else "float32"
+ )
+
+ self.infer, self.processor = self._build(**kwargs)
+
+ if (
+ self.model_name == "PaddleOCR-VL-0.9B"
+ and self.batch_sampler.batch_size > 1
+ ):
+ logging.warning(
+ "Currently, the PaddleOCR-VL-0.9B local model only supports batch size of 1. The batch size will be updated to 1."
+ )
+ self.batch_sampler.batch_size = 1
+ else:
+ if self.batch_sampler.batch_size > 1:
+ self._thread_pool = ThreadPoolExecutor(
+ max_workers=min(self.batch_sampler.batch_size, os.cpu_count() or 1)
+ )
def _build_batch_sampler(self):
"""Builds and returns an DocVLMBatchSampler instance.
@@ -77,6 +103,7 @@ def _build(self, **kwargs):
             processor: The corresponding processor for the model.
"""
from .modeling import (
+ PaddleOCRVLForConditionalGeneration,
PPChart2TableInference,
PPDocBee2Inference,
PPDocBeeInference,
@@ -116,52 +143,130 @@ def _build(self, **kwargs):
self.model_dir,
dtype=self.dtype,
)
+ elif self.model_name in self.model_group["PaddleOCR-VL"]:
+ if kwargs.get("use_hpip", False):
+ warnings.warn(
+ "The PaddelOCR-VL series does not support `use_hpip=True` for now."
+ )
+ with TemporaryDeviceChanger(self.device):
+ model = PaddleOCRVLForConditionalGeneration.from_pretrained(
+ self.model_dir,
+ dtype=self.dtype,
+ convert_from_hf=True,
+ )
else:
raise NotImplementedError(f"Model {self.model_name} is not supported.")
return model, processor
- def process(self, data: List[dict], **kwargs):
+ def process(
+ self,
+ data: List[dict],
+ max_new_tokens: Optional[int] = None,
+ skip_special_tokens: Optional[bool] = None,
+ repetition_penalty: Optional[float] = None,
+ temperature: Optional[float] = None,
+ top_p: Optional[float] = None,
+ min_pixels: Optional[int] = None,
+ max_pixels: Optional[int] = None,
+ use_cache: Optional[bool] = None,
+ **kwargs,
+ ):
"""
Process a batch of data through the preprocessing, inference, and postprocessing.
Args:
data (List[dict]): A batch of input data, must be a dict (e.g. {"image": /path/to/image, "query": some question}).
- kwargs (Optional[dict]): Arbitrary keyword arguments passed to model.generate.
Returns:
dict: A dictionary containing the raw sample information and prediction results for every instance of the batch.
"""
- assert all(isinstance(i, dict) for i in data)
+ # TODO: Sampling settings
+ # FIXME: When `skip_special_tokens` is `True`, the results from different backends may differ.
- src_data = copy.copy(data)
- # preprocess
- data = self.processor.preprocess(data)
- data = self._switch_inputs_to_device(data)
+ assert all(isinstance(i, dict) for i in data)
- # do infer
- with TemporaryDeviceChanger(self.device):
- preds = self.infer.generate(data, **kwargs)
+ if self._use_local_model:
+ src_data = copy.copy(data)
+ # preprocess
+ data = self.processor.preprocess(data)
+ data = self._switch_inputs_to_device(data)
+
+ # do infer
+ generate_kwargs = {}
+ if max_new_tokens is not None:
+ generate_kwargs["max_new_tokens"] = max_new_tokens
+ elif self.model_name in self.model_group["PaddleOCR-VL"]:
+ generate_kwargs["max_new_tokens"] = 8192
+ if repetition_penalty is not None:
+ warnings.warn(
+ "`repetition_penalty` is currently not supported by the local model and will be ignored."
+ )
+ if temperature is not None:
+ warnings.warn(
+ "`temperature` is currently not supported by the local model and will be ignored."
+ )
+ if top_p is not None:
+ warnings.warn(
+ "`top_p` is currently not supported by the local model and will be ignored."
+ )
+ if min_pixels is not None:
+ warnings.warn(
+ "`min_pixels` is currently not supported by the local model and will be ignored."
+ )
+ if max_pixels is not None:
+ warnings.warn(
+ "`max_pixels` is currently not supported by the local model and will be ignored."
+ )
+ if use_cache is not None:
+ generate_kwargs["use_cache"] = use_cache
+ with TemporaryDeviceChanger(self.device):
+ preds = self.infer.generate(
+ data,
+ **generate_kwargs,
+ )
- # postprocess
- preds = self.processor.postprocess(preds)
+ # postprocess
+ postprocess_kwargs = {}
+ if skip_special_tokens is not None:
+ postprocess_kwargs["skip_special_tokens"] = skip_special_tokens
+ preds = self.processor.postprocess(preds, **postprocess_kwargs)
+ else:
+ require_genai_client_plugin()
+
+ src_data = data
+
+ preds = self._genai_client_process(
+ data,
+ max_new_tokens=max_new_tokens,
+ skip_special_tokens=skip_special_tokens,
+ repetition_penalty=repetition_penalty,
+ temperature=temperature,
+ top_p=top_p,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ )
result_dict = self._format_result_dict(preds, src_data)
return result_dict
def build_processor(self, **kwargs):
from ..common.tokenizer import (
+ LlamaTokenizer,
MIXQwen2_5_Tokenizer,
MIXQwen2Tokenizer,
QWenTokenizer,
)
+ from ..common.tokenizer.tokenizer_utils import ChatTemplate
from .processors import (
GOTImageProcessor,
+ PaddleOCRVLProcessor,
PPChart2TableProcessor,
PPDocBee2Processor,
PPDocBeeProcessor,
Qwen2_5_VLImageProcessor,
Qwen2VLImageProcessor,
+ SiglipImageProcessor,
)
if self.model_name in self.model_group["PP-DocBee"]:
@@ -182,9 +287,29 @@ def build_processor(self, **kwargs):
return PPDocBee2Processor(
image_processor=image_processor, tokenizer=tokenizer
)
+ elif self.model_name in self.model_group["PaddleOCR-VL"]:
+ image_processor = SiglipImageProcessor.from_pretrained(self.model_dir)
+ vocab_file = str(Path(self.model_dir, "tokenizer.model"))
+ tokenizer = LlamaTokenizer.from_pretrained(
+ self.model_dir, vocab_file=vocab_file
+ )
+ # HACK
+ chat_template_file = Path(self.model_dir, "chat_template.jinja")
+ tokenizer.chat_template = ChatTemplate._compile_jinja_template(
+ chat_template_file.read_text(encoding="utf-8")
+ )
+ return PaddleOCRVLProcessor(
+ image_processor=image_processor,
+ tokenizer=tokenizer,
+ )
else:
raise NotImplementedError
+ def close(self):
+ super().close()
+ if hasattr(self, "_thread_pool"):
+ self._thread_pool.shutdown()
+
def _format_result_dict(self, model_preds, src_data):
if not isinstance(model_preds, list):
model_preds = [model_preds]
@@ -251,3 +376,173 @@ def _switch_inputs_to_device(self, input_dict):
for k in input_dict
}
return rst_dict
+
+    def crop_margin(self, img):  # the input is an OpenCV image (numpy array)
+ import cv2
+
+        # If the input is a color image, convert it to grayscale
+ if len(img.shape) == 3:
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+ else:
+ gray = img.copy()
+
+        # Make sure the grayscale image is uint8 (0-255 range)
+ if gray.dtype != np.uint8:
+ gray = gray.astype(np.uint8)
+
+ max_val = gray.max()
+ min_val = gray.min()
+
+ if max_val == min_val:
+ return img
+
+        # Normalize and binarize (same logic as the PIL-based version)
+ data = (gray - min_val) / (max_val - min_val) * 255
+ data = data.astype(np.uint8)
+
+        # Build a binary image (dark regions become white, bright regions black)
+ _, binary = cv2.threshold(data, 200, 255, cv2.THRESH_BINARY_INV)
+
+        # Find the coordinates of all non-zero pixels
+ coords = cv2.findNonZero(binary)
+
+        if coords is None:  # nothing found, return the original image
+ return img
+
+        # Compute the bounding box of the content
+ x, y, w, h = cv2.boundingRect(coords)
+
+        # Crop the image to the bounding box
+ cropped = img[y : y + h, x : x + w]
+
+ return cropped
+
+ def _genai_client_process(
+ self,
+ data,
+ max_new_tokens,
+ skip_special_tokens,
+ repetition_penalty,
+ temperature,
+ top_p,
+ min_pixels,
+ max_pixels,
+ ):
+ lock = Lock()
+
+ def _process(item):
+ image = item["image"]
+ prompt = item["query"]
+ if prompt == "Formula Recognition:":
+ image = self.crop_margin(image)
+ if isinstance(image, str):
+ if image.startswith("http://") or image.startswith("https://"):
+ image_url = image
+ else:
+ from PIL import Image
+
+ with Image.open(image) as img:
+ img = img.convert("RGB")
+ with io.BytesIO() as buf:
+ img.save(buf, format="JPEG")
+ image_url = "data:image/jpeg;base64," + base64.b64encode(
+ buf.getvalue()
+ ).decode("ascii")
+ elif isinstance(image, np.ndarray):
+ import cv2
+ from PIL import Image
+
+ image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+ img = Image.fromarray(image)
+ with io.BytesIO() as buf:
+ img.save(buf, format="JPEG")
+ image_url = "data:image/jpeg;base64," + base64.b64encode(
+ buf.getvalue()
+ ).decode("ascii")
+ else:
+ raise TypeError(f"Not supported image type: {type(image)}")
+
+ if self._genai_client.backend == "fastdeploy-server":
+ kwargs = {
+ "temperature": 1 if temperature is None else temperature,
+ "top_p": 0 if top_p is None else top_p,
+ }
+ else:
+ kwargs = {
+ "temperature": 0 if temperature is None else temperature,
+ }
+ if top_p is not None:
+ kwargs["top_p"] = top_p
+
+ if max_new_tokens is not None:
+ kwargs["max_completion_tokens"] = max_new_tokens
+ elif self.model_name in self.model_group["PaddleOCR-VL"]:
+ kwargs["max_completion_tokens"] = 8192
+
+ kwargs["extra_body"] = {}
+ if skip_special_tokens is not None:
+ if self._genai_client.backend in (
+ "fastdeploy-server",
+ "vllm-server",
+ "sglang-server",
+ ):
+ kwargs["extra_body"]["skip_special_tokens"] = skip_special_tokens
+ else:
+ raise ValueError("Not supported")
+
+ if repetition_penalty is not None:
+ kwargs["extra_body"]["repetition_penalty"] = repetition_penalty
+
+ if min_pixels is not None:
+ if self._genai_client.backend == "vllm-server":
+ kwargs["extra_body"]["mm_processor_kwargs"] = kwargs[
+ "extra_body"
+ ].get("mm_processor_kwargs", {})
+ kwargs["extra_body"]["mm_processor_kwargs"][
+ "min_pixels"
+ ] = min_pixels
+ else:
+ warnings.warn(
+ f"{repr(self._genai_client.backend)} does not support `min_pixels`."
+ )
+
+ if max_pixels is not None:
+ if self._genai_client.backend == "vllm-server":
+ kwargs["extra_body"]["mm_processor_kwargs"] = kwargs[
+ "extra_body"
+ ].get("mm_processor_kwargs", {})
+ kwargs["extra_body"]["mm_processor_kwargs"][
+ "max_pixels"
+ ] = max_pixels
+ else:
+ warnings.warn(
+ f"{repr(self._genai_client.backend)} does not support `max_pixels`."
+ )
+
+ with lock:
+ future = self._genai_client.create_chat_completion(
+ [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image_url", "image_url": {"url": image_url}},
+ {"type": "text", "text": item["query"]},
+ ],
+ }
+ ],
+ return_future=True,
+ **kwargs,
+ )
+ return future
+
+ if len(data) > 1:
+ futures = list(self._thread_pool.map(_process, data))
+ else:
+ futures = [_process(data[0])]
+
+ results = []
+ for future in futures:
+ result = future.result()
+ results.append(result.choices[0].message.content)
+
+ return results
diff --git a/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py b/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py
index 4c55c15c98..2b4fc3e58c 100644
--- a/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py
+++ b/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py
@@ -77,9 +77,11 @@ def preprocess(self, image: Union[str, Image.Image, np.ndarray, Dict, List]):
return {"input_ids": input_ids, "images": images}
@benchmark.timeit
- def postprocess(self, model_pred, *args, **kwargs):
+ def postprocess(self, model_pred, **kwargs):
return self.tokenizer.batch_decode(
- model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+ model_pred[0],
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
+ clean_up_tokenization_spaces=False,
)
def _load_image(self, image_file):
diff --git a/paddlex/inference/models/doc_vlm/processors/__init__.py b/paddlex/inference/models/doc_vlm/processors/__init__.py
index 1031846e45..80bcf22f58 100644
--- a/paddlex/inference/models/doc_vlm/processors/__init__.py
+++ b/paddlex/inference/models/doc_vlm/processors/__init__.py
@@ -13,5 +13,6 @@
# limitations under the License.
from .GOT_ocr_2_0 import GOTImageProcessor, PPChart2TableProcessor
+from .paddleocr_vl import PaddleOCRVLProcessor, SiglipImageProcessor
from .qwen2_5_vl import PPDocBee2Processor, Qwen2_5_VLImageProcessor
from .qwen2_vl import PPDocBeeProcessor, Qwen2VLImageProcessor
diff --git a/paddlex/inference/models/doc_vlm/processors/common.py b/paddlex/inference/models/doc_vlm/processors/common.py
index e2ee195dda..b5cd32fae1 100644
--- a/paddlex/inference/models/doc_vlm/processors/common.py
+++ b/paddlex/inference/models/doc_vlm/processors/common.py
@@ -418,7 +418,7 @@ def process_vision_info(
if "image" in vision_info or "image_url" in vision_info:
image_inputs.append(fetch_image(vision_info))
else:
- raise ValueError("image, image_url should in content.")
+ raise ValueError("image, image_url should be in content.")
if len(image_inputs) == 0:
image_inputs = None
return image_inputs
@@ -426,10 +426,10 @@ def process_vision_info(
def fetch_image(
ele: Dict[str, Union[str, Image.Image]],
- size_factor: int,
- min_pixels: int,
- max_pixels: int,
- max_ratio: float,
+ size_factor: Optional[int] = None,
+ min_pixels: Optional[int] = None,
+ max_pixels: Optional[int] = None,
+ max_ratio: Optional[float] = None,
) -> Image.Image:
if not isinstance(ele, dict):
ele = {"image": ele}
@@ -458,29 +458,41 @@ def fetch_image(
f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}"
)
image = image_obj.convert("RGB")
- # resize
- if "resized_height" in ele and "resized_width" in ele:
- resized_height, resized_width = smart_resize(
- ele["resized_height"],
- ele["resized_width"],
- factor=size_factor,
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- max_ratio=max_ratio,
- )
+
+ if (
+ size_factor is not None
+ and min_pixels is not None
+ and max_pixels is not None
+ and max_ratio is not None
+ ):
+ do_resize = True
else:
- width, height = image.size # Image, not tensor
- min_pixels = ele.get("min_pixels", min_pixels)
- max_pixels = ele.get("max_pixels", max_pixels)
- resized_height, resized_width = smart_resize(
- height,
- width,
- factor=size_factor,
- min_pixels=min_pixels,
- max_pixels=max_pixels,
- max_ratio=max_ratio,
- )
- image = image.resize((resized_width, resized_height))
+ do_resize = False
+
+ if do_resize:
+ # resize
+ if "resized_height" in ele and "resized_width" in ele:
+ resized_height, resized_width = smart_resize(
+ ele["resized_height"],
+ ele["resized_width"],
+ factor=size_factor,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ max_ratio=max_ratio,
+ )
+ else:
+ width, height = image.size # Image, not tensor
+ min_pixels = ele.get("min_pixels", min_pixels)
+ max_pixels = ele.get("max_pixels", max_pixels)
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=size_factor,
+ min_pixels=min_pixels,
+ max_pixels=max_pixels,
+ max_ratio=max_ratio,
+ )
+ image = image.resize((resized_width, resized_height))
return image
diff --git a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/__init__.py b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/__init__.py
new file mode 100644
index 0000000000..1e5bdfafd5
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ._paddleocr_vl import PaddleOCRVLProcessor
+from ._siglip import SiglipImageProcessor
diff --git a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py
new file mode 100644
index 0000000000..9b4dd9857b
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_paddleocr_vl.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/processing_keye.py
+# Original header:
+# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+from typing import List
+
+import paddle
+
+from .....utils.benchmark import benchmark
+from ..common import BatchFeature, fetch_image
+
+
+class PaddleOCRVLProcessor(object):
+ _DEFAULT_TEXT_KWARGS = {
+ "padding": False,
+ "return_tensors": "pd",
+ }
+ _DEFAULT_VIDEO_KWARGS = {
+ "fps": 2.0,
+ "return_tensors": "pd",
+ }
+
+ def __init__(
+ self,
+ image_processor=None,
+ tokenizer=None,
+ ):
+ self.image_token = (
+ "<|IMAGE_PLACEHOLDER|>"
+ if not hasattr(tokenizer, "image_token")
+ else tokenizer.image_token
+ )
+ self.video_token = (
+ "<|video_pad|>"
+ if not hasattr(tokenizer, "video_token")
+ else tokenizer.video_token
+ )
+ self.image_processor = image_processor
+ self.tokenizer = tokenizer
+
+ @benchmark.timeit
+ def preprocess(
+ self,
+ input_dicts,
+ ):
+ images = [fetch_image(input_dict["image"]) for input_dict in input_dicts]
+
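+        # Wrap each query in a single-turn chat message and render it with the
+        # tokenizer's chat template to obtain the text prompt.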
+ text = []
+ for input_dict in input_dicts:
+ messages = [
+ {
+ "role": "user",
+ "content": input_dict["query"],
+ }
+ ]
+ prompt = self.tokenizer.apply_chat_template(messages, tokenize=False)
+ text.append(prompt)
+
+ videos = None
+ output_kwargs = {
+ "tokenizer_init_kwargs": self.tokenizer.init_kwargs,
+ "text_kwargs": copy.deepcopy(self._DEFAULT_TEXT_KWARGS),
+ "video_kwargs": copy.deepcopy(self._DEFAULT_VIDEO_KWARGS),
+ }
+
+ if images is not None:
+ image_inputs = self.image_processor(images=images, return_tensors="pd")
+ image_inputs["pixel_values"] = image_inputs["pixel_values"]
+ image_grid_thw = image_inputs["image_grid_thw"]
+
+ else:
+ image_inputs = {}
+ image_grid_thw = None
+
+ if videos is not None:
+ # TODO: add video processing
+ videos_inputs = self.image_processor(
+ images=None, videos=videos, **output_kwargs["images_kwargs"]
+ )
+ video_grid_thw = videos_inputs["video_grid_thw"]
+
+ fps = output_kwargs["videos_kwargs"].pop("fps", 2.0)
+ if isinstance(fps, (int, float)):
+ second_per_grid_ts = [
+ self.image_processor.temporal_patch_size / fps
+ ] * len(video_grid_thw)
+ elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw):
+ second_per_grid_ts = [
+ self.image_processor.temporal_patch_size / tmp for tmp in fps
+ ]
+ else:
+ raise ValueError(
+ f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number."
+ )
+ videos_inputs.update(
+ {"second_per_grid_ts": paddle.to_tensor(second_per_grid_ts)}
+ )
+
+ else:
+ videos_inputs = {}
+ video_grid_thw = None
+
+ if not isinstance(text, list):
+ text = [text]
+
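+        # Expand each image placeholder into the number of vision tokens the image
+        # contributes after patch merging: prod(grid_thw) // merge_size**2.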
+ if image_grid_thw is not None:
+ index = 0
+ for i in range(len(text)):
+ while self.image_token in text[i]:
+ text[i] = text[i].replace(
+ self.image_token,
+ "<|placeholder|>"
+ * int(
+ image_grid_thw[index].prod()
+ // self.image_processor.merge_size
+ // self.image_processor.merge_size
+ ),
+ 1,
+ )
+ index += 1
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
+
+ if video_grid_thw is not None:
+ index = 0
+ for i in range(len(text)):
+ while self.video_token in text[i]:
+ text[i] = text[i].replace(
+ self.video_token,
+ "<|placeholder|>"
+ * (
+ video_grid_thw[index].prod()
+ // self.image_processor.merge_size
+ // self.image_processor.merge_size
+ ),
+ 1,
+ )
+ index += 1
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
+
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
+
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+
+ @benchmark.timeit
+ def postprocess(self, model_pred, **kwargs) -> List[str]:
+ return self.tokenizer.batch_decode(
+ model_pred[0],
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
+ spaces_between_special_tokens=False,
+ )
+
+ def batch_decode(self, *args, **kwargs):
+ """
+        This method forwards all its arguments to the underlying tokenizer's [`~PreTrainedTokenizer.batch_decode`].
+        Please refer to the docstring of that method for more information.
+ """
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ def decode(self, *args, **kwargs):
+ """
+        This method forwards all its arguments to the underlying tokenizer's [`~PreTrainedTokenizer.decode`].
+        Please refer to the docstring of that method for more information.
+ """
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @property
+ def model_input_names(self):
+ tokenizer_input_names = self.tokenizer.model_input_names
+ image_processor_input_names = self.image_processor.model_input_names
+ names_from_processor = list(
+ dict.fromkeys(tokenizer_input_names + image_processor_input_names)
+ )
+ return names_from_processor + ["second_per_grid_ts"]
diff --git a/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py
new file mode 100644
index 0000000000..393db7764a
--- /dev/null
+++ b/paddlex/inference/models/doc_vlm/processors/paddleocr_vl/_siglip.py
@@ -0,0 +1,284 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/image_processing_keye.py
+# Original header:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Keye."""
+
+# TODO: Support videos
+
+import json
+import math
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from ......utils import logging
+from ..common import (
+ BatchFeature,
+ convert_to_rgb,
+ make_batched_images,
+ make_list_of_images,
+ to_numpy_array,
+)
+
+_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
+_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
+
+
+def adjust_size(size, patch_size):
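+    # Round size down to an even number of whole patches.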
+ num_patches = size // patch_size
+ if num_patches % 2 != 0:
+ num_patches -= 1
+ return num_patches * patch_size
+
+
+def smart_resize(
+ height: int,
+ width: int,
+ factor: int = 28,
+ min_pixels: int = 28 * 28 * 130,
+ max_pixels: int = 28 * 28 * 1280,
+):
+ """Rescales the image so that the following conditions are met:
+
+ 1. Both dimensions (height and width) are divisible by 'factor'.
+
+ 2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
+
+ 3. The aspect ratio of the image is maintained as closely as possible.
+
+ """
+ # if height < factor or width < factor:
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}")
+ # if int(height < factor//4) + int(width < factor//4):
+ # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}")
+
+ if height < factor:
+ logging.debug(
+ f"smart_resize: height={height} < factor={factor}, reset height=factor"
+ )
+ width = round((width * factor) / height)
+ height = factor
+
+ if width < factor:
+ logging.debug(
+ f"smart_resize: width={width} < factor={factor}, reset width=factor"
+ )
+ height = round((height * factor) / width)
+ width = factor
+
+ if max(height, width) / min(height, width) > 200:
+ raise ValueError(
+ f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}"
+ )
+ h_bar = round(height / factor) * factor
+ w_bar = round(width / factor) * factor
+ if h_bar * w_bar > max_pixels:
+ beta = math.sqrt((height * width) / max_pixels)
+ h_bar = math.floor(height / beta / factor) * factor
+ w_bar = math.floor(width / beta / factor) * factor
+ elif h_bar * w_bar < min_pixels:
+ beta = math.sqrt(min_pixels / (height * width))
+ h_bar = math.ceil(height * beta / factor) * factor
+ w_bar = math.ceil(width * beta / factor) * factor
+ return h_bar, w_bar
+
+
+class SiglipImageProcessor(object):
+ model_input_names = [
+ "pixel_values",
+ "image_grid_thw",
+ "pixel_values_videos",
+ "video_grid_thw",
+ ]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ resample: int = 3,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_convert_rgb: bool = True,
+ min_pixels: int = 147384,
+ max_pixels: int = 28 * 28 * 3600,
+ patch_size: int = 14,
+ temporal_patch_size: int = 1,
+ merge_size: int = 2,
+ **kwargs,
+ ) -> None:
+ super().__init__()
+ self.do_resize = do_resize
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN
+ self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD
+ self.min_pixels = min_pixels
+ self.max_pixels = max_pixels
+ self.patch_size = patch_size
+ self.temporal_patch_size = temporal_patch_size
+ self.merge_size = merge_size
+ self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used
+ self.do_convert_rgb = do_convert_rgb
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_dir):
+ pretrained_model_dir = Path(pretrained_model_dir)
+ image_processor_config_path = pretrained_model_dir / "preprocessor_config.json"
+ with open(image_processor_config_path, "r", encoding="utf-8") as f:
+ image_processor_config = json.load(f)
+ return cls(**image_processor_config)
+
+ def _preprocess(
+ self,
+ images,
+ do_resize: Optional[bool] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ ):
+ images = make_list_of_images(images)
+
+ if do_convert_rgb:
+ images = [convert_to_rgb(image) for image in images]
+
+ width, height = images[0].size
+ resized_height, resized_width = height, width
+ processed_images = []
+
+ for image in images:
+ if do_resize:
+ resized_height, resized_width = smart_resize(
+ height,
+ width,
+ factor=self.patch_size * self.merge_size,
+ min_pixels=self.min_pixels,
+ max_pixels=self.max_pixels,
+ )
+
+ image = image.resize(
+ (resized_width, resized_height), resample=self.resample
+ )
+
+ image = to_numpy_array(image)
+
+ if do_rescale:
+ image = (image * rescale_factor).astype(np.float32)
+
+ if do_normalize:
+ image = image.astype(np.float32)
+ image -= np.array(image_mean, dtype=np.float32)
+ image /= np.array(image_std, dtype=np.float32)
+
+ processed_images.append(image)
+
+ patches = np.array(processed_images)
+ patches = patches.transpose(0, 3, 1, 2)
+ if patches.shape[0] == 1:
+ patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
+ channel = patches.shape[1]
+ grid_t = patches.shape[0] // self.temporal_patch_size
+ grid_h, grid_w = (
+ resized_height // self.patch_size,
+ resized_width // self.patch_size,
+ )
+
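+        # Rearrange the frame stack into patch_size x patch_size patches, one row
+        # per spatial patch (temporal_patch_size is required to be 1 here).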
+ patches = patches.reshape(
+ grid_t,
+ self.temporal_patch_size,
+ channel,
+ grid_h,
+ self.patch_size,
+ grid_w,
+ self.patch_size,
+ )
+ patches = patches.transpose(0, 3, 5, 2, 1, 4, 6)
+ assert self.temporal_patch_size == 1
+ flatten_patches = patches.reshape(
+ grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size
+ )
+ return flatten_patches, (grid_t, grid_h, grid_w)
+
+ def __call__(
+ self,
+ images,
+ videos=None,
+ do_resize: Optional[bool] = None,
+ size: Optional[Dict[str, int]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, List[float]]] = None,
+ image_std: Optional[Union[float, List[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ return_tensors=None,
+ ):
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ size = size if size is not None else self.size
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = (
+ rescale_factor if rescale_factor is not None else self.rescale_factor
+ )
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ do_convert_rgb = (
+ do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+ )
+
+ if images is not None:
+ images = make_batched_images(images)
+ if videos is not None:
+ raise NotImplementedError("Videos are not yet supported")
+
+ if images is not None:
+ pixel_values, vision_grid_thws = [], []
+ for image in images:
+ patches, image_grid_thw = self._preprocess(
+ image,
+ do_resize=do_resize,
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_convert_rgb=do_convert_rgb,
+ )
+ pixel_values.extend(patches)
+ vision_grid_thws.append(image_grid_thw)
+ pixel_values = np.array(pixel_values)
+ vision_grid_thws = np.array(vision_grid_thws)
+ data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws}
+
+ return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py b/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py
index 3916a21da9..35d899ee13 100644
--- a/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py
+++ b/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py
@@ -539,10 +539,12 @@ def preprocess(self, input_dicts: List[Dict]):
return rst_inputs
@benchmark.timeit
- def postprocess(self, model_pred, *args, **kwargs) -> List[str]:
+ def postprocess(self, model_pred, **kwargs) -> List[str]:
"""
Post process adapt for PaddleX
"""
return self.tokenizer.batch_decode(
- model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+ model_pred[0],
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
+ clean_up_tokenization_spaces=False,
)
diff --git a/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py b/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py
index 53cb675701..9b26dccc5f 100644
--- a/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py
+++ b/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py
@@ -534,10 +534,12 @@ def preprocess(self, input_dicts):
return rst_inputs
@benchmark.timeit
- def postprocess(self, model_pred, *args, **kwargs):
+ def postprocess(self, model_pred, **kwargs):
"""
Post process adapt for PaddleX
"""
return self.tokenizer.batch_decode(
- model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False
+ model_pred[0],
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
+ clean_up_tokenization_spaces=False,
)
diff --git a/paddlex/inference/models/formula_recognition/processors.py b/paddlex/inference/models/formula_recognition/processors.py
index d1ce8f962d..0c5be9ea01 100644
--- a/paddlex/inference/models/formula_recognition/processors.py
+++ b/paddlex/inference/models/formula_recognition/processors.py
@@ -337,8 +337,8 @@ def post_process(self, s: str) -> str:
str: The post-processed LaTeX string.
"""
text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})"
- letter = "[a-zA-Z]"
- noletter = "[\W_^\d]"
+ letter = r"[a-zA-Z]"
+ noletter = r"[\W_^\d]"
names = [x[0].replace(" ", "") for x in re.findall(text_reg, s)]
s = re.sub(text_reg, lambda match: str(names.pop(0)), s)
news = s
@@ -840,8 +840,8 @@ def normalize(self, s: str) -> str:
str: Normalized string.
"""
text_reg = r"(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})"
- letter = "[a-zA-Z]"
- noletter = "[\W_^\d]"
+ letter = r"[a-zA-Z]"
+ noletter = r"[\W_^\d]"
names = []
for x in re.findall(text_reg, s):
pattern = r"\\[a-zA-Z]+"
@@ -874,7 +874,7 @@ def normalize(self, s: str) -> str:
return s.replace("XXXXXXX", " ")
def remove_chinese_text_wrapping(self, formula):
- pattern = re.compile(r"\\text\s*{\s*([^}]*?[\u4e00-\u9fff]+[^}]*?)\s*}")
+ pattern = re.compile(r"\\text\s*{([^{}]*[\u4e00-\u9fff]+[^{}]*)}")
def replacer(match):
return match.group(1)
diff --git a/paddlex/inference/models/formula_recognition/result.py b/paddlex/inference/models/formula_recognition/result.py
index 9ccf2dbc78..d5e694885f 100644
--- a/paddlex/inference/models/formula_recognition/result.py
+++ b/paddlex/inference/models/formula_recognition/result.py
@@ -164,18 +164,39 @@ def generate_tex_file(tex_file_path: str, equation: str) -> None:
equation (str): The LaTeX equation to be written into the file.
"""
with custom_open(tex_file_path, "w") as fp:
- start_template = (
- r"\documentclass[varwidth]{standalone}" + "\n"
- r"\usepackage{cite}" + "\n"
- r"\usepackage{amsmath,amssymb,amsfonts,upgreek}" + "\n"
- r"\usepackage{graphicx}" + "\n"
- r"\usepackage{textcomp}" + "\n"
- r"\usepackage{xeCJK}" + "\n"
- r"\DeclareMathSizes{14}{14}{9.8}{7}" + "\n"
- r"\pagestyle{empty}" + "\n"
- r"\begin{document}" + "\n"
- r"\begin{large}" + "\n"
- )
+ start_template = r"""
+ \documentclass[varwidth]{standalone}
+ \usepackage{cite}
+ \usepackage{amsmath,amssymb,amsfonts,upgreek}
+ \usepackage{graphicx}
+ \usepackage{textcomp}
+ \usepackage{xeCJK}
+ \DeclareMathSizes{14}{14}{9.8}{7}
+ \pagestyle{empty}
+ \makeatletter
+ \def\x@arrow{\DOTSB\Relbar}
+ \def\xlongequalsignfill@{\arrowfill@\x@arrow\Relbar\x@arrow}
+ \newcommand{\xlongequal}[2][]{\ext@arrow 0099\xlongequalsignfill@{#1}{#2}}
+ \def\xLongleftrightarrowfill@{\arrowfill@\Longleftarrow\Relbar\Longrightarrow}
+ \newcommand{\xLongleftrightarrow}[2][]{\ext@arrow 0099\xLongleftrightarrowfill@{#1}{#2}}
+ \def\xlongleftrightarrowfill@{\arrowfill@\longleftarrow\relbar\longrightarrow}
+ \newcommand{\xlongleftrightarrow}[2][]{\ext@arrow 0099\xlongleftrightarrowfill@{#1}{#2}}
+ \def\xLeftrightarrowfill@{\arrowfill@\Leftarrow\Relbar\Rightarrow}
+ \newcommand{\xLeftrightarrow}[2][]{\ext@arrow 0099\xLeftrightarrowfill@{#1}{#2}}
+ \def\xleftrightarrowfill@{\arrowfill@\leftarrow\relbar\rightarrow}
+ \newcommand{\xleftrightarrow}[2][]{\ext@arrow 0099\xleftrightarrowfill@{#1}{#2}}
+ \def\xLongleftarrowfill@{\arrowfill@\Longleftarrow\Relbar\Relbar}
+ \newcommand{\xLongleftarrow}[2][]{\ext@arrow 0099\xLongleftarrowfill@{#1}{#2}}
+ \def\xLongrightarrowfill@{\arrowfill@\Relbar\Relbar\Longrightarrow}
+ \newcommand{\xLongrightarrow}[2][]{\ext@arrow 0099\xLongrightarrowfill@{#1}{#2}}
+ \def\xlongleftarrowfill@{\arrowfill@\longleftarrow\relbar\relbar}
+ \newcommand{\xlongleftarrow}[2][]{\ext@arrow 0099\xlongleftarrowfill@{#1}{#2}}
+ \def\xlongrightarrowfill@{\arrowfill@\relbar\relbar\longrightarrow}
+ \newcommand{\xlongrightarrow}[2][]{\ext@arrow 0099\xlongrightarrowfill@{#1}{#2}}
+ \makeatother
+ \begin{document}
+ \begin{large}
+ """
fp.write(start_template)
equation = add_text_for_zh_formula(equation)
equation = get_align_equation(equation)
diff --git a/paddlex/inference/models/multilingual_speech_recognition/processors.py b/paddlex/inference/models/multilingual_speech_recognition/processors.py
index 6289610698..ae5c7acade 100644
--- a/paddlex/inference/models/multilingual_speech_recognition/processors.py
+++ b/paddlex/inference/models/multilingual_speech_recognition/processors.py
@@ -1342,6 +1342,7 @@ def __init__(self, tokenizer: Tokenizer, sample_begin: int):
def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
if tokens.shape[1] == self.sample_begin:
+ logits.contiguous()
logits[:, self.tokenizer.encode(" ").input_ids + [self.tokenizer.eot]] = (
-np.inf
)
@@ -1352,6 +1353,7 @@ def __init__(self, suppress_tokens: Sequence[int]):
self.suppress_tokens = list(suppress_tokens)
def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
+ logits.contiguous()
logits[:, self.suppress_tokens] = -np.inf
@@ -1369,6 +1371,7 @@ def __init__(
def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
# suppress <|notimestamps|> which is handled by without_timestamps
if self.tokenizer.no_timestamps is not None:
+ logits.contiguous()
logits[:, self.tokenizer.no_timestamps] = -np.inf
# timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
@@ -1382,6 +1385,7 @@ def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
)
if last_was_timestamp:
+ logits.contiguous()
if penultimate_was_timestamp: # has to be non-timestamp
logits[k, self.tokenizer.timestamp_begin :] = -np.inf
else: # cannot be normal text tokens
@@ -1395,6 +1399,7 @@ def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
last_allowed = (
self.tokenizer.timestamp_begin + self.max_initial_timestamp_index
)
+ logits.contiguous()
logits[:, last_allowed + 1 :] = -np.inf
# if sum of probability over timestamps is above any other token, sample timestamp
@@ -1413,6 +1418,7 @@ def apply(self, logits: paddle.Tensor, tokens: paddle.Tensor):
logprobs[k, : self.tokenizer.timestamp_begin]
)
if timestamp_logprob > max_text_token_logprob:
+ logits.contiguous()
logits[k, : self.tokenizer.timestamp_begin] = -np.inf
diff --git a/paddlex/inference/models/object_detection/predictor.py b/paddlex/inference/models/object_detection/predictor.py
index 5d31bbdb00..e5859b3388 100644
--- a/paddlex/inference/models/object_detection/predictor.py
+++ b/paddlex/inference/models/object_detection/predictor.py
@@ -318,6 +318,7 @@ def build_to_batch(self):
"PP-DocLayout-L",
"PP-DocLayout_plus-L",
"PP-DocBlockLayout",
+ "PP-DocLayoutV2",
]
if any(name in self.model_name for name in models_required_imgsize):
ordered_required_keys = (
diff --git a/paddlex/inference/models/object_detection/processors.py b/paddlex/inference/models/object_detection/processors.py
index 5b473e0b3b..cd3969e2f8 100644
--- a/paddlex/inference/models/object_detection/processors.py
+++ b/paddlex/inference/models/object_detection/processors.py
@@ -746,11 +746,12 @@ def apply(
)
if layout_nms:
- selected_indices = nms(boxes, iou_same=0.6, iou_diff=0.98)
+ selected_indices = nms(boxes[:, :6], iou_same=0.6, iou_diff=0.98)
boxes = np.array(boxes[selected_indices])
filter_large_image = True
- if filter_large_image and len(boxes) > 1 and boxes.shape[1] == 6:
+ # boxes.shape[1] == 6 is object detection, 8 is ordered object detection
+ if filter_large_image and len(boxes) > 1 and boxes.shape[1] in [6, 8]:
if img_size[0] > img_size[1]:
area_thres = 0.82
else:
@@ -759,7 +760,14 @@ def apply(
img_area = img_size[0] * img_size[1]
filtered_boxes = []
for box in boxes:
- label_index, score, xmin, ymin, xmax, ymax = box
+ (
+ label_index,
+ score,
+ xmin,
+ ymin,
+ xmax,
+ ymax,
+ ) = box[:6]
if label_index == image_index:
xmin = max(0, xmin)
ymin = max(0, ymin)
@@ -789,7 +797,7 @@ def apply(
pass
else:
contains_other, contained_by_other = check_containment(
- boxes, formula_index
+ boxes[:, :6], formula_index
)
if layout_merge_bboxes_mode == "large":
boxes = boxes[contained_by_other == 0]
@@ -808,13 +816,19 @@ def apply(
else:
if layout_mode == "large":
contains_other, contained_by_other = check_containment(
- boxes, formula_index, category_index, mode=layout_mode
+ boxes[:, :6],
+ formula_index,
+ category_index,
+ mode=layout_mode,
)
# Remove boxes that are contained by other boxes
keep_mask &= contained_by_other == 0
elif layout_mode == "small":
contains_other, contained_by_other = check_containment(
- boxes, formula_index, category_index, mode=layout_mode
+ boxes[:, :6],
+ formula_index,
+ category_index,
+ mode=layout_mode,
)
# Keep boxes that do not contain others or are contained by others
keep_mask &= (contains_other == 0) | (
@@ -823,7 +837,13 @@ def apply(
boxes = boxes[keep_mask]
if boxes.size == 0:
- return []
+ return np.array([])
+
+ if boxes.shape[1] == 8:
+ # Sort boxes by their order
+ sorted_idx = np.lexsort((-boxes[:, 7], boxes[:, 6]))
+ sorted_boxes = boxes[sorted_idx]
+ boxes = sorted_boxes[:, :6]
if layout_unclip_ratio:
if isinstance(layout_unclip_ratio, float):
diff --git a/paddlex/inference/models/object_detection/utils.py b/paddlex/inference/models/object_detection/utils.py
index da464a6120..a34ff547e6 100644
--- a/paddlex/inference/models/object_detection/utils.py
+++ b/paddlex/inference/models/object_detection/utils.py
@@ -67,4 +67,5 @@
"PP-DocLayout-S",
"PP-DocLayout_plus-L",
"PP-DocBlockLayout",
+ "PP-DocLayoutV2",
]
diff --git a/paddlex/inference/models/text_recognition/predictor.py b/paddlex/inference/models/text_recognition/predictor.py
index 95ea227477..1dcc2f5b87 100644
--- a/paddlex/inference/models/text_recognition/predictor.py
+++ b/paddlex/inference/models/text_recognition/predictor.py
@@ -15,6 +15,7 @@
import numpy as np
from ....modules.text_recognition.model_list import MODELS
+from ....utils.deps import class_requires_deps, is_dep_available
from ....utils.fonts import (
ARABIC_FONT,
CYRILLIC_FONT,
@@ -35,7 +36,11 @@
from .processors import CTCLabelDecode, OCRReisizeNormImg, ToBatch
from .result import TextRecResult
+if is_dep_available("python-bidi"):
+ from bidi.algorithm import get_display
+
+@class_requires_deps("python-bidi")
class TextRecPredictor(BasePredictor):
entities = MODELS
@@ -104,6 +109,11 @@ def process(self, batch_data, return_word_box=False):
wh_ratio_list=wh_ratio_list,
max_wh_ratio=max_wh_ratio,
)
+ if self.model_name in (
+ "arabic_PP-OCRv3_mobile_rec",
+ "arabic_PP-OCRv5_mobile_rec",
+ ):
+ texts = [get_display(s) for s in texts]
return {
"input_path": batch_data.input_paths,
"page_index": batch_data.page_indexes,
@@ -152,6 +162,7 @@ def get_vis_font(self):
if self.model_name in (
"cyrillic_PP-OCRv3_mobile_rec",
+ "cyrillic_PP-OCRv5_mobile_rec",
"eslav_PP-OCRv5_mobile_rec",
):
return CYRILLIC_FONT
@@ -168,17 +179,23 @@ def get_vis_font(self):
if self.model_name == "el_PP-OCRv5_mobile_rec":
return EL_FONT
- if self.model_name == "arabic_PP-OCRv3_mobile_rec":
+ if self.model_name in (
+ "arabic_PP-OCRv3_mobile_rec",
+ "arabic_PP-OCRv5_mobile_rec",
+ ):
return ARABIC_FONT
if self.model_name == "ka_PP-OCRv3_mobile_rec":
return KANNADA_FONT
- if self.model_name == "te_PP-OCRv3_mobile_rec":
+ if self.model_name in ("te_PP-OCRv3_mobile_rec", "te_PP-OCRv5_mobile_rec"):
return TELUGU_FONT
- if self.model_name == "ta_PP-OCRv3_mobile_rec":
+ if self.model_name in ("ta_PP-OCRv3_mobile_rec", "ta_PP-OCRv5_mobile_rec"):
return TAMIL_FONT
- if self.model_name == "devanagari_PP-OCRv3_mobile_rec":
+ if self.model_name in (
+ "devanagari_PP-OCRv3_mobile_rec",
+ "devanagari_PP-OCRv5_mobile_rec",
+ ):
return DEVANAGARI_FONT
diff --git a/paddlex/inference/pipelines/__init__.py b/paddlex/inference/pipelines/__init__.py
index 038247514f..1ae8758244 100644
--- a/paddlex/inference/pipelines/__init__.py
+++ b/paddlex/inference/pipelines/__init__.py
@@ -41,6 +41,7 @@
from .ocr import OCRPipeline
from .open_vocabulary_detection import OpenVocabularyDetectionPipeline
from .open_vocabulary_segmentation import OpenVocabularySegmentationPipeline
+from .paddleocr_vl import PaddleOCRVLPipeline
from .pp_chatocr import PP_ChatOCRv3_Pipeline, PP_ChatOCRv4_Pipeline
from .pp_doctranslation import PP_DocTranslation_Pipeline
from .pp_shitu_v2 import ShiTuV2Pipeline
diff --git a/paddlex/inference/pipelines/_parallel.py b/paddlex/inference/pipelines/_parallel.py
index 5ec61191c1..75e9a2d9b7 100644
--- a/paddlex/inference/pipelines/_parallel.py
+++ b/paddlex/inference/pipelines/_parallel.py
@@ -44,7 +44,7 @@ def execute(
input_future_pairs = []
for pipeline in self._pipelines:
try:
- input_batch = list(input_batches)[0]
+ input_batch = next(input_batches)
except StopIteration:
out_of_data = True
break
diff --git a/paddlex/inference/pipelines/base.py b/paddlex/inference/pipelines/base.py
index 220ddd5080..df627f8ab5 100644
--- a/paddlex/inference/pipelines/base.py
+++ b/paddlex/inference/pipelines/base.py
@@ -91,6 +91,7 @@ def create_model(self, config: Dict, **kwargs) -> BasePredictor:
if self.hpi_config is not None:
hpi_config = hpi_config or {}
hpi_config = {**self.hpi_config, **hpi_config}
+ genai_config = config.get("genai_config", None)
from .. import create_predictor
@@ -110,6 +111,7 @@ def create_model(self, config: Dict, **kwargs) -> BasePredictor:
pp_option=pp_option,
use_hpip=use_hpip,
hpi_config=hpi_config,
+ genai_config=genai_config,
**kwargs,
)
return model
@@ -146,6 +148,9 @@ def create_pipeline(self, config: Dict):
)
return pipeline
+ def close(self):
+ pass
+
def __call__(self, input, **kwargs):
"""
Calls the predict method with the given input and keyword arguments.
diff --git a/paddlex/inference/pipelines/doc_understanding/pipeline.py b/paddlex/inference/pipelines/doc_understanding/pipeline.py
index b80885feda..1b20c4e098 100644
--- a/paddlex/inference/pipelines/doc_understanding/pipeline.py
+++ b/paddlex/inference/pipelines/doc_understanding/pipeline.py
@@ -71,3 +71,6 @@ def predict(self, input: Dict, **kwargs) -> DocVLMResult:
DocVLMResult: The predicted doc understanding results.
"""
yield from self.doc_understanding_model(input, **kwargs)
+
+ def close(self):
+ self.doc_understanding_model.close()
diff --git a/paddlex/inference/pipelines/formula_recognition/pipeline.py b/paddlex/inference/pipelines/formula_recognition/pipeline.py
index 542ac7724c..82609a7aea 100644
--- a/paddlex/inference/pipelines/formula_recognition/pipeline.py
+++ b/paddlex/inference/pipelines/formula_recognition/pipeline.py
@@ -268,7 +268,7 @@ def predict(
layout_det_results = []
for _ in doc_preprocessor_images:
try:
- layout_det_res = list(external_layout_det_results)[0]
+ layout_det_res = next(external_layout_det_results)
except StopIteration:
raise ValueError("No more layout det results")
layout_det_results.append(layout_det_res)
diff --git a/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py b/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
index f6476cc3af..5efa719107 100644
--- a/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
+++ b/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
@@ -35,7 +35,7 @@
from .result_v2 import LayoutParsingResultV2
from .setting import BLOCK_LABEL_MAP, BLOCK_SETTINGS, REGION_SETTINGS
from .utils import (
- caculate_bbox_area,
+ calculate_bbox_area,
calculate_minimum_enclosing_bbox,
calculate_overlap_ratio,
convert_formula_res_to_ocr_format,
@@ -86,6 +86,10 @@ def __init__(
self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
self.img_reader = ReadImage(format="BGR")
+ def close(self):
+        if getattr(self, "chart_recognition_model", None):
+ self.chart_recognition_model.close()
+
def inintial_predictor(self, config: dict) -> None:
"""Initializes the predictor based on the provided configuration.
@@ -106,6 +110,7 @@ def inintial_predictor(self, config: dict) -> None:
self.use_doc_preprocessor = False
self.use_table_recognition = config.get("use_table_recognition", True)
self.use_seal_recognition = config.get("use_seal_recognition", True)
+ self.format_block_content = config.get("format_block_content", False)
self.use_region_detection = config.get(
"use_region_detection",
True,
@@ -331,7 +336,7 @@ def standardized_data(
# update the region box and max_block_area according to the layout boxes
base_region_bbox = update_region_box(box, base_region_bbox)
- max_block_area = max(max_block_area, caculate_bbox_area(box))
+ max_block_area = max(max_block_area, calculate_bbox_area(box))
# update_layout_order_config_block_index(layout_order_config, label, box_idx)
@@ -367,7 +372,7 @@ def standardized_data(
# check if there is only one paragraph title and without doc_title
only_one_paragraph_title = len(paragraph_title_list) == 1 and doc_title_num == 0
if only_one_paragraph_title:
- paragraph_title_block_area = caculate_bbox_area(
+ paragraph_title_block_area = calculate_bbox_area(
layout_det_res["boxes"][paragraph_title_list[0]]["coordinate"]
)
title_area_max_block_threshold = BLOCK_SETTINGS.get(
@@ -505,7 +510,7 @@ def standardized_data(
block_bboxes = [box["coordinate"] for box in layout_det_res["boxes"]]
region_det_res["boxes"] = sorted(
region_det_res["boxes"],
- key=lambda item: caculate_bbox_area(item["coordinate"]),
+ key=lambda item: calculate_bbox_area(item["coordinate"]),
)
if len(region_det_res["boxes"]) == 0:
region_det_res["boxes"] = [
@@ -830,11 +835,12 @@ def get_layout_parsing_res(
parsing_res_list = self.sort_layout_parsing_blocks(layout_parsing_page)
- index = 1
- for block in parsing_res_list:
+ order_index = 1
+ for index, block in enumerate(parsing_res_list):
+ block.index = index
if block.label in BLOCK_LABEL_MAP["visualize_index_labels"]:
- block.order_index = index
- index += 1
+ block.order_index = order_index
+ order_index += 1
return parsing_res_list
@@ -847,6 +853,7 @@ def get_model_settings(
use_formula_recognition: Union[bool, None],
use_chart_recognition: Union[bool, None],
use_region_detection: Union[bool, None],
+ format_block_content: Union[bool, None],
) -> dict:
"""
Get the model settings based on the provided parameters or default values.
@@ -857,6 +864,7 @@ def get_model_settings(
use_seal_recognition (Union[bool, None]): Enables seal recognition if True. Defaults to system setting if None.
use_table_recognition (Union[bool, None]): Enables table recognition if True. Defaults to system setting if None.
use_formula_recognition (Union[bool, None]): Enables formula recognition if True. Defaults to system setting if None.
+ format_block_content (Union[bool, None]): Enables block content formatting if True. Defaults to system setting if None.
Returns:
dict: A dictionary containing the model settings.
@@ -885,6 +893,9 @@ def get_model_settings(
if use_chart_recognition is None:
use_chart_recognition = self.use_chart_recognition
+ if format_block_content is None:
+ format_block_content = self.format_block_content
+
return dict(
use_doc_preprocessor=use_doc_preprocessor,
use_seal_recognition=use_seal_recognition,
@@ -892,6 +903,7 @@ def get_model_settings(
use_formula_recognition=use_formula_recognition,
use_chart_recognition=use_chart_recognition,
use_region_detection=use_region_detection,
+ format_block_content=format_block_content,
)
def predict(
@@ -905,6 +917,7 @@ def predict(
use_formula_recognition: Union[bool, None] = None,
use_chart_recognition: Union[bool, None] = None,
use_region_detection: Union[bool, None] = None,
+ format_block_content: Union[bool, None] = None,
layout_threshold: Optional[Union[float, dict]] = None,
layout_nms: Optional[bool] = None,
layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
@@ -942,6 +955,7 @@ def predict(
use_table_recognition (Optional[bool]): Whether to use table recognition.
use_formula_recognition (Optional[bool]): Whether to use formula recognition.
use_region_detection (Optional[bool]): Whether to use region detection.
+ format_block_content (Optional[bool]): Whether to format block content.
layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
layout_nms (bool, optional): Whether to use layout-aware NMS. Defaults to False.
layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
@@ -981,6 +995,7 @@ def predict(
use_formula_recognition,
use_chart_recognition,
use_region_detection,
+ format_block_content,
)
if not self.check_model_settings_valid(model_settings):
diff --git a/paddlex/inference/pipelines/layout_parsing/result_v2.py b/paddlex/inference/pipelines/layout_parsing/result_v2.py
index 40c4571c53..5051a0a2bf 100644
--- a/paddlex/inference/pipelines/layout_parsing/result_v2.py
+++ b/paddlex/inference/pipelines/layout_parsing/result_v2.py
@@ -109,9 +109,13 @@ def format_image_scaled_by_html_func(block, original_image_width):
def format_image_plain_func(block):
img_tags = []
- image_path = block.image["path"]
- img_tags.append("".format(image_path.replace("-\n", "").replace("\n", " ")))
- return "\n".join(img_tags)
+ if block.image:
+ image_path = block.image["path"]
+ img_tags.append(
+ "".format(image_path.replace("-\n", "").replace("\n", " "))
+ )
+ return "\n".join(img_tags)
+ return ""
def format_chart2table_func(block):
@@ -230,6 +234,18 @@ def _to_str(self, *args, **kwargs) -> dict[str, str]:
data["page_index"] = self["page_index"]
model_settings = self["model_settings"]
data["model_settings"] = model_settings
+ parsing_res_list: List[LayoutBlock] = self["parsing_res_list"]
+ parsing_res_list = [
+ {
+ "block_label": parsing_res.label,
+ "block_content": parsing_res.content,
+ "block_bbox": parsing_res.bbox,
+ "block_id": parsing_res.index,
+ "block_order": parsing_res.order_index,
+ }
+ for parsing_res in parsing_res_list
+ ]
+ data["parsing_res_list"] = parsing_res_list
if self["model_settings"]["use_doc_preprocessor"]:
data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
data["layout_det_res"] = self["layout_det_res"].str["res"]
@@ -266,21 +282,108 @@ def _to_json(self, *args, **kwargs) -> dict[str, str]:
Returns:
Dict[str, str]: A dictionary containing the object's data in JSON format.
"""
+ if self["model_settings"].get("format_block_content", False):
+ original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
+ format_text_func = lambda block: format_centered_by_html(
+ format_text_plain_func(block)
+ )
+ format_image_func = lambda block: format_centered_by_html(
+ format_image_scaled_by_html_func(
+ block,
+ original_image_width=original_image_width,
+ )
+ )
+
+ if self["model_settings"].get("use_chart_recognition", False):
+ format_chart_func = format_chart2table_func
+ else:
+ format_chart_func = format_image_func
+
+ if self["model_settings"].get("use_seal_recognition", False):
+ format_seal_func = lambda block: "\n".join(
+ [format_image_func(block), format_text_func(block)]
+ )
+ else:
+ format_seal_func = format_image_func
+
+ if self["model_settings"].get("use_table_recognition", False):
+ format_table_func = lambda block: "\n" + format_text_func(
+ block
+                ).replace("<table>", '<table border="1">')
+ else:
+ format_table_func = format_image_func
+
+ if self["model_settings"].get("use_formula_recognition", False):
+ format_formula_func = lambda block: f"$${block.content}$$"
+ else:
+ format_formula_func = format_image_func
+
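+            # Map each block label to the function used to render its content in
+            # the formatted output.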
+ handle_funcs_dict = {
+ "paragraph_title": format_title_func,
+ "abstract_title": format_title_func,
+ "reference_title": format_title_func,
+ "content_title": format_title_func,
+ "doc_title": lambda block: f"# {block.content}".replace(
+ "-\n",
+ "",
+ ).replace("\n", " "),
+ "table_title": format_text_func,
+ "figure_title": format_text_func,
+ "chart_title": format_text_func,
+ "vision_footnote": lambda block: block.content.replace(
+ "\n\n", "\n"
+ ).replace("\n", "\n\n"),
+ "text": lambda block: block.content.replace("\n\n", "\n").replace(
+ "\n", "\n\n"
+ ),
+ "abstract": partial(
+ format_first_line_func,
+ templates=["摘要", "abstract"],
+ format_func=lambda l: f"## {l}\n",
+ spliter=" ",
+ ),
+ "content": lambda block: block.content.replace("-\n", " \n").replace(
+ "\n", " \n"
+ ),
+ "image": format_image_func,
+ "chart": format_chart_func,
+ "formula": format_formula_func,
+ "table": format_table_func,
+ "reference": partial(
+ format_first_line_func,
+ templates=["参考文献", "references"],
+ format_func=lambda l: f"## {l}",
+ spliter="\n",
+ ),
+ "algorithm": lambda block: block.content.strip("\n"),
+ "seal": format_seal_func,
+ }
+
data = {}
data["input_path"] = self["input_path"]
data["page_index"] = self["page_index"]
model_settings = self["model_settings"]
data["model_settings"] = model_settings
- parsing_res_list = self["parsing_res_list"]
- parsing_res_list = [
- {
+ parsing_res_list: List[LayoutBlock] = self["parsing_res_list"]
+ parsing_res_list_json = []
+ for parsing_res in parsing_res_list:
+ res_dict = {
"block_label": parsing_res.label,
"block_content": parsing_res.content,
"block_bbox": parsing_res.bbox,
+ "block_id": parsing_res.index,
+ "block_order": parsing_res.order_index,
}
- for parsing_res in parsing_res_list
- ]
- data["parsing_res_list"] = parsing_res_list
+ if self["model_settings"].get("format_block_content", False):
+ if handle_funcs_dict.get(parsing_res.label):
+ res_dict["block_content"] = handle_funcs_dict[parsing_res.label](
+ parsing_res
+ )
+ else:
+ res_dict["block_content"] = parsing_res.content
+
+ parsing_res_list_json.append(res_dict)
+ data["parsing_res_list"] = parsing_res_list_json
if self["model_settings"]["use_doc_preprocessor"]:
data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
data["layout_det_res"] = self["layout_det_res"].json["res"]
@@ -337,7 +440,7 @@ def _to_xlsx(self) -> dict[str, str]:
res_xlsx_dict[key] = table_res.xlsx["pred"]
return res_xlsx_dict
- def _to_markdown(self, pretty=True) -> dict:
+ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict:
"""
Save the parsing result to a Markdown file.
diff --git a/paddlex/inference/pipelines/layout_parsing/utils.py b/paddlex/inference/pipelines/layout_parsing/utils.py
index b2b0324a7c..d7db3d958b 100644
--- a/paddlex/inference/pipelines/layout_parsing/utils.py
+++ b/paddlex/inference/pipelines/layout_parsing/utils.py
@@ -219,38 +219,43 @@ def calculate_projection_overlap_ratio(
def calculate_overlap_ratio(
- bbox1: Union[list, tuple], bbox2: Union[list, tuple], mode="union"
+ bbox1: Union[np.ndarray, list, tuple],
+ bbox2: Union[np.ndarray, list, tuple],
+ mode="union",
) -> float:
"""
- Calculate the overlap ratio between two bounding boxes.
+ Calculate the overlap ratio between two bounding boxes using NumPy.
Args:
- bbox1 (list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
- bbox2 (list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
+ bbox1 (np.ndarray, list or tuple): The first bounding box, format [x_min, y_min, x_max, y_max]
+ bbox2 (np.ndarray, list or tuple): The second bounding box, format [x_min, y_min, x_max, y_max]
mode (str): The mode of calculation, either 'union', 'small', or 'large'.
Returns:
float: The overlap ratio value between the two bounding boxes
"""
- x_min_inter = max(bbox1[0], bbox2[0])
- y_min_inter = max(bbox1[1], bbox2[1])
- x_max_inter = min(bbox1[2], bbox2[2])
- y_max_inter = min(bbox1[3], bbox2[3])
+ bbox1 = np.array(bbox1)
+ bbox2 = np.array(bbox2)
+
+ x_min_inter = np.maximum(bbox1[0], bbox2[0])
+ y_min_inter = np.maximum(bbox1[1], bbox2[1])
+ x_max_inter = np.minimum(bbox1[2], bbox2[2])
+ y_max_inter = np.minimum(bbox1[3], bbox2[3])
- inter_width = max(0, x_max_inter - x_min_inter)
- inter_height = max(0, y_max_inter - y_min_inter)
+ inter_width = np.maximum(0, x_max_inter - x_min_inter)
+ inter_height = np.maximum(0, y_max_inter - y_min_inter)
- inter_area = float(inter_width) * float(inter_height)
+ inter_area = inter_width * inter_height
- bbox1_area = caculate_bbox_area(bbox1)
- bbox2_area = caculate_bbox_area(bbox2)
+ bbox1_area = calculate_bbox_area(bbox1)
+ bbox2_area = calculate_bbox_area(bbox2)
if mode == "union":
ref_area = bbox1_area + bbox2_area - inter_area
elif mode == "small":
- ref_area = min(bbox1_area, bbox2_area)
+ ref_area = np.minimum(bbox1_area, bbox2_area)
elif mode == "large":
- ref_area = max(bbox1_area, bbox2_area)
+ ref_area = np.maximum(bbox1_area, bbox2_area)
else:
raise ValueError(
f"Invalid mode {mode}, must be one of ['union', 'small', 'large']."
@@ -365,8 +370,8 @@ def _get_minbox_if_overlap_by_ratio(
The selected bounding box or None if the overlap ratio is not exceeded.
"""
# Calculate the areas of both bounding boxes
- area1 = caculate_bbox_area(bbox1)
- area2 = caculate_bbox_area(bbox2)
+ area1 = calculate_bbox_area(bbox1)
+ area2 = calculate_bbox_area(bbox2)
# Calculate the overlap ratio using a helper function
overlap_ratio = calculate_overlap_ratio(bbox1, bbox2, mode="small")
# Check if the overlap ratio exceeds the threshold
@@ -415,11 +420,9 @@ def remove_overlap_blocks(
is_block2_image = block2["label"] == "image"
if is_block1_image != is_block2_image:
- # 如果只有一个块在视觉标签中,删除在视觉标签中的那个块
drop_index = i if is_block1_image else j
overlap_image_blocks.append(blocks["boxes"][drop_index])
else:
- # 如果两个块都在或都不在视觉标签中,根据 overlap_box_index 决定删除哪个块
drop_index = i if overlap_box_index == 1 else j
dropped_indexes.add(drop_index)
@@ -616,7 +619,7 @@ def convert_formula_res_to_ocr_format(formula_res_list: List, ocr_res: dict):
ocr_res["rec_scores"].append(1)
-def caculate_bbox_area(bbox):
+def calculate_bbox_area(bbox):
"""Calculate bounding box area"""
x1, y1, x2, y2 = map(float, bbox)
area = abs((x2 - x1) * (y2 - y1))
@@ -724,8 +727,11 @@ def get_show_color(label: str, order_label=False) -> Tuple:
"vision_footnote": (144, 238, 144, 100), # Light Green
# Deep Purple (from 'texts_list')
"text": (153, 0, 76, 100),
+ "vertical_text": (153, 0, 76, 100),
+ "inline_formula": (153, 0, 76, 100),
# Bright Green (from 'interequations_list')
"formula": (0, 255, 0, 100),
+ "display_formula": (0, 255, 0, 100),
"abstract": (255, 239, 213, 100), # Papaya Whip
# Medium Green (from 'lists_list' and 'indexs_list')
"content": (40, 169, 92, 100),
@@ -740,7 +746,7 @@ def get_show_color(label: str, order_label=False) -> Tuple:
"chart": (216, 191, 216, 100), # Thistle
# Pale Yellow-Green (from 'tables_footnote_list')
"reference": (229, 255, 204, 100),
- # "reference_content": (229, 255, 204, 100),
+ "reference_content": (229, 255, 204, 100),
"algorithm": (255, 250, 240, 100), # Floral White
}
default_color = (158, 158, 158, 100)
diff --git a/paddlex/inference/pipelines/paddleocr_vl/__init__.py b/paddlex/inference/pipelines/paddleocr_vl/__init__.py
new file mode 100644
index 0000000000..b50ca9d04f
--- /dev/null
+++ b/paddlex/inference/pipelines/paddleocr_vl/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pipeline import PaddleOCRVLPipeline
diff --git a/paddlex/inference/pipelines/paddleocr_vl/pipeline.py b/paddlex/inference/pipelines/paddleocr_vl/pipeline.py
new file mode 100644
index 0000000000..cff337ae79
--- /dev/null
+++ b/paddlex/inference/pipelines/paddleocr_vl/pipeline.py
@@ -0,0 +1,705 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import queue
+import threading
+import time
+from itertools import chain
+from typing import Any, Dict, Optional, Tuple, Union
+
+import numpy as np
+from PIL import Image
+
+from ....utils import logging
+from ....utils.deps import pipeline_requires_extra
+from ...common.batch_sampler import ImageBatchSampler
+from ...common.reader import ReadImage
+from ...utils.benchmark import benchmark
+from ...utils.hpi import HPIConfig
+from ...utils.pp_option import PaddlePredictorOption
+from .._parallel import AutoParallelImageSimpleInferencePipeline
+from ..base import BasePipeline
+from ..components import CropByBoxes
+from ..layout_parsing.utils import gather_imgs
+from .result import PaddleOCRVLBlock, PaddleOCRVLResult
+from .uilts import (
+ convert_otsl_to_html,
+ filter_overlap_boxes,
+ merge_blocks,
+ tokenize_figure_of_table,
+ truncate_repetitive_content,
+ untokenize_figure_of_table,
+)
+
+IMAGE_LABELS = ["image", "header_image", "footer_image", "seal"]
+
+
+@benchmark.time_methods
+class _PaddleOCRVLPipeline(BasePipeline):
+ """_PaddleOCRVLPipeline Pipeline"""
+
+ def __init__(
+ self,
+ config: Dict,
+ device: Optional[str] = None,
+ pp_option: Optional[PaddlePredictorOption] = None,
+ use_hpip: bool = False,
+ hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None,
+ ) -> None:
+ """
+ Initializes the class with given configurations and options.
+
+ Args:
+ config (Dict): Configuration dictionary containing various settings.
+ device (str, optional): Device to run the predictions on. Defaults to None.
+ pp_option (PaddlePredictorOption, optional): PaddlePredictor options. Defaults to None.
+ use_hpip (bool, optional): Whether to use the high-performance
+ inference plugin (HPIP) by default. Defaults to False.
+ hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
+ The default high-performance inference configuration dictionary.
+ Defaults to None.
+ """
+ super().__init__(
+ device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config
+ )
+
+ self.use_doc_preprocessor = config.get("use_doc_preprocessor", True)
+ if self.use_doc_preprocessor:
+ doc_preprocessor_config = config.get("SubPipelines", {}).get(
+ "DocPreprocessor",
+ {
+ "pipeline_config_error": "config error for doc_preprocessor_pipeline!"
+ },
+ )
+ self.doc_preprocessor_pipeline = self.create_pipeline(
+ doc_preprocessor_config
+ )
+
+ self.use_layout_detection = config.get("use_layout_detection", True)
+ if self.use_layout_detection:
+ layout_det_config = config.get("SubModules", {}).get(
+ "LayoutDetection",
+ {"model_config_error": "config error for layout_det_model!"},
+ )
+ model_name = layout_det_config.get("model_name", None)
+ assert (
+ model_name is not None and model_name == "PP-DocLayoutV2"
+ ), "model_name must be PP-DocLayoutV2"
+ layout_kwargs = {}
+ if (threshold := layout_det_config.get("threshold", None)) is not None:
+ layout_kwargs["threshold"] = threshold
+ if (layout_nms := layout_det_config.get("layout_nms", None)) is not None:
+ layout_kwargs["layout_nms"] = layout_nms
+ if (
+ layout_unclip_ratio := layout_det_config.get(
+ "layout_unclip_ratio", None
+ )
+ ) is not None:
+ layout_kwargs["layout_unclip_ratio"] = layout_unclip_ratio
+ if (
+ layout_merge_bboxes_mode := layout_det_config.get(
+ "layout_merge_bboxes_mode", None
+ )
+ ) is not None:
+ layout_kwargs["layout_merge_bboxes_mode"] = layout_merge_bboxes_mode
+ self.layout_det_model = self.create_model(
+ layout_det_config, **layout_kwargs
+ )
+
+ self.use_chart_recognition = config.get("use_chart_recognition", True)
+
+ vl_rec_config = config.get("SubModules", {}).get(
+ "VLRecognition",
+ {"model_config_error": "config error for vl_rec_model!"},
+ )
+
+ self.vl_rec_model = self.create_model(vl_rec_config)
+ self.format_block_content = config.get("format_block_content", False)
+
+ self.batch_sampler = ImageBatchSampler(batch_size=config.get("batch_size", 1))
+ self.img_reader = ReadImage(format="BGR")
+ self.crop_by_boxes = CropByBoxes()
+
+ self.use_queues = config.get("use_queues", False)
+
+ def close(self):
+ self.vl_rec_model.close()
+
+ def get_model_settings(
+ self,
+ use_doc_orientation_classify: Union[bool, None],
+ use_doc_unwarping: Union[bool, None],
+ use_layout_detection: Union[bool, None],
+ use_chart_recognition: Union[bool, None],
+ format_block_content: Union[bool, None],
+ ) -> dict:
+ """
+ Get the model settings based on the provided parameters or default values.
+
+ Args:
+ use_doc_orientation_classify (Union[bool, None]): Enables document orientation classification if True. Defaults to system setting if None.
+            use_doc_unwarping (Union[bool, None]): Enables document unwarping if True. Defaults to system setting if None.
+            use_layout_detection (Union[bool, None]): Enables layout detection if True. Defaults to system setting if None.
+            use_chart_recognition (Union[bool, None]): Enables chart recognition if True. Defaults to system setting if None.
+            format_block_content (Union[bool, None]): Enables block content formatting if True. Defaults to system setting if None.
+
+ Returns:
+ dict: A dictionary containing the model settings.
+
+ """
+ if use_doc_orientation_classify is None and use_doc_unwarping is None:
+ use_doc_preprocessor = self.use_doc_preprocessor
+ else:
+ if use_doc_orientation_classify is True or use_doc_unwarping is True:
+ use_doc_preprocessor = True
+ else:
+ use_doc_preprocessor = False
+
+ if use_layout_detection is None:
+ use_layout_detection = self.use_layout_detection
+
+ if use_chart_recognition is None:
+ use_chart_recognition = self.use_chart_recognition
+
+ if format_block_content is None:
+ format_block_content = self.format_block_content
+
+ return dict(
+ use_doc_preprocessor=use_doc_preprocessor,
+ use_layout_detection=use_layout_detection,
+ use_chart_recognition=use_chart_recognition,
+ format_block_content=format_block_content,
+ )
+
+ def check_model_settings_valid(self, input_params: dict) -> bool:
+ """
+ Check if the input parameters are valid based on the initialized models.
+
+ Args:
+ input_params (Dict): A dictionary containing input parameters.
+
+ Returns:
+ bool: True if all required models are initialized according to input parameters, False otherwise.
+ """
+
+ if input_params["use_doc_preprocessor"] and not self.use_doc_preprocessor:
+ logging.error(
+ "Set use_doc_preprocessor, but the models for doc preprocessor are not initialized.",
+ )
+ return False
+
+ return True
+
+ def get_layout_parsing_results(
+ self,
+ images,
+ layout_det_results,
+ imgs_in_doc,
+ use_chart_recognition=False,
+ vlm_kwargs=None,
+ ):
+ blocks = []
+ block_imgs = []
+ text_prompts = []
+ vlm_block_ids = []
+ figure_token_maps = []
+ drop_figures_set = set()
+ image_labels = (
+ IMAGE_LABELS if use_chart_recognition else IMAGE_LABELS + ["chart"]
+ )
+ for i, (image, layout_det_res, imgs_in_doc_for_img) in enumerate(
+ zip(images, layout_det_results, imgs_in_doc)
+ ):
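+            # Remove overlapping layout boxes, crop every block from the page, and
+            # merge adjacent crops; image-like blocks and tables are never merged.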
+ layout_det_res = filter_overlap_boxes(layout_det_res)
+ boxes = layout_det_res["boxes"]
+ blocks_for_img = self.crop_by_boxes(image, boxes)
+ blocks_for_img = merge_blocks(
+ blocks_for_img, non_merge_labels=image_labels + ["table"]
+ )
+ blocks.append(blocks_for_img)
+ for j, block in enumerate(blocks_for_img):
+ block_img = block["img"]
+ block_label = block["label"]
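+                # Non-image blocks are sent to the VL model with a task-specific
+                # prompt: plain OCR by default, or table/chart/formula recognition.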
+ if block_label not in image_labels and block_img is not None:
+ figure_token_map = {}
+ text_prompt = "OCR:"
+ drop_figures = []
+ if block_label == "table":
+ text_prompt = "Table Recognition:"
+ block_img, figure_token_map, drop_figures = (
+ tokenize_figure_of_table(
+ block_img, block["box"], imgs_in_doc_for_img
+ )
+ )
+ elif block_label == "chart" and use_chart_recognition:
+ text_prompt = "Chart Recognition:"
+ elif "formula" in block_label and block_label != "formula_number":
+ text_prompt = "Formula Recognition:"
+ block_imgs.append(block_img)
+ text_prompts.append(text_prompt)
+ figure_token_maps.append(figure_token_map)
+ vlm_block_ids.append((i, j))
+ drop_figures_set.update(drop_figures)
+
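+        # Default generation settings for the VL model; values in vlm_kwargs take precedence.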
+ kwargs = {
+ "use_cache": True,
+ "max_new_tokens": 4096,
+ **(vlm_kwargs or {}),
+ }
+ vl_rec_results = list(
+ self.vl_rec_model.predict(
+ [
+ {
+ "image": block_img,
+ "query": text_prompt,
+ }
+ for block_img, text_prompt in zip(block_imgs, text_prompts)
+ ],
+ skip_special_tokens=True,
+ **kwargs,
+ )
+ )
+
+ parsing_res_lists = []
+ table_res_lists = []
+ curr_vlm_block_idx = 0
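+        # Stitch the VL recognition results back onto their source blocks in page order.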
+ for i, blocks_for_img in enumerate(blocks):
+ parsing_res_list = []
+ table_res_list = []
+ for j, block in enumerate(blocks_for_img):
+ block_img = block["img"]
+ block_bbox = block["box"]
+ block_label = block["label"]
+ block_content = ""
+ if curr_vlm_block_idx < len(vlm_block_ids) and vlm_block_ids[
+ curr_vlm_block_idx
+ ] == (i, j):
+ vl_rec_result = vl_rec_results[curr_vlm_block_idx]
+ figure_token_map = figure_token_maps[curr_vlm_block_idx]
+ block_img4vl = block_imgs[curr_vlm_block_idx]
+ curr_vlm_block_idx += 1
+ vl_rec_result["image"] = block_img4vl
+ result_str = vl_rec_result.get("result", "")
+ if result_str is None:
+ result_str = ""
+ result_str = truncate_repetitive_content(result_str)
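+ # Normalize LaTeX delimiters: drop stray '$' when \( \) or \[ \] pairs are present, then map them to '$' / '$$'.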
+ if ("\\(" in result_str and "\\)" in result_str) or (
+ "\\[" in result_str and "\\]" in result_str
+ ):
+ result_str = result_str.replace("$", "")
+
+ result_str = (
+ result_str.replace("\\(", " $ ")
+ .replace("\\)", " $ ")
+ .replace("\\[", " $$ ")
+ .replace("\\]", " $$ ")
+ )
+ if block_label == "formula_number":
+ result_str = result_str.replace("$", "")
+ if block_label == "table":
+ html_str = convert_otsl_to_html(result_str)
+ if html_str != "":
+ result_str = html_str
+ result_str = untokenize_figure_of_table(
+ result_str, figure_token_map
+ )
+
+ block_content = result_str
+
+ block_info = PaddleOCRVLBlock(
+ label=block_label,
+ bbox=block_bbox,
+ content=block_content,
+ )
+ if block_label in image_labels and block_img is not None:
+ x_min, y_min, x_max, y_max = list(map(int, block_bbox))
+ img_path = f"imgs/img_in_{block_label}_box_{x_min}_{y_min}_{x_max}_{y_max}.jpg"
+ if img_path not in drop_figures_set:
+ import cv2
+
+ block_img = cv2.cvtColor(block_img, cv2.COLOR_BGR2RGB)
+ block_info.image = {
+ "path": img_path,
+ "img": Image.fromarray(block_img),
+ }
+ else:
+ continue
+
+ parsing_res_list.append(block_info)
+ parsing_res_lists.append(parsing_res_list)
+ table_res_lists.append(table_res_list)
+
+ return parsing_res_lists, table_res_lists, imgs_in_doc
+
+ def predict(
+ self,
+ input: Union[str, list[str], np.ndarray, list[np.ndarray]],
+ use_doc_orientation_classify: Union[bool, None] = False,
+ use_doc_unwarping: Union[bool, None] = False,
+ use_layout_detection: Union[bool, None] = None,
+ use_chart_recognition: Union[bool, None] = None,
+ layout_threshold: Optional[Union[float, dict]] = None,
+ layout_nms: Optional[bool] = None,
+ layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None,
+ layout_merge_bboxes_mode: Optional[str] = None,
+ use_queues: Optional[bool] = None,
+ prompt_label: Optional[Union[str, None]] = None,
+ format_block_content: Union[bool, None] = None,
+ repetition_penalty: Optional[float] = None,
+ temperature: Optional[float] = None,
+ top_p: Optional[float] = None,
+ min_pixels: Optional[int] = None,
+ max_pixels: Optional[int] = None,
+ **kwargs,
+ ) -> PaddleOCRVLResult:
+ """
+ Predicts the layout parsing result for the given input.
+
+ Args:
+ input (Union[str, list[str], np.ndarray, list[np.ndarray]]): Input image path, list of image paths,
+ numpy array of an image, or list of numpy arrays.
+ use_doc_orientation_classify (Optional[bool]): Whether to use document orientation classification.
+ use_doc_unwarping (Optional[bool]): Whether to use document unwarping.
+ layout_threshold (Optional[float]): The threshold value to filter out low-confidence predictions. Default is None.
+ layout_nms (Optional[bool]): Whether to use layout-aware NMS. Defaults to None.
+ layout_unclip_ratio (Optional[Union[float, Tuple[float, float]]], optional): The ratio of unclipping the bounding box.
+ Defaults to None.
+ If it's a single number, then both width and height are used.
+ If it's a tuple of two numbers, then they are used separately for width and height respectively.
+ If it's None, then no unclipping will be performed.
+ layout_merge_bboxes_mode (Optional[str], optional): The mode for merging bounding boxes. Defaults to None.
+ use_layout_detection (Optional[bool]): Whether to use layout detection. Defaults to None.
+ use_chart_recognition (Optional[bool]): Whether to recognize charts with the VLM instead of treating them as images. Defaults to None.
+ use_queues (Optional[bool]): Whether to overlap data loading, CV, and VLM stages with worker threads and queues. Defaults to None.
+ prompt_label (Optional[str]): Recognition task applied to the whole page when layout detection is disabled; one of 'ocr', 'formula', 'table', 'chart'. Defaults to None.
+ format_block_content (Optional[bool]): Whether to apply formatting functions to block content in the JSON output. Defaults to None.
+ repetition_penalty (Optional[float]): Repetition penalty passed to the VLM. Defaults to None.
+ temperature (Optional[float]): Sampling temperature passed to the VLM. Defaults to None.
+ top_p (Optional[float]): Top-p value passed to the VLM. Defaults to None.
+ min_pixels (Optional[int]): Minimum pixel count passed to the VLM. Defaults to None.
+ max_pixels (Optional[int]): Maximum pixel count passed to the VLM. Defaults to None.
+ **kwargs (Any): Additional settings to extend functionality.
+
+ Returns:
+ PaddleOCRVLResult: The predicted layout parsing result.
+ """
+ model_settings = self.get_model_settings(
+ use_doc_orientation_classify,
+ use_doc_unwarping,
+ use_layout_detection,
+ use_chart_recognition,
+ format_block_content,
+ )
+
+ if not self.check_model_settings_valid(model_settings):
+ yield {"error": "the input params for model settings are invalid!"}
+
+ if use_queues is None:
+ use_queues = self.use_queues
+
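+ # Without layout detection, the whole page is handled as a single block whose task is given by prompt_label.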
+ if not model_settings["use_layout_detection"]:
+ prompt_label = prompt_label if prompt_label else "ocr"
+ if prompt_label.lower() == "chart":
+ model_settings["use_chart_recognition"] = True
+ assert prompt_label.lower() in [
+ "ocr",
+ "formula",
+ "table",
+ "chart",
+ ], f"Layout detection is disabled (use_layout_detection=False). 'prompt_label' must be one of ['ocr', 'formula', 'table', 'chart'], but got '{prompt_label}'."
+
+ def _process_cv(batch_data, new_batch_size=None):
+ if not new_batch_size:
+ new_batch_size = len(batch_data)
+
+ for idx in range(0, len(batch_data), new_batch_size):
+ instances = batch_data.instances[idx : idx + new_batch_size]
+ input_paths = batch_data.input_paths[idx : idx + new_batch_size]
+ page_indexes = batch_data.page_indexes[idx : idx + new_batch_size]
+
+ image_arrays = self.img_reader(instances)
+
+ if model_settings["use_doc_preprocessor"]:
+ doc_preprocessor_results = list(
+ self.doc_preprocessor_pipeline(
+ image_arrays,
+ use_doc_orientation_classify=use_doc_orientation_classify,
+ use_doc_unwarping=use_doc_unwarping,
+ )
+ )
+ else:
+ doc_preprocessor_results = [
+ {"output_img": arr} for arr in image_arrays
+ ]
+
+ doc_preprocessor_images = [
+ item["output_img"] for item in doc_preprocessor_results
+ ]
+
+ if model_settings["use_layout_detection"]:
+ layout_det_results = list(
+ self.layout_det_model(
+ doc_preprocessor_images,
+ threshold=layout_threshold,
+ layout_nms=layout_nms,
+ layout_unclip_ratio=layout_unclip_ratio,
+ layout_merge_bboxes_mode=layout_merge_bboxes_mode,
+ )
+ )
+
+ imgs_in_doc = [
+ gather_imgs(doc_pp_img, layout_det_res["boxes"])
+ for doc_pp_img, layout_det_res in zip(
+ doc_preprocessor_images, layout_det_results
+ )
+ ]
+ else:
+ layout_det_results = []
+ for doc_preprocessor_image in doc_preprocessor_images:
+ layout_det_results.append(
+ {
+ "input_path": None,
+ "page_index": None,
+ "boxes": [
+ {
+ "cls_id": 0,
+ "label": prompt_label.lower(),
+ "score": 1,
+ "coordinate": [
+ 0,
+ 0,
+ doc_preprocessor_image.shape[1],
+ doc_preprocessor_image.shape[0],
+ ],
+ }
+ ],
+ }
+ )
+ imgs_in_doc = [[] for _ in layout_det_results]
+
+ yield input_paths, page_indexes, doc_preprocessor_images, doc_preprocessor_results, layout_det_results, imgs_in_doc
+
+ def _process_vlm(results_cv):
+ (
+ input_paths,
+ page_indexes,
+ doc_preprocessor_images,
+ doc_preprocessor_results,
+ layout_det_results,
+ imgs_in_doc,
+ ) = results_cv
+
+ parsing_res_lists, table_res_lists, imgs_in_doc = (
+ self.get_layout_parsing_results(
+ doc_preprocessor_images,
+ layout_det_results,
+ imgs_in_doc,
+ model_settings["use_chart_recognition"],
+ {
+ "repetition_penalty": repetition_penalty,
+ "temperature": temperature,
+ "top_p": top_p,
+ "min_pixels": min_pixels,
+ "max_pixels": max_pixels,
+ },
+ )
+ )
+
+ for (
+ input_path,
+ page_index,
+ doc_preprocessor_image,
+ doc_preprocessor_res,
+ layout_det_res,
+ table_res_list,
+ parsing_res_list,
+ imgs_in_doc_for_img,
+ ) in zip(
+ input_paths,
+ page_indexes,
+ doc_preprocessor_images,
+ doc_preprocessor_results,
+ layout_det_results,
+ table_res_lists,
+ parsing_res_lists,
+ imgs_in_doc,
+ ):
+ single_img_res = {
+ "input_path": input_path,
+ "page_index": page_index,
+ "doc_preprocessor_res": doc_preprocessor_res,
+ "layout_det_res": layout_det_res,
+ "table_res_list": table_res_list,
+ "parsing_res_list": parsing_res_list,
+ "imgs_in_doc": imgs_in_doc_for_img,
+ "model_settings": model_settings,
+ }
+ yield PaddleOCRVLResult(single_img_res)
+
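+ # Pipelined mode: three worker threads (data loading, CV models, VLM) exchange work through
+ # bounded queues so the stages can overlap across batches.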
+ if use_queues:
+ max_num_batches_in_process = 64
+ queue_input = queue.Queue(maxsize=max_num_batches_in_process)
+ queue_cv = queue.Queue(maxsize=max_num_batches_in_process)
+ queue_vlm = queue.Queue(
+ maxsize=self.batch_sampler.batch_size * max_num_batches_in_process
+ )
+ event_shutdown = threading.Event()
+ event_data_loading_done = threading.Event()
+ event_cv_processing_done = threading.Event()
+ event_vlm_processing_done = threading.Event()
+
+ def _worker_input(input_):
+ all_batch_data = self.batch_sampler(input_)
+ while not event_shutdown.is_set():
+ try:
+ batch_data = next(all_batch_data)
+ except StopIteration:
+ break
+ except Exception as e:
+ queue_input.put((False, "input", e))
+ break
+ else:
+ queue_input.put((True, batch_data))
+ event_data_loading_done.set()
+
+ def _worker_cv():
+ while not event_shutdown.is_set():
+ try:
+ item = queue_input.get(timeout=0.5)
+ except queue.Empty:
+ if event_data_loading_done.is_set():
+ event_cv_processing_done.set()
+ break
+ continue
+ if not item[0]:
+ queue_cv.put(item)
+ break
+ try:
+ for results_cv in _process_cv(
+ item[1],
+ (
+ self.layout_det_model.batch_sampler.batch_size
+ if model_settings["use_layout_detection"]
+ else None
+ ),
+ ):
+ queue_cv.put((True, results_cv))
+ except Exception as e:
+ queue_cv.put((False, "cv", e))
+ break
+
+ def _worker_vlm():
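+ # Accumulate CV results until the layout-box budget (the VLM batch size) is reached
+ # or the queue delay expires, then run the VLM on the merged batch.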
+ MAX_QUEUE_DELAY_SECS = 0.5
+ MAX_NUM_BOXES = self.vl_rec_model.batch_sampler.batch_size
+
+ while not event_shutdown.is_set():
+ results_cv_list = []
+ start_time = time.time()
+ should_break = False
+ num_boxes = 0
+ while True:
+ remaining_time = MAX_QUEUE_DELAY_SECS - (
+ time.time() - start_time
+ )
+ if remaining_time <= 0:
+ break
+ try:
+ item = queue_cv.get(timeout=remaining_time)
+ except queue.Empty:
+ break
+ if not item[0]:
+ queue_vlm.put(item)
+ should_break = True
+ break
+ results_cv_list.append(item[1])
+ for res in results_cv_list[-1][4]:
+ num_boxes += len(res["boxes"])
+ if num_boxes >= MAX_NUM_BOXES:
+ break
+ if should_break:
+ break
+ if not results_cv_list:
+ if event_cv_processing_done.is_set():
+ event_vlm_processing_done.set()
+ break
+ continue
+
+ merged_results_cv = [
+ list(chain.from_iterable(lists))
+ for lists in zip(*results_cv_list)
+ ]
+
+ try:
+ for result_vlm in _process_vlm(merged_results_cv):
+ queue_vlm.put((True, result_vlm))
+ except Exception as e:
+ queue_vlm.put((False, "vlm", e))
+ break
+
+ thread_input = threading.Thread(
+ target=_worker_input, args=(input,), daemon=False
+ )
+ thread_input.start()
+ thread_cv = threading.Thread(target=_worker_cv, daemon=False)
+ thread_cv.start()
+ thread_vlm = threading.Thread(target=_worker_vlm, daemon=False)
+ thread_vlm.start()
+
+ try:
+ if use_queues:
+ while not (event_vlm_processing_done.is_set() and queue_vlm.empty()):
+ try:
+ item = queue_vlm.get(timeout=0.5)
+ except queue.Empty:
+ if event_vlm_processing_done.is_set():
+ break
+ continue
+ if not item[0]:
+ raise RuntimeError(
+ f"Exception from the '{item[1]}' worker: {item[2]}"
+ )
+ else:
+ yield item[1]
+ else:
+ for batch_data in self.batch_sampler(input):
+ results_cv_list = list(_process_cv(batch_data))
+ assert len(results_cv_list) == 1, len(results_cv_list)
+ results_cv = results_cv_list[0]
+ for res in _process_vlm(results_cv):
+ yield res
+ finally:
+ if use_queues:
+ event_shutdown.set()
+ thread_cv.join(timeout=5)
+ if thread_cv.is_alive():
+ logging.warning("CV worker did not terminate in time")
+ thread_vlm.join(timeout=5)
+ if thread_vlm.is_alive():
+ logging.warning("VLM worker did not terminate in time")
+
+ def concatenate_markdown_pages(self, markdown_list: list) -> str:
+ """
+ Concatenate Markdown content from multiple pages into a single document.
+
+ Args:
+ markdown_list (list): A list containing Markdown data for each page.
+
+ Returns:
+ str: The concatenated Markdown text of all pages.
+ """
+ markdown_texts = ""
+
+ for res in markdown_list:
+ markdown_texts += "\n\n" + res["markdown_texts"]
+
+ return markdown_texts
+
+
+@pipeline_requires_extra("ocr")
+class PaddleOCRVLPipeline(AutoParallelImageSimpleInferencePipeline):
+ entities = "PaddleOCR-VL"
+
+ @property
+ def _pipeline_cls(self):
+ return _PaddleOCRVLPipeline
+
+ def _get_batch_size(self, config):
+ return config.get("batch_size", 1)
diff --git a/paddlex/inference/pipelines/paddleocr_vl/result.py b/paddlex/inference/pipelines/paddleocr_vl/result.py
new file mode 100644
index 0000000000..cef785acce
--- /dev/null
+++ b/paddlex/inference/pipelines/paddleocr_vl/result.py
@@ -0,0 +1,493 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from functools import partial
+
+import numpy as np
+from PIL import Image, ImageDraw, ImageFont
+
+from ....utils.fonts import PINGFANG_FONT
+from ...common.result import (
+ BaseCVResult,
+ HtmlMixin,
+ JsonMixin,
+ MarkdownMixin,
+ XlsxMixin,
+)
+from ..layout_parsing.result_v2 import (
+ format_centered_by_html,
+ format_first_line_func,
+ format_image_plain_func,
+ format_image_scaled_by_html_func,
+ format_text_plain_func,
+ format_title_func,
+ simplify_table_func,
+)
+
+VISUALIZE_INDEX_LABELS = [
+ "text",
+ "formula",
+ "inline_formula",
+ "display_formula",
+ "algorithm",
+ "reference",
+ "reference_content",
+ "content",
+ "abstract",
+ "paragraph_title",
+ "doc_title",
+ "vertical_text",
+ "ocr",
+]
+
+
+class PaddleOCRVLBlock(object):
+ """PaddleOCRVL Block Class"""
+
+ def __init__(self, label, bbox, content="") -> None:
+ """
+ Initialize a PaddleOCRVLBlock object.
+
+ Args:
+ label (str): Label assigned to the block.
+ bbox (list): Bounding box coordinates of the block.
+ content (str, optional): Content of the block. Defaults to an empty string.
+ """
+ self.label = label
+ self.bbox = list(map(int, bbox))
+ self.content = content
+ self.image = None
+
+ def __str__(self) -> str:
+ """
+ Return a string representation of the block.
+ """
+ _str = f"\n\n#################\nlabel:\t{self.label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
+ return _str
+
+ def __repr__(self) -> str:
+ """
+ Return a string representation of the block.
+ """
+ _str = f"\n\n#################\nlabel:\t{self.label}\nbbox:\t{self.bbox}\ncontent:\t{self.content}\n#################"
+ return _str
+
+
+def merge_formula_and_number(formula, formula_number):
+ """
+ Merge a formula and its formula number for display.
+
+ Args:
+ formula (str): The formula string.
+ formula_number (str): The formula number string.
+
+ Returns:
+ str: The merged formula with tag.
+ """
+ formula = formula.replace("$$", "")
+ merge_formula = r"{} \tag*{{{}}}".format(formula, formula_number)
+ return f"$${merge_formula}$$"
+
+
+def format_chart2table_func(block):
+ lines_list = block.content.split("\n")
+ # Extract the header row and the data rows
+ header = lines_list[0].split("|")
+ rows = [line.split("|") for line in lines_list[1:]]
+ # Build an HTML table
+ html = "<table>\n"
+ html += (
+ " <tr>"
+ + "".join(f"<th>{cell.strip()}</th>" for cell in header)
+ + "</tr>\n"
+ )
+ html += " <tbody>\n"
+ for row in rows:
+ html += (
+ " <tr>"
+ + "".join(f"<td>{cell.strip()}</td>" for cell in row)
+ + "</tr>\n"
+ )
+ html += " </tbody>\n"
+ html += "</table>"
+ return html
+
+
+def format_table_center_func(block):
+ tabel_content = block.content
+ tabel_content = tabel_content.replace(
+ "", ""
+ )
+ tabel_content = tabel_content.replace("| ", " | ")
+ tabel_content = tabel_content.replace(" | ", " | ")
+ return tabel_content
+
+
+def build_handle_funcs_dict(
+ *,
+ text_func,
+ image_func,
+ chart_func,
+ table_func,
+ formula_func,
+ seal_func,
+):
+ """
+ Build a dictionary mapping block labels to their formatting functions.
+
+ Args:
+ text_func: Function to format text blocks.
+ image_func: Function to format image blocks.
+ chart_func: Function to format chart blocks.
+ table_func: Function to format table blocks.
+ formula_func: Function to format formula blocks.
+ seal_func: Function to format seal blocks.
+
+ Returns:
+ dict: A mapping from block label to handler function.
+ """
+ return {
+ "paragraph_title": format_title_func,
+ "abstract_title": format_title_func,
+ "reference_title": format_title_func,
+ "content_title": format_title_func,
+ "doc_title": lambda block: f"# {block.content}".replace("-\n", "").replace(
+ "\n", " "
+ ),
+ "table_title": text_func,
+ "figure_title": text_func,
+ "chart_title": text_func,
+ "vision_footnote": lambda block: block.content.replace("\n\n", "\n").replace(
+ "\n", "\n\n"
+ ),
+ "text": lambda block: block.content.replace("\n\n", "\n").replace("\n", "\n\n"),
+ "ocr": lambda block: block.content.replace("\n\n", "\n").replace("\n", "\n\n"),
+ "vertical_text": lambda block: block.content.replace("\n\n", "\n").replace(
+ "\n", "\n\n"
+ ),
+ "reference_content": lambda block: block.content.replace("\n\n", "\n").replace(
+ "\n", "\n\n"
+ ),
+ "abstract": partial(
+ format_first_line_func,
+ templates=["摘要", "abstract"],
+ format_func=lambda l: f"## {l}\n",
+ spliter=" ",
+ ),
+ "content": lambda block: block.content.replace("-\n", " \n").replace(
+ "\n", " \n"
+ ),
+ "image": image_func,
+ "chart": chart_func,
+ "formula": formula_func,
+ "display_formula": formula_func,
+ "inline_formula": formula_func,
+ "table": table_func,
+ "reference": partial(
+ format_first_line_func,
+ templates=["参考文献", "references"],
+ format_func=lambda l: f"## {l}",
+ spliter="\n",
+ ),
+ "algorithm": lambda block: block.content.strip("\n"),
+ "seal": seal_func,
+ }
+
+
+class PaddleOCRVLResult(BaseCVResult, HtmlMixin, XlsxMixin, MarkdownMixin):
+ """
+ PaddleOCRVLResult class for holding and formatting OCR/VL parsing results.
+ """
+
+ def __init__(self, data) -> None:
+ """
+ Initializes a new instance of the class with the specified data.
+
+ Args:
+ data: The input data for the parsing result.
+ """
+ super().__init__(data)
+ HtmlMixin.__init__(self)
+ XlsxMixin.__init__(self)
+ MarkdownMixin.__init__(self)
+ JsonMixin.__init__(self)
+
+ def _to_img(self) -> dict[str, np.ndarray]:
+ """
+ Convert the parsing result to a dictionary of images.
+
+ Returns:
+ dict: Keys are names, values are numpy arrays (images).
+ """
+ from ..layout_parsing.utils import get_show_color
+
+ res_img_dict = {}
+ model_settings = self["model_settings"]
+ if model_settings["use_doc_preprocessor"]:
+ for key, value in self["doc_preprocessor_res"].img.items():
+ res_img_dict[key] = value
+ if self["model_settings"]["use_layout_detection"]:
+ res_img_dict["layout_det_res"] = self["layout_det_res"].img["res"]
+
+ # for layout ordering image
+ image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
+ draw = ImageDraw.Draw(image, "RGBA")
+ font_size = int(0.018 * int(image.width)) + 2
+ font = ImageFont.truetype(PINGFANG_FONT.path, font_size, encoding="utf-8")
+ parsing_result = self["parsing_res_list"]
+ order_index = 0
+ for block in parsing_result:
+ bbox = block.bbox
+ label = block.label
+ fill_color = get_show_color(label, False)
+ draw.rectangle(bbox, fill=fill_color)
+ if label in VISUALIZE_INDEX_LABELS:
+ text_position = (bbox[2] + 2, bbox[1] - font_size // 2)
+ if int(image.width) - bbox[2] < font_size:
+ text_position = (
+ int(bbox[2] - font_size * 1.1),
+ bbox[1] - font_size // 2,
+ )
+ draw.text(text_position, str(order_index + 1), font=font, fill="red")
+ order_index += 1
+
+ res_img_dict["layout_order_res"] = image
+
+ return res_img_dict
+
+ def _to_html(self) -> dict[str, str]:
+ """
+ Converts the prediction to its corresponding HTML representation.
+
+ Returns:
+ dict: The str type HTML representation result.
+ """
+ res_html_dict = {}
+ if len(self["table_res_list"]) > 0:
+ for sno in range(len(self["table_res_list"])):
+ table_res = self["table_res_list"][sno]
+ table_region_id = table_res["table_region_id"]
+ key = f"table_{table_region_id}"
+ res_html_dict[key] = table_res.html["pred"]
+ return res_html_dict
+
+ def _to_xlsx(self) -> dict[str, str]:
+ """
+ Converts the prediction HTML to an XLSX file path.
+
+ Returns:
+ dict: The str type XLSX representation result.
+ """
+ res_xlsx_dict = {}
+ if len(self["table_res_list"]) > 0:
+ for sno in range(len(self["table_res_list"])):
+ table_res = self["table_res_list"][sno]
+ table_region_id = table_res["table_region_id"]
+ key = f"table_{table_region_id}"
+ res_xlsx_dict[key] = table_res.xlsx["pred"]
+ return res_xlsx_dict
+
+ def _to_str(self, *args, **kwargs) -> dict[str, str]:
+ """
+ Converts the instance's attributes to a dictionary and then to a string.
+
+ Args:
+ *args: Additional positional arguments passed to the base class method.
+ **kwargs: Additional keyword arguments passed to the base class method.
+
+ Returns:
+ dict: A dictionary with the instance's attributes converted to strings.
+ """
+ data = {}
+ data["input_path"] = self["input_path"]
+ data["page_index"] = self["page_index"]
+ model_settings = self["model_settings"]
+ data["model_settings"] = model_settings
+ if self["model_settings"]["use_doc_preprocessor"]:
+ data["doc_preprocessor_res"] = self["doc_preprocessor_res"].str["res"]
+ if self["model_settings"]["use_layout_detection"]:
+ data["layout_det_res"] = self["layout_det_res"].str["res"]
+ return JsonMixin._to_str(data, *args, **kwargs)
+
+ def _to_json(self, *args, **kwargs) -> dict[str, str]:
+ """
+ Converts the object's data to a JSON dictionary.
+
+ Args:
+ *args: Positional arguments passed to the JsonMixin._to_json method.
+ **kwargs: Keyword arguments passed to the JsonMixin._to_json method.
+
+ Returns:
+ dict: A dictionary containing the object's data in JSON format.
+ """
+ data = {}
+ data["input_path"] = self["input_path"]
+ data["page_index"] = self["page_index"]
+ model_settings = self["model_settings"]
+ data["model_settings"] = model_settings
+ if self["model_settings"].get("format_block_content", False):
+ original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
+ format_text_func = lambda block: format_centered_by_html(
+ format_text_plain_func(block)
+ )
+ format_image_func = lambda block: format_centered_by_html(
+ format_image_scaled_by_html_func(
+ block,
+ original_image_width=original_image_width,
+ )
+ )
+
+ if self["model_settings"].get("use_chart_recognition", False):
+ format_chart_func = format_chart2table_func
+ else:
+ format_chart_func = format_image_func
+
+ format_seal_func = format_image_func
+
+ format_table_func = lambda block: "\n" + format_table_center_func(block)
+ format_formula_func = lambda block: block.content
+
+ handle_funcs_dict = build_handle_funcs_dict(
+ text_func=format_text_func,
+ image_func=format_image_func,
+ chart_func=format_chart_func,
+ table_func=format_table_func,
+ formula_func=format_formula_func,
+ seal_func=format_seal_func,
+ )
+
+ parsing_res_list = self["parsing_res_list"]
+ parsing_res_list_json = []
+ order_index = 1
+ for idx, parsing_res in enumerate(parsing_res_list):
+ label = parsing_res.label
+ if label in VISUALIZE_INDEX_LABELS:
+ order = order_index
+ order_index += 1
+ else:
+ order = None
+ res_dict = {
+ "block_label": parsing_res.label,
+ "block_content": parsing_res.content,
+ "block_bbox": parsing_res.bbox,
+ "block_id": idx,
+ "block_order": order,
+ }
+ if self["model_settings"].get("format_block_content", False):
+ if handle_funcs_dict.get(parsing_res.label):
+ res_dict["block_content"] = handle_funcs_dict[parsing_res.label](
+ parsing_res
+ )
+ else:
+ res_dict["block_content"] = parsing_res.content
+
+ parsing_res_list_json.append(res_dict)
+ data["parsing_res_list"] = parsing_res_list_json
+ if self["model_settings"]["use_doc_preprocessor"]:
+ data["doc_preprocessor_res"] = self["doc_preprocessor_res"].json["res"]
+ if self["model_settings"]["use_layout_detection"]:
+ data["layout_det_res"] = self["layout_det_res"].json["res"]
+ return JsonMixin._to_json(data, *args, **kwargs)
+
+ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict:
+ """
+ Convert the parsing result to Markdown.
+
+ Args:
+ pretty (bool, optional): Whether to prettify the Markdown output with HTML. Defaults to True.
+ show_formula_number (bool, optional): Whether to append formula numbers to formulas. Defaults to False.
+
+ Returns:
+ dict: Markdown information with text and images.
+ """
+ original_image_width = self["doc_preprocessor_res"]["output_img"].shape[1]
+
+ if pretty:
+ format_text_func = lambda block: format_centered_by_html(
+ format_text_plain_func(block)
+ )
+ format_image_func = lambda block: format_centered_by_html(
+ format_image_scaled_by_html_func(
+ block,
+ original_image_width=original_image_width,
+ )
+ )
+ else:
+ format_text_func = lambda block: block.content
+ format_image_func = format_image_plain_func
+
+ format_chart_func = (
+ format_chart2table_func
+ if self["model_settings"]["use_chart_recognition"]
+ else format_image_func
+ )
+
+ if pretty:
+ format_table_func = lambda block: "\n" + format_table_center_func(block)
+ else:
+ format_table_func = lambda block: simplify_table_func("\n" + block.content)
+
+ format_formula_func = lambda block: block.content
+ format_seal_func = format_image_func
+
+ handle_funcs_dict = build_handle_funcs_dict(
+ text_func=format_text_func,
+ image_func=format_image_func,
+ chart_func=format_chart_func,
+ table_func=format_table_func,
+ formula_func=format_formula_func,
+ seal_func=format_seal_func,
+ )
+
+ markdown_content = ""
+ markdown_info = {}
+ markdown_info["markdown_images"] = {}
+ for idx, block in enumerate(self["parsing_res_list"]):
+ label = block.label
+ if block.image is not None:
+ markdown_info["markdown_images"][block.image["path"]] = block.image[
+ "img"
+ ]
+ handle_func = handle_funcs_dict.get(label, None)
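+ # If show_formula_number is enabled and the following block is a formula_number,
+ # fold it into the current formula as a \tag*{} annotation.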
+ if (
+ show_formula_number
+ and (label == "display_formula" or label == "formula")
+ and idx != len(self["parsing_res_list"]) - 1
+ ):
+ next_block = self["parsing_res_list"][idx + 1]
+ next_block_label = next_block.label
+ if next_block_label == "formula_number":
+ block.content = merge_formula_and_number(
+ block.content, next_block.content
+ )
+ if handle_func:
+ markdown_content += (
+ "\n\n" + handle_func(block)
+ if markdown_content
+ else handle_func(block)
+ )
+
+ markdown_info["page_index"] = self["page_index"]
+ markdown_info["input_path"] = self["input_path"]
+ markdown_info["markdown_texts"] = markdown_content
+ for img in self["imgs_in_doc"]:
+ markdown_info["markdown_images"][img["path"]] = img["img"]
+
+ return markdown_info
diff --git a/paddlex/inference/pipelines/paddleocr_vl/uilts.py b/paddlex/inference/pipelines/paddleocr_vl/uilts.py
new file mode 100644
index 0000000000..0f214b94ee
--- /dev/null
+++ b/paddlex/inference/pipelines/paddleocr_vl/uilts.py
@@ -0,0 +1,925 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import html
+import itertools
+import math
+import re
+from collections import Counter
+from copy import deepcopy
+from typing import Any, Dict, List, Tuple
+
+import numpy as np
+from PIL import Image
+from pydantic import BaseModel, computed_field, model_validator
+
+from ..layout_parsing.utils import (
+ calculate_bbox_area,
+ calculate_overlap_ratio,
+ calculate_projection_overlap_ratio,
+)
+
+
+def filter_overlap_boxes(
+ layout_det_res: Dict[str, List[Dict]]
+) -> Dict[str, List[Dict]]:
+ """
+ Remove overlapping boxes from layout detection results based on a given overlap ratio.
+
+ Args:
+ layout_det_res (Dict[str, List[Dict]]): Layout detection result dict containing a 'boxes' list.
+
+ Returns:
+ Dict[str, List[Dict]]: Filtered dict with overlapping boxes removed.
+ """
+ layout_det_res_filtered = deepcopy(layout_det_res)
+ boxes = [
+ box for box in layout_det_res_filtered["boxes"] if box["label"] != "reference"
+ ]
+ dropped_indexes = set()
+
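+ # For pairs overlapping by more than 0.7 of the smaller box, keep the larger box,
+ # unless exactly one of the two is an image.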
+ for i in range(len(boxes)):
+ for j in range(i + 1, len(boxes)):
+ if i in dropped_indexes or j in dropped_indexes:
+ continue
+ overlap_ratio = calculate_overlap_ratio(
+ boxes[i]["coordinate"], boxes[j]["coordinate"], "small"
+ )
+ if overlap_ratio > 0.7:
+ box_area_i = calculate_bbox_area(boxes[i]["coordinate"])
+ box_area_j = calculate_bbox_area(boxes[j]["coordinate"])
+ if (
+ boxes[i]["label"] == "image" or boxes[j]["label"] == "image"
+ ) and boxes[i]["label"] != boxes[j]["label"]:
+ continue
+ if box_area_i >= box_area_j:
+ dropped_indexes.add(j)
+ else:
+ dropped_indexes.add(i)
+ layout_det_res_filtered["boxes"] = [
+ box for idx, box in enumerate(boxes) if idx not in dropped_indexes
+ ]
+ return layout_det_res_filtered
+
+
+def to_pil_image(img):
+ """
+ Convert the input to a PIL Image.
+
+ Args:
+ img (PIL.Image or numpy.ndarray): Input image.
+
+ Returns:
+ PIL.Image: PIL Image object.
+ """
+ if isinstance(img, Image.Image):
+ return img
+ return Image.fromarray(img)
+
+
+def to_np_array(img):
+ """
+ Convert the input to a numpy array.
+
+ Args:
+ img (PIL.Image or numpy.ndarray): Input image.
+
+ Returns:
+ numpy.ndarray: Numpy array image.
+ """
+ if isinstance(img, Image.Image):
+ return np.array(img)
+ return img
+
+
+def calc_merged_wh(images):
+ """
+ Calculate width (max of all) and height (sum) for a vertical merge of images.
+
+ Args:
+ images (List[PIL.Image or np.ndarray]): List of images.
+
+ Returns:
+ Tuple[int, int]: (width, height) of merged image.
+ """
+ widths = [to_pil_image(img).width for img in images]
+ heights = [to_pil_image(img).height for img in images]
+ w = max(widths)
+ h = sum(heights)
+ return w, h
+
+
+def merge_images(images, aligns="center"):
+ """
+ Merge images vertically with given alignment.
+
+ Args:
+ images (List[PIL.Image or np.ndarray]): List of images to merge.
+ aligns (str or List[str]): Alignment(s) for each merge step ('center', 'right', 'left').
+
+ Returns:
+ np.ndarray: Merged image as numpy array.
+ """
+ if not images:
+ return None
+ if len(images) == 1:
+ return to_np_array(images[0])
+ if isinstance(aligns, str):
+ aligns = [aligns] * (len(images) - 1)
+ if len(aligns) != len(images) - 1:
+ raise ValueError("The length of aligns must be len(images) - 1")
+ merged = to_pil_image(images[0])
+ for i in range(1, len(images)):
+ img2 = to_pil_image(images[i])
+ align = aligns[i - 1]
+ w = max(merged.width, img2.width)
+ h = merged.height + img2.height
+ new_img = Image.new("RGB", (w, h), (255, 255, 255))
+ if align == "center":
+ x1 = (w - merged.width) // 2
+ x2 = (w - img2.width) // 2
+ elif align == "right":
+ x1 = w - merged.width
+ x2 = w - img2.width
+ else: # left
+ x1 = x2 = 0
+ new_img.paste(merged, (x1, 0))
+ new_img.paste(img2, (x2, merged.height))
+ merged = new_img
+ return to_np_array(merged)
+
+
+def merge_blocks(blocks, non_merge_labels):
+ """
+ Merge blocks based on alignment and overlap logic, except for those with labels in non_merge_labels.
+
+ Args:
+ blocks (List[Dict]): List of block dicts.
+ non_merge_labels (List[str]): Block labels that should not be merged.
+
+ Returns:
+ List[Dict]: List of processed (and possibly merged) blocks.
+ """
+ blocks_to_merge = []
+ non_merge_blocks = {}
+ for idx, block in enumerate(blocks):
+ if block["label"] in non_merge_labels:
+ non_merge_blocks[idx] = block
+ else:
+ blocks_to_merge.append((idx, block))
+
+ merged_groups = []
+ current_group = []
+ current_indices = []
+ current_aligns = []
+
+ def is_aligned(a1, a2):
+ return abs(a1 - a2) <= 5
+
+ def get_alignment(block_bbox, prev_bbox):
+ if is_aligned(block_bbox[0], prev_bbox[0]):
+ return "left"
+ elif is_aligned(block_bbox[2], prev_bbox[2]):
+ return "right"
+ else:
+ return "center"
+
+ def overlapwith_other_box(block_idx, prev_idx, blocks):
+ prev_bbox = blocks[prev_idx]["box"]
+ block_bbox = blocks[block_idx]["box"]
+ x1 = min(prev_bbox[0], block_bbox[0])
+ y1 = min(prev_bbox[1], block_bbox[1])
+ x2 = max(prev_bbox[2], block_bbox[2])
+ y2 = max(prev_bbox[3], block_bbox[3])
+ min_box = [x1, y1, x2, y2]
+ for idx, other_block in enumerate(blocks):
+ if idx in [block_idx, prev_idx]:
+ continue
+ other_bbox = other_block["box"]
+ if calculate_overlap_ratio(min_box, other_bbox) > 0:
+ return True
+ return False
+
+ for i, (idx, block) in enumerate(blocks_to_merge):
+ if not current_group:
+ current_group = [block]
+ current_indices = [idx]
+ current_aligns = []
+ continue
+
+ prev_idx, prev_block = blocks_to_merge[i - 1]
+ prev_bbox = prev_block["box"]
+ prev_label = prev_block["label"]
+ block_bbox = block["box"]
+ block_label = block["label"]
+
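+ # is_cross joins a text block that continues into the next column; is_updown_align joins
+ # vertically adjacent text blocks that share exactly one aligned edge.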
+ iou_h = calculate_projection_overlap_ratio(block_bbox, prev_bbox, "horizontal")
+ is_cross = (
+ iou_h == 0
+ and block_label == "text"
+ and block_label == prev_label
+ and block_bbox[0] > prev_bbox[2]
+ and block_bbox[1] < prev_bbox[3]
+ and block_bbox[0] - prev_bbox[2]
+ < max(prev_bbox[2] - prev_bbox[0], block_bbox[2] - block_bbox[0]) * 0.3
+ )
+ is_updown_align = (
+ iou_h > 0
+ and block_label in ["text"]
+ and block_label == prev_label
+ and block_bbox[3] >= prev_bbox[1]
+ and abs(block_bbox[1] - prev_bbox[3])
+ < max(prev_bbox[3] - prev_bbox[1], block_bbox[3] - block_bbox[1]) * 0.5
+ and (
+ is_aligned(block_bbox[0], prev_bbox[0])
+ ^ is_aligned(block_bbox[2], prev_bbox[2])
+ )
+ and overlapwith_other_box(idx, prev_idx, blocks)
+ )
+ if is_cross:
+ align_mode = "center"
+ elif is_updown_align:
+ align_mode = get_alignment(block_bbox, prev_bbox)
+ else:
+ align_mode = None
+
+ if is_cross or is_updown_align:
+ current_group.append(block)
+ current_indices.append(idx)
+ current_aligns.append(align_mode)
+ else:
+ merged_groups.append((current_indices, current_group, current_aligns))
+ current_group = [block]
+ current_indices = [idx]
+ current_aligns = []
+ if current_group:
+ merged_groups.append((current_indices, current_group, current_aligns))
+
+ group_ranges = []
+ for group_indices, group, aligns in merged_groups:
+ start, end = min(group_indices), max(group_indices)
+ group_ranges.append((start, end, group_indices, aligns))
+
+ result_blocks = []
+ used_indices = set()
+ idx = 0
+ while idx < len(blocks):
+ group_found = False
+ for (start, end, group_indices, aligns), (g_indices, g_blocks, g_aligns) in zip(
+ group_ranges, merged_groups
+ ):
+ if idx == start and all(i not in used_indices for i in group_indices):
+ group_found = True
+ imgs = [blocks[i]["img"] for i in group_indices]
+ merge_aligns = aligns if aligns else []
+ w, h = calc_merged_wh(imgs)
+ aspect_ratio = h / w if w != 0 else float("inf")
+ if aspect_ratio >= 3:
+ for j, block_idx in enumerate(group_indices):
+ block = blocks[block_idx].copy()
+ block["img"] = blocks[block_idx]["img"]
+ block["merge_aligns"] = None
+ result_blocks.append(block)
+ used_indices.add(block_idx)
+ else:
+ merged_img = merge_images(imgs, merge_aligns)
+ for j, block_idx in enumerate(group_indices):
+ block = blocks[block_idx].copy()
+ block["img"] = merged_img if j == 0 else None
+ block["merge_aligns"] = merge_aligns if j == 0 else None
+ result_blocks.append(block)
+ used_indices.add(block_idx)
+ insert_list = []
+ for n_idx in range(start + 1, end):
+ if n_idx in non_merge_blocks:
+ insert_list.append(n_idx)
+ for n_idx in insert_list:
+ result_blocks.append(non_merge_blocks[n_idx])
+ used_indices.add(n_idx)
+ idx = end + 1
+ break
+ if group_found:
+ continue
+ if idx in non_merge_blocks and idx not in used_indices:
+ result_blocks.append(non_merge_blocks[idx])
+ used_indices.add(idx)
+ idx += 1
+ return result_blocks
+
+
+def paint_token(image, box, token_str):
+ """
+ Fill a rectangular area in the image with a white background and write the given token string.
+
+ Args:
+ image (np.ndarray): Image to paint on.
+ box (tuple): (x1, y1, x2, y2) coordinates of rectangle.
+ token_str (str): Token string to write.
+
+ Returns:
+ np.ndarray: Modified image.
+ """
+ import cv2
+
+ def get_optimal_font_scale(text, fontFace, square_size, fill_ratio=0.9):
+ # search a font scale between 0.2 and 10, which suits square sizes
+ # roughly between 30 and 1000 pixels
+ left, right = 0.2, 10
+ optimal_scale = left
+ # search the optimal font scale
+ while right - left > 1e-2:
+ mid = (left + right) / 2
+ (w, h), _ = cv2.getTextSize(text, fontFace, mid, thickness=1)
+ if w < square_size * fill_ratio and h < square_size * fill_ratio:
+ optimal_scale = mid
+ left = mid
+ else:
+ right = mid
+ return optimal_scale, w, h
+
+ x1, y1, x2, y2 = [int(v) for v in box]
+ box_w = x2 - x1
+ box_h = y2 - y1
+
+ img = image.copy()
+ cv2.rectangle(img, (x1, y1), (x2, y2), color=(255, 255, 255), thickness=-1)
+
+ # automatically set scale and thickness according to length of the shortest side
+ font = cv2.FONT_HERSHEY_SIMPLEX
+ thickness_scale_ratio = 4
+ font_scale, text_w, text_h = get_optimal_font_scale(
+ token_str, font, min(box_w, box_h), fill_ratio=0.9
+ )
+ font_thickness = max(1, math.floor(font_scale * thickness_scale_ratio))
+
+ # calculate the coordinates that center the painted text within the box
+ text_x = x1 + (box_w - text_w) // 2
+ text_y = y1 + (box_h + text_h) // 2
+
+ cv2.putText(
+ img,
+ token_str,
+ (text_x, text_y),
+ font,
+ font_scale,
+ (0, 0, 0),
+ font_thickness,
+ lineType=cv2.LINE_AA,
+ )
+ return img
+
+
+def tokenize_figure_of_table(table_block_img, table_box, figures):
+ """
+ Replace figures in a table area with tokens, return new image and token map.
+
+ Args:
+ table_block_img (np.ndarray): Table image.
+ table_box (list): Table bounding box [x_min, y_min, x_max, y_max].
+ figures (List[Dict]): List of figure dicts (must contain 'coordinate', 'path').
+
+ Returns:
+ Tuple[np.ndarray, Dict[str, str], List[str]]:
+ - New table image,
+ - Token-to-img HTML map,
+ - List of figure paths dropped.
+ """
+
+ def gen_random_map(num):
+ exclude_digits = {"0", "1", "9"}
+ seq = []
+ i = 0
+ while len(seq) < num:
+ if not (set(str(i)) & exclude_digits):
+ seq.append(i)
+ i += 1
+ return seq
+
+ import random
+
+ random.seed(1024)
+ token_map = {}
+ table_x_min, table_y_min, table_x_max, table_y_max = table_box
+ drop_idxes = []
+ random_map = gen_random_map(len(figures))
+ random.shuffle(random_map)
+ for figure_id, figure in enumerate(figures):
+ figure_x_min, figure_y_min, figure_x_max, figure_y_max = figure["coordinate"]
+ if (
+ figure_x_min >= table_x_min
+ and figure_y_min >= table_y_min
+ and figure_x_max <= table_x_max
+ and figure_y_max <= table_y_max
+ ):
+ drop_idxes.append(figure_id)
+ # skip figures whose shortest side is less than 25 px; they are too small to be tokenized and recognized
+ if min(figure_x_max - figure_x_min, figure_y_max - figure_y_min) < 25:
+ continue
+ draw_box = [
+ figure_x_min - table_x_min,
+ figure_y_min - table_y_min,
+ figure_x_max - table_x_min,
+ figure_y_max - table_y_min,
+ ]
+ token_str = "[F" + str(random_map[figure_id]) + "]"
+ table_block_img = paint_token(table_block_img, draw_box, token_str)
+ token_map[token_str] = f'<img src="{figure["path"]}" />'
+ drop_figures = [f["path"] for i, f in enumerate(figures) if i in drop_idxes]
+ return table_block_img, token_map, drop_figures
+
+
+def untokenize_figure_of_table(table_res_str, figure_token_map):
+ """
+ Replace tokens in a string with their HTML image equivalents.
+
+ Args:
+ table_res_str (str): Table string with tokens.
+ figure_token_map (dict): Mapping from tokens to HTML img tags.
+
+ Returns:
+ str: Untokenized string.
+ """
+
+ def repl(match):
+ token_id = match.group(1)
+ token = f"[F{token_id}]"
+ return figure_token_map.get(token, match.group(0))
+
+ pattern = r"\[F(\d+)\]"
+ return re.sub(pattern, repl, table_res_str)
+
+
+class TableCell(BaseModel):
+ """
+ TableCell represents a single cell in a table.
+
+ Attributes:
+ row_span (int): Number of rows spanned.
+ col_span (int): Number of columns spanned.
+ start_row_offset_idx (int): Start row index.
+ end_row_offset_idx (int): End row index (exclusive).
+ start_col_offset_idx (int): Start column index.
+ end_col_offset_idx (int): End column index (exclusive).
+ text (str): Cell text content.
+ column_header (bool): Whether this cell is a column header.
+ row_header (bool): Whether this cell is a row header.
+ row_section (bool): Whether this cell is a row section.
+ """
+
+ row_span: int = 1
+ col_span: int = 1
+ start_row_offset_idx: int
+ end_row_offset_idx: int
+ start_col_offset_idx: int
+ end_col_offset_idx: int
+ text: str
+ column_header: bool = False
+ row_header: bool = False
+ row_section: bool = False
+
+ @model_validator(mode="before")
+ @classmethod
+ def from_dict_format(cls, data: Any) -> Any:
+ """
+ Create TableCell from dict, extracting 'text' property correctly.
+
+ Args:
+ data (Any): Input data.
+
+ Returns:
+ Any: TableCell-compatible dict.
+ """
+ if isinstance(data, Dict):
+ if "text" in data:
+ return data
+ text = data["bbox"].get("token", "")
+ if not len(text):
+ text_cells = data.pop("text_cell_bboxes", None)
+ if text_cells:
+ for el in text_cells:
+ text += el["token"] + " "
+ text = text.strip()
+ data["text"] = text
+ return data
+
+
+class TableData(BaseModel):
+ """
+ TableData holds a table's cells, row and column counts, and provides a grid property.
+
+ Attributes:
+ table_cells (List[TableCell]): List of table cells.
+ num_rows (int): Number of rows.
+ num_cols (int): Number of columns.
+ """
+
+ table_cells: List[TableCell] = []
+ num_rows: int = 0
+ num_cols: int = 0
+
+ @computed_field
+ @property
+ def grid(self) -> List[List[TableCell]]:
+ """
+ Returns a 2D grid of TableCell objects for the table.
+
+ Returns:
+ List[List[TableCell]]: Table as 2D grid.
+ """
+ table_data = [
+ [
+ TableCell(
+ text="",
+ start_row_offset_idx=i,
+ end_row_offset_idx=i + 1,
+ start_col_offset_idx=j,
+ end_col_offset_idx=j + 1,
+ )
+ for j in range(self.num_cols)
+ ]
+ for i in range(self.num_rows)
+ ]
+ for cell in self.table_cells:
+ for i in range(
+ min(cell.start_row_offset_idx, self.num_rows),
+ min(cell.end_row_offset_idx, self.num_rows),
+ ):
+ for j in range(
+ min(cell.start_col_offset_idx, self.num_cols),
+ min(cell.end_col_offset_idx, self.num_cols),
+ ):
+ table_data[i][j] = cell
+ return table_data
+
+
+# OTSL tag constants
+OTSL_NL = "<nl>"
+OTSL_FCEL = "<fcel>"
+OTSL_ECEL = "<ecel>"
+OTSL_LCEL = "<lcel>"
+OTSL_UCEL = "<ucel>"
+OTSL_XCEL = "<xcel>"
+
+NON_CAPTURING_TAG_GROUP = "(?:<fcel>|<ecel>|<lcel>|<ucel>|<xcel>|<nl>)"
+OTSL_FIND_PATTERN = re.compile(
+ f"{NON_CAPTURING_TAG_GROUP}.*?(?={NON_CAPTURING_TAG_GROUP}|$)", flags=re.DOTALL
+)
+
+
+def otsl_extract_tokens_and_text(s: str):
+ """
+ Extract OTSL tags and text parts from the input string.
+
+ Args:
+ s (str): OTSL string.
+
+ Returns:
+ Tuple[List[str], List[str]]: (tokens, text_parts)
+ """
+ pattern = (
+ r"("
+ + r"|".join([OTSL_NL, OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL])
+ + r")"
+ )
+ tokens = re.findall(pattern, s)
+ text_parts = re.split(pattern, s)
+ text_parts = [token for token in text_parts if token.strip()]
+ return tokens, text_parts
+
+
+def otsl_parse_texts(texts, tokens):
+ """
+ Parse OTSL text and tags into TableCell objects and tag structure.
+
+ Args:
+ texts (List[str]): List of tokens and text.
+ tokens (List[str]): List of OTSL tags.
+
+ Returns:
+ Tuple[List[TableCell], List[List[str]]]: (table_cells, split_row_tokens)
+ """
+ split_word = OTSL_NL
+ split_row_tokens = [
+ list(y)
+ for x, y in itertools.groupby(tokens, lambda z: z == split_word)
+ if not x
+ ]
+ table_cells = []
+ r_idx = 0
+ c_idx = 0
+
+ # Ensure matrix completeness
+ if split_row_tokens:
+ max_cols = max(len(row) for row in split_row_tokens)
+ for row in split_row_tokens:
+ while len(row) < max_cols:
+ row.append(OTSL_ECEL)
+ new_texts = []
+ text_idx = 0
+ for row in split_row_tokens:
+ for token in row:
+ new_texts.append(token)
+ if text_idx < len(texts) and texts[text_idx] == token:
+ text_idx += 1
+ if text_idx < len(texts) and texts[text_idx] not in [
+ OTSL_NL,
+ OTSL_FCEL,
+ OTSL_ECEL,
+ OTSL_LCEL,
+ OTSL_UCEL,
+ OTSL_XCEL,
+ ]:
+ new_texts.append(texts[text_idx])
+ text_idx += 1
+ new_texts.append(OTSL_NL)
+ if text_idx < len(texts) and texts[text_idx] == OTSL_NL:
+ text_idx += 1
+ texts = new_texts
+
+ def count_right(tokens, c_idx, r_idx, which_tokens):
+ span = 0
+ c_idx_iter = c_idx
+ while tokens[r_idx][c_idx_iter] in which_tokens:
+ c_idx_iter += 1
+ span += 1
+ if c_idx_iter >= len(tokens[r_idx]):
+ return span
+ return span
+
+ def count_down(tokens, c_idx, r_idx, which_tokens):
+ span = 0
+ r_idx_iter = r_idx
+ while tokens[r_idx_iter][c_idx] in which_tokens:
+ r_idx_iter += 1
+ span += 1
+ if r_idx_iter >= len(tokens):
+ return span
+ return span
+
+ for i, text in enumerate(texts):
+ cell_text = ""
+ if text in [OTSL_FCEL, OTSL_ECEL]:
+ row_span = 1
+ col_span = 1
+ right_offset = 1
+ if text != OTSL_ECEL:
+ cell_text = texts[i + 1]
+ right_offset = 2
+
+ next_right_cell = (
+ texts[i + right_offset] if i + right_offset < len(texts) else ""
+ )
+ next_bottom_cell = ""
+ if r_idx + 1 < len(split_row_tokens):
+ if c_idx < len(split_row_tokens[r_idx + 1]):
+ next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
+
+ if next_right_cell in [OTSL_LCEL, OTSL_XCEL]:
+ col_span += count_right(
+ split_row_tokens, c_idx + 1, r_idx, [OTSL_LCEL, OTSL_XCEL]
+ )
+ if next_bottom_cell in [OTSL_UCEL, OTSL_XCEL]:
+ row_span += count_down(
+ split_row_tokens, c_idx, r_idx + 1, [OTSL_UCEL, OTSL_XCEL]
+ )
+
+ table_cells.append(
+ TableCell(
+ text=cell_text.strip(),
+ row_span=row_span,
+ col_span=col_span,
+ start_row_offset_idx=r_idx,
+ end_row_offset_idx=r_idx + row_span,
+ start_col_offset_idx=c_idx,
+ end_col_offset_idx=c_idx + col_span,
+ )
+ )
+ if text in [OTSL_FCEL, OTSL_ECEL, OTSL_LCEL, OTSL_UCEL, OTSL_XCEL]:
+ c_idx += 1
+ if text == OTSL_NL:
+ r_idx += 1
+ c_idx = 0
+ return table_cells, split_row_tokens
+
+
+def export_to_html(table_data: TableData):
+ """
+ Export TableData to HTML table.
+
+ Args:
+ table_data (TableData): TableData object.
+
+ Returns:
+ str: HTML string.
+ """
+ nrows = table_data.num_rows
+ ncols = table_data.num_cols
+ if len(table_data.table_cells) == 0:
+ return ""
+ body = ""
+ grid = table_data.grid
+ for i in range(nrows):
+ body += ""
+ for j in range(ncols):
+ cell: TableCell = grid[i][j]
+ rowspan, rowstart = (cell.row_span, cell.start_row_offset_idx)
+ colspan, colstart = (cell.col_span, cell.start_col_offset_idx)
+ if rowstart != i or colstart != j:
+ continue
+ content = html.escape(cell.text.strip())
+ celltag = "th" if cell.column_header else "td"
+ opening_tag = f"{celltag}"
+ if rowspan > 1:
+ opening_tag += f' rowspan="{rowspan}"'
+ if colspan > 1:
+ opening_tag += f' colspan="{colspan}"'
+ body += f"<{opening_tag}>{content}{celltag}>"
+ body += " "
+ body = f""
+ return body
+
+
+def otsl_pad_to_sqr_v2(otsl_str: str) -> str:
+ """
+ Pad OTSL string to a square (rectangular) format, ensuring each row has equal number of cells.
+
+ Args:
+ otsl_str (str): OTSL string.
+
+ Returns:
+ str: Padded OTSL string.
+ """
+ assert isinstance(otsl_str, str)
+ otsl_str = otsl_str.strip()
+ if OTSL_NL not in otsl_str:
+ return otsl_str + OTSL_NL
+ lines = otsl_str.split(OTSL_NL)
+ row_data = []
+ for line in lines:
+ if not line:
+ continue
+ raw_cells = OTSL_FIND_PATTERN.findall(line)
+ if not raw_cells:
+ continue
+ total_len = len(raw_cells)
+ min_len = 0
+ for i, cell_str in enumerate(raw_cells):
+ if cell_str.startswith(OTSL_FCEL):
+ min_len = i + 1
+ row_data.append(
+ {"raw_cells": raw_cells, "total_len": total_len, "min_len": min_len}
+ )
+ if not row_data:
+ return OTSL_NL
+ global_min_width = max(row["min_len"] for row in row_data) if row_data else 0
+ max_total_len = max(row["total_len"] for row in row_data) if row_data else 0
+ search_start = global_min_width
+ search_end = max(global_min_width, max_total_len)
+ min_total_cost = float("inf")
+ optimal_width = search_end
+
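+ # Choose the table width that minimizes the total number of cells padded or truncated across all rows.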
+ for width in range(search_start, search_end + 1):
+ current_total_cost = sum(abs(row["total_len"] - width) for row in row_data)
+ if current_total_cost < min_total_cost:
+ min_total_cost = current_total_cost
+ optimal_width = width
+
+ repaired_lines = []
+ for row in row_data:
+ cells = row["raw_cells"]
+ current_len = len(cells)
+ if current_len > optimal_width:
+ new_cells = cells[:optimal_width]
+ else:
+ padding = [OTSL_ECEL] * (optimal_width - current_len)
+ new_cells = cells + padding
+ repaired_lines.append("".join(new_cells))
+ return OTSL_NL.join(repaired_lines) + OTSL_NL
+
+
+def convert_otsl_to_html(otsl_content: str):
+ """
+ Convert an OTSL-v1.0 string to HTML. Only six tags are allowed: <fcel>, <ecel>, <lcel>, <ucel>, <xcel>, <nl>.
+
+ Args:
+ otsl_content (str): OTSL string.
+
+ Returns:
+ str: HTML table.
+ """
+ otsl_content = otsl_pad_to_sqr_v2(otsl_content)
+ tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
+ table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
+ table_data = TableData(
+ num_rows=len(split_row_tokens),
+ num_cols=(max(len(row) for row in split_row_tokens) if split_row_tokens else 0),
+ table_cells=table_cells,
+ )
+ return export_to_html(table_data)
+
+
+def find_shortest_repeating_substring(s: str) -> str | None:
+ """
+ Find the shortest substring that repeats to form the entire string.
+
+ Args:
+ s (str): Input string.
+
+ Returns:
+ str or None: Shortest repeating substring, or None if not found.
+ """
+ n = len(s)
+ for i in range(1, n // 2 + 1):
+ if n % i == 0:
+ substring = s[:i]
+ if substring * (n // i) == s:
+ return substring
+ return None
+
+
+def find_repeating_suffix(
+ s: str, min_len: int = 8, min_repeats: int = 5
+) -> Tuple[str, str, int] | None:
+ """
+ Detect if string ends with a repeating phrase.
+
+ Args:
+ s (str): Input string.
+ min_len (int): Minimum length of unit.
+ min_repeats (int): Minimum repeat count.
+
+ Returns:
+ Tuple[str, str, int] or None: (prefix, unit, count) if found, else None.
+ """
+ for i in range(len(s) // (min_repeats), min_len - 1, -1):
+ unit = s[-i:]
+ if s.endswith(unit * min_repeats):
+ count = 0
+ temp_s = s
+ while temp_s.endswith(unit):
+ temp_s = temp_s[:-i]
+ count += 1
+ start_index = len(s) - (count * i)
+ return s[:start_index], unit, count
+ return None
+
+
+def truncate_repetitive_content(
+ content: str, line_threshold: int = 10, char_threshold: int = 10, min_len: int = 10
+) -> str:
+ """
+ Detect and truncate character-level, phrase-level, or line-level repetition in content.
+
+ Args:
+ content (str): Input text.
+ line_threshold (int): Min lines for line-level truncation.
+ char_threshold (int): Min repeats for char-level truncation.
+ min_len (int): Min length for char-level check.
+
+ Returns:
+ str: The truncated content, or the original content if no repetition is detected.
+ """
+ stripped_content = content.strip()
+ if not stripped_content:
+ return content
+
+ # Priority 1: Phrase-level suffix repetition in long single lines.
+ if "\n" not in stripped_content and len(stripped_content) > 100:
+ suffix_match = find_repeating_suffix(stripped_content, min_len=8, min_repeats=5)
+ if suffix_match:
+ prefix, repeating_unit, count = suffix_match
+ if len(repeating_unit) * count > len(stripped_content) * 0.5:
+ return prefix
+
+ # Priority 2: Full-string character-level repetition (e.g., 'ababab')
+ if "\n" not in stripped_content and len(stripped_content) > min_len:
+ repeating_unit = find_shortest_repeating_substring(stripped_content)
+ if repeating_unit:
+ count = len(stripped_content) // len(repeating_unit)
+ if count >= char_threshold:
+ return repeating_unit
+
+ # Priority 3: Line-level repetition (e.g., same line repeated many times)
+ lines = [line.strip() for line in content.split("\n") if line.strip()]
+ if not lines:
+ return content
+ total_lines = len(lines)
+ if total_lines < line_threshold:
+ return content
+ line_counts = Counter(lines)
+ most_common_line, count = line_counts.most_common(1)[0]
+ if count >= line_threshold and (count / total_lines) >= 0.8:
+ return most_common_line
+
+ return content
diff --git a/paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py b/paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py
index 420a435cab..733ea7ec54 100644
--- a/paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py
+++ b/paddlex/inference/pipelines/pp_chatocr/pipeline_v4.py
@@ -96,6 +96,10 @@ def __init__(
self.table_structure_len_max = 500
+ def close(self):
+ if self.layout_parsing_pipeline is not None:
+ self.layout_parsing_pipeline.close()
+
def inintial_visual_predictor(self, config: dict) -> None:
"""
Initializes the visual predictor with the given configuration.
diff --git a/paddlex/inference/pipelines/pp_doctranslation/pipeline.py b/paddlex/inference/pipelines/pp_doctranslation/pipeline.py
index d95b100821..0dc493370d 100644
--- a/paddlex/inference/pipelines/pp_doctranslation/pipeline.py
+++ b/paddlex/inference/pipelines/pp_doctranslation/pipeline.py
@@ -83,6 +83,10 @@ def __init__(
self.markdown_batch_sampler = MarkDownBatchSampler()
+ def close(self):
+ if self.layout_parsing_pipeline is not None:
+ self.layout_parsing_pipeline.close()
+
def inintial_visual_predictor(self, config: dict) -> None:
"""
Initializes the visual predictor with the given configuration.
diff --git a/paddlex/inference/pipelines/pp_doctranslation/result.py b/paddlex/inference/pipelines/pp_doctranslation/result.py
index 663d430566..7be5e42bd9 100644
--- a/paddlex/inference/pipelines/pp_doctranslation/result.py
+++ b/paddlex/inference/pipelines/pp_doctranslation/result.py
@@ -35,5 +35,5 @@ def _get_input_fn(self):
fn = f"{stem}_{language}{suffix}"
return fn
- def _to_markdown(self, pretty=True) -> dict:
+ def _to_markdown(self, pretty=True, show_formula_number=False) -> dict:
return self
diff --git a/paddlex/inference/pipelines/seal_recognition/pipeline.py b/paddlex/inference/pipelines/seal_recognition/pipeline.py
index 73731ab273..aa26d0e2ad 100644
--- a/paddlex/inference/pipelines/seal_recognition/pipeline.py
+++ b/paddlex/inference/pipelines/seal_recognition/pipeline.py
@@ -259,7 +259,7 @@ def predict(
layout_det_results = []
for _ in doc_preprocessor_images:
try:
- layout_det_res = list(external_layout_det_results)[0]
+ layout_det_res = next(external_layout_det_results)
except StopIteration:
raise ValueError("No more layout det results")
layout_det_results.append(layout_det_res)
diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py
index ceff555697..ef18f5b012 100644
--- a/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py
+++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/ocr.py
@@ -60,6 +60,7 @@ async def _infer(request: InferRequest) -> AIStudioResultResponse[InferResult]:
text_det_box_thresh=request.textDetBoxThresh,
text_det_unclip_ratio=request.textDetUnclipRatio,
text_rec_score_thresh=request.textRecScoreThresh,
+ return_word_box=request.returnWordBox,
)
ocr_results: List[Dict[str, Any]] = []
diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py
new file mode 100644
index 0000000000..05b4f0f5ee
--- /dev/null
+++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/paddleocr_vl.py
@@ -0,0 +1,131 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Dict, List
+
+from .....utils.deps import function_requires_deps, is_dep_available
+from ...infra import utils as serving_utils
+from ...infra.config import AppConfig
+from ...infra.models import AIStudioResultResponse
+from ...schemas.paddleocr_vl import INFER_ENDPOINT, InferRequest, InferResult
+from .._app import create_app, primary_operation
+from ._common import common
+from ._common import ocr as ocr_common
+
+if is_dep_available("fastapi"):
+ from fastapi import FastAPI
+
+
+@function_requires_deps("fastapi")
+def create_pipeline_app(pipeline: Any, app_config: AppConfig) -> "FastAPI":
+ app, ctx = create_app(
+ pipeline=pipeline, app_config=app_config, app_aiohttp_session=True
+ )
+
+ ocr_common.update_app_context(ctx)
+
+ @primary_operation(
+ app,
+ INFER_ENDPOINT,
+ "infer",
+ )
+ async def _infer(
+ request: InferRequest,
+ ) -> AIStudioResultResponse[InferResult]:
+ pipeline = ctx.pipeline
+
+ log_id = serving_utils.generate_log_id()
+ visualize_enabled = (
+ request.visualize if request.visualize is not None else ctx.config.visualize
+ )
+ images, data_info = await ocr_common.get_images(request, ctx)
+
+ result = await pipeline.infer(
+ images,
+ use_doc_orientation_classify=request.useDocOrientationClassify,
+ use_doc_unwarping=request.useDocUnwarping,
+ use_layout_detection=request.useLayoutDetection,
+ use_chart_recognition=request.useChartRecognition,
+ layout_threshold=request.layoutThreshold,
+ layout_nms=request.layoutNms,
+ layout_unclip_ratio=request.layoutUnclipRatio,
+ layout_merge_bboxes_mode=request.layoutMergeBboxesMode,
+ prompt_label=request.promptLabel,
+ format_block_content=request.formatBlockContent,
+ repetition_penalty=request.repetitionPenalty,
+ temperature=request.temperature,
+ top_p=request.topP,
+ min_pixels=request.minPixels,
+ max_pixels=request.maxPixels,
+ )
+
+ layout_parsing_results: List[Dict[str, Any]] = []
+ for i, (img, item) in enumerate(zip(images, result)):
+ pruned_res = common.prune_result(item.json["res"])
+ # XXX: relies on the result object's private `_to_markdown` method
+ md_data = item._to_markdown(
+ pretty=request.prettifyMarkdown,
+ show_formula_number=request.showFormulaNumber,
+ )
+ md_text = md_data["markdown_texts"]
+ md_imgs = await serving_utils.call_async(
+ common.postprocess_images,
+ md_data["markdown_images"],
+ log_id,
+ filename_template=f"markdown_{i}/{{key}}",
+ file_storage=ctx.extra["file_storage"],
+ return_urls=ctx.extra["return_img_urls"],
+ max_img_size=ctx.extra["max_output_img_size"],
+ )
+ if visualize_enabled:
+ imgs = {
+ "input_img": img,
+ **item.img,
+ }
+ imgs = await serving_utils.call_async(
+ common.postprocess_images,
+ imgs,
+ log_id,
+ filename_template=f"{{key}}_{i}.jpg",
+ file_storage=ctx.extra["file_storage"],
+ return_urls=ctx.extra["return_img_urls"],
+ max_img_size=ctx.extra["max_output_img_size"],
+ )
+ else:
+ imgs = {}
+ layout_parsing_results.append(
+ dict(
+ prunedResult=pruned_res,
+ markdown=dict(
+ text=md_text,
+ images=md_imgs,
+ ),
+ outputImages=(
+ {k: v for k, v in imgs.items() if k != "input_img"}
+ if imgs
+ else None
+ ),
+ inputImage=imgs.get("input_img"),
+ )
+ )
+
+ return AIStudioResultResponse[InferResult](
+ logId=log_id,
+ result=InferResult(
+ layoutParsingResults=layout_parsing_results,
+ dataInfo=data_info,
+ ),
+ )
+
+ return app
diff --git a/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py b/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py
index 96bf085a05..3b3a9a660f 100644
--- a/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py
+++ b/paddlex/inference/serving/basic_serving/_pipeline_apps/pp_structurev3.py
@@ -61,6 +61,7 @@ async def _infer(
use_formula_recognition=request.useFormulaRecognition,
use_chart_recognition=request.useChartRecognition,
use_region_detection=request.useRegionDetection,
+ format_block_content=request.formatBlockContent,
layout_threshold=request.layoutThreshold,
layout_nms=request.layoutNms,
layout_unclip_ratio=request.layoutUnclipRatio,
diff --git a/paddlex/inference/serving/schemas/ocr.py b/paddlex/inference/serving/schemas/ocr.py
index 57d02a6675..3d0e34c108 100644
--- a/paddlex/inference/serving/schemas/ocr.py
+++ b/paddlex/inference/serving/schemas/ocr.py
@@ -41,6 +41,7 @@ class InferRequest(ocr.BaseInferRequest):
textDetBoxThresh: Optional[float] = None
textDetUnclipRatio: Optional[float] = None
textRecScoreThresh: Optional[float] = None
+ returnWordBox: Optional[bool] = None
visualize: Optional[bool] = None
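The new `returnWordBox` field is forwarded to the pipeline as `return_word_box` (see the `_pipeline_apps/ocr.py` change above). A hedged client sketch, assuming the OCR pipeline is served locally on port 8080 under the `/ocr` endpoint and that the image is passed base64-encoded in the `file` field; the host, port, and image path are placeholders:

```python
import base64
import requests

with open("sample.png", "rb") as f:  # placeholder input image
    payload = {
        "file": base64.b64encode(f.read()).decode("ascii"),
        "returnWordBox": True,  # newly added request field
    }

resp = requests.post("http://localhost:8080/ocr", json=payload)
resp.raise_for_status()
print(resp.json()["result"])
```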
diff --git a/paddlex/inference/serving/schemas/paddleocr_vl.py b/paddlex/inference/serving/schemas/paddleocr_vl.py
new file mode 100644
index 0000000000..0839fd3094
--- /dev/null
+++ b/paddlex/inference/serving/schemas/paddleocr_vl.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, Final, List, Optional, Tuple, Union
+
+from pydantic import BaseModel
+
+from ..infra.models import DataInfo, PrimaryOperations
+from .shared import ocr
+
+__all__ = [
+ "INFER_ENDPOINT",
+ "InferRequest",
+ "LayoutParsingResult",
+ "InferResult",
+ "PRIMARY_OPERATIONS",
+]
+
+INFER_ENDPOINT: Final[str] = "/layout-parsing"
+
+
+class InferRequest(ocr.BaseInferRequest):
+ useDocOrientationClassify: Optional[bool] = None
+ useDocUnwarping: Optional[bool] = None
+ useLayoutDetection: Optional[bool] = None
+ useChartRecognition: Optional[bool] = None
+ layoutThreshold: Optional[Union[float, dict]] = None
+ layoutNms: Optional[bool] = None
+ layoutUnclipRatio: Optional[Union[float, Tuple[float, float], dict]] = None
+ layoutMergeBboxesMode: Optional[Union[str, dict]] = None
+ promptLabel: Optional[str] = None
+ formatBlockContent: Optional[bool] = None
+ repetitionPenalty: Optional[float] = None
+ temperature: Optional[float] = None
+ topP: Optional[float] = None
+ minPixels: Optional[int] = None
+ maxPixels: Optional[int] = None
+ prettifyMarkdown: bool = True
+ showFormulaNumber: bool = False
+ visualize: Optional[bool] = None
+
+
+class MarkdownData(BaseModel):
+ text: str
+ images: Optional[Dict[str, str]] = None
+
+
+class LayoutParsingResult(BaseModel):
+ prunedResult: dict
+ markdown: MarkdownData
+ outputImages: Optional[Dict[str, str]] = None
+ inputImage: Optional[str] = None
+
+
+class InferResult(BaseModel):
+ layoutParsingResults: List[LayoutParsingResult]
+ dataInfo: DataInfo
+
+
+PRIMARY_OPERATIONS: Final[PrimaryOperations] = {
+ "infer": (INFER_ENDPOINT, InferRequest, InferResult),
+}
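The schema closely follows the existing layout-parsing contract: requests go to the `/layout-parsing` endpoint defined by `INFER_ENDPOINT`, and each entry in `layoutParsingResults` carries a `prunedResult`, a `markdown` object, optional `outputImages`, and the `inputImage`. A hedged request sketch against a locally served PaddleOCR-VL pipeline (host, port, and file name are placeholders; the image is assumed to be passed base64-encoded in the `file` field):

```python
import base64
import requests

with open("doc_page.png", "rb") as f:  # placeholder input document image
    payload = {
        "file": base64.b64encode(f.read()).decode("ascii"),
        "useLayoutDetection": True,
        "prettifyMarkdown": True,    # defaults to True in the schema
        "showFormulaNumber": False,  # new knob forwarded to _to_markdown
    }

resp = requests.post("http://localhost:8080/layout-parsing", json=payload)
resp.raise_for_status()
for page in resp.json()["result"]["layoutParsingResults"]:
    print(page["markdown"]["text"][:200])
```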
diff --git a/paddlex/inference/serving/schemas/pp_structurev3.py b/paddlex/inference/serving/schemas/pp_structurev3.py
index f17ce035e8..9bf1134b49 100644
--- a/paddlex/inference/serving/schemas/pp_structurev3.py
+++ b/paddlex/inference/serving/schemas/pp_structurev3.py
@@ -39,6 +39,7 @@ class InferRequest(ocr.BaseInferRequest):
useFormulaRecognition: Optional[bool] = None
useChartRecognition: Optional[bool] = None
useRegionDetection: Optional[bool] = None
+ formatBlockContent: Optional[bool] = None
layoutThreshold: Optional[Union[float, dict]] = None
layoutNms: Optional[bool] = None
layoutUnclipRatio: Optional[Union[float, Tuple[float, float], dict]] = None
diff --git a/paddlex/inference/utils/benchmark.py b/paddlex/inference/utils/benchmark.py
index 2d64599206..55f1eefc37 100644
--- a/paddlex/inference/utils/benchmark.py
+++ b/paddlex/inference/utils/benchmark.py
@@ -567,13 +567,13 @@ def print_detail_data(self):
detail_head = [
"Step",
"Operation",
- "Time",
+ "Time (ms)",
]
table = PrettyTable(detail_head)
table.add_rows(self._detail_list)
table_title = "Detail Data".center(len(str(table).split("\n")[0]), " ")
table.align["Operation"] = "l"
- table.align["Time"] = "l"
+ table.align["Time (ms)"] = "l"
logging.info(table_title)
logging.info(table)
@@ -582,13 +582,13 @@ def print_summary_data(self):
summary_head = [
"Level",
"Operation",
- "Time",
+ "Time (ms)",
]
table = PrettyTable(summary_head)
table.add_rows(self._summary_list)
table_title = "Summary Data".center(len(str(table).split("\n")[0]), " ")
table.align["Operation"] = "l"
- table.align["Time"] = "l"
+ table.align["Time (ms)"] = "l"
logging.info(table_title)
logging.info(table)
@@ -600,7 +600,7 @@ def save_pipeline_data(self, save_path):
detail_head = [
"Step",
"Operation",
- "Time",
+ "Time (ms)",
]
csv_data = [detail_head, *self._detail_list]
with open(Path(save_dir) / "detail.csv", "w", newline="") as file:
@@ -610,7 +610,7 @@ def save_pipeline_data(self, save_path):
summary_head = [
"Level",
"Operation",
- "Time",
+ "Time (ms)",
]
csv_data = [summary_head, *self._summary_list]
with open(Path(save_dir) / "summary.csv", "w", newline="") as file:
diff --git a/paddlex/inference/utils/hpi_model_info_collection.json b/paddlex/inference/utils/hpi_model_info_collection.json
index 6cca95ec48..78835aa809 100644
--- a/paddlex/inference/utils/hpi_model_info_collection.json
+++ b/paddlex/inference/utils/hpi_model_info_collection.json
@@ -1357,6 +1357,72 @@
"paddle_mkldnn",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"paddle_mkldnn",
"openvino",
@@ -1399,6 +1465,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle31": {
@@ -2769,6 +2838,72 @@
"paddle_mkldnn",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"paddle_mkldnn",
"openvino",
@@ -2814,6 +2949,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle311": {
@@ -4185,6 +4323,72 @@
"paddle_mkldnn",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "openvino",
+ "onnxruntime",
+ "paddle_mkldnn",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"paddle_mkldnn",
"openvino",
@@ -4230,6 +4434,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
}
},
@@ -5548,6 +5755,72 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"tensorrt",
"paddle"
@@ -5585,6 +5858,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle31": {
@@ -6915,6 +7191,72 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"tensorrt",
"onnxruntime",
@@ -6958,6 +7300,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle311": {
@@ -8287,6 +8632,72 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "paddle_tensorrt_fp16",
+ "tensorrt",
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"tensorrt",
"onnxruntime",
@@ -8330,6 +8741,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
}
},
@@ -9299,6 +9713,50 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"paddle"
],
@@ -9334,6 +9792,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle31": {
@@ -10313,6 +10774,50 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "korean_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "eslav_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "th_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "el_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"onnxruntime",
"paddle"
@@ -10351,6 +10856,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
},
"paddle311": {
@@ -11330,6 +11838,34 @@
"onnxruntime",
"paddle"
],
+ "en_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "latin_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "cyrillic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "arabic_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "devanagari_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "ta_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
+ "te_PP-OCRv5_mobile_rec": [
+ "onnxruntime",
+ "paddle"
+ ],
"PP-OCRv5_server_det": [
"onnxruntime",
"paddle"
@@ -11368,6 +11904,9 @@
],
"YOWO": [
"paddle"
+ ],
+ "PP-DocLayoutV2": [
+ "paddle"
]
}
}
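The entries above extend `hpi_model_info_collection.json`, which maps model names to the candidate inference backends that high-performance inference may try, grouped into Paddle version buckets such as `paddle31` and `paddle311`. A small hedged sketch that looks up the backend lists for one of the newly added models without assuming the exact nesting depth:

```python
import json
from pathlib import Path

# Path as it appears in the diff; adjust to your PaddleX installation if needed.
info_path = Path("paddlex/inference/utils/hpi_model_info_collection.json")
info = json.loads(info_path.read_text())


def find_backends(node, model_name):
    """Recursively collect every backend list registered under `model_name`."""
    found = []
    if isinstance(node, dict):
        for key, value in node.items():
            if key == model_name and isinstance(value, list):
                found.append(value)
            else:
                found.extend(find_backends(value, model_name))
    return found


print(find_backends(info, "arabic_PP-OCRv5_mobile_rec"))
```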
diff --git a/paddlex/inference/utils/official_models.py b/paddlex/inference/utils/official_models.py
index 505ebe076e..00de4707c5 100644
--- a/paddlex/inference/utils/official_models.py
+++ b/paddlex/inference/utils/official_models.py
@@ -45,6 +45,7 @@
"ResNet152",
"ResNet152_vd",
"ResNet200_vd",
+ "PaddleOCR-VL-0.9B",
"PP-LCNet_x0_25",
"PP-LCNet_x0_25_textline_ori",
"PP-LCNet_x0_35",
@@ -294,6 +295,7 @@
"GroundingDINO-T",
"SAM-H_box",
"SAM-H_point",
+ "PP-DocLayoutV2",
"PP-DocLayout-L",
"PP-DocLayout-M",
"PP-DocLayout-S",
@@ -315,6 +317,11 @@
"th_PP-OCRv5_mobile_rec",
"el_PP-OCRv5_mobile_rec",
"en_PP-OCRv5_mobile_rec",
+ "arabic_PP-OCRv5_mobile_rec",
+ "te_PP-OCRv5_mobile_rec",
+ "ta_PP-OCRv5_mobile_rec",
+ "devanagari_PP-OCRv5_mobile_rec",
+ "cyrillic_PP-OCRv5_mobile_rec",
]
@@ -338,6 +345,7 @@
"en_PP-OCRv5_mobile_rec",
"th_PP-OCRv5_mobile_rec",
"el_PP-OCRv5_mobile_rec",
+ "PaddleOCR-VL-0.9B",
"PicoDet_layout_1x",
"PicoDet_layout_1x_table",
"PicoDet-L_layout_17cls",
@@ -349,10 +357,12 @@
"PP-DocBee-2B",
"PP-DocBee-7B",
"PP-DocBlockLayout",
+ "PP-DocLayoutV2",
"PP-DocLayout-L",
"PP-DocLayout-M",
"PP-DocLayout_plus-L",
"PP-DocLayout-S",
"PP-FormulaNet-L",
"PP-FormulaNet_plus-L",
"PP-FormulaNet_plus-M",
@@ -388,6 +398,11 @@
"te_PP-OCRv3_mobile_rec",
"UniMERNet",
"UVDoc",
+ "arabic_PP-OCRv5_mobile_rec",
+ "te_PP-OCRv5_mobile_rec",
+ "ta_PP-OCRv5_mobile_rec",
+ "devanagari_PP-OCRv5_mobile_rec",
+ "cyrillic_PP-OCRv5_mobile_rec",
]
@@ -404,8 +419,25 @@ def get_model(self, model_name):
assert (
model_name in self.model_list
), f"The model {model_name} is not supported on hosting {self.__class__.__name__}!"
+ if model_name == "PaddleOCR-VL-0.9B":
+ model_name = "PaddleOCR-VL"
+
model_dir = self._save_dir / f"{model_name}"
- self._download(model_name, model_dir)
+ if os.path.exists(model_dir):
+ logging.info(
+ f"Model files already exist. Using cached files. To redownload, please delete the directory manually: `{model_dir}`."
+ )
+ else:
+ logging.info(
+ f"Using the official model ({model_name}). The model files will be automatically downloaded and saved in `{model_dir}`."
+ )
+ self._download(model_name, model_dir)
+
+ if model_name == "PaddleOCR-VL":
+ vl_model_dir = model_dir / "PaddleOCR-VL-0.9B"
+ if vl_model_dir.exists() and vl_model_dir.is_dir():
+ return vl_model_dir
+
return model_dir
@abstractmethod
@@ -543,9 +575,6 @@ def _build_hosters(self):
return hosters
def _get_model_local_path(self, model_name):
- logging.info(
- f"Using official model ({model_name}), the model files will be automatically downloaded and saved in {self._save_dir}."
- )
if len(self._hosters) == 0:
msg = "No available model hosting platforms detected. Please check your network connection."
logging.error(msg)
@@ -556,17 +585,19 @@ def _download_from_hoster(self, hosters, model_name):
for idx, hoster in enumerate(hosters):
if model_name in hoster.model_list:
try:
- return hoster.get_model(model_name)
- except Exception as e:
- logging.warning(
- f"Encounter exception when download model from {hoster.alias}: \n{e}."
+ model_path = hoster.get_model(model_name)
+ logging.debug(
+ f"`{model_name}` model files have been downloaded from model source: `{hoster.alias}`!"
)
+ return model_path
+
+ except Exception as e:
if len(hosters) <= 1:
raise Exception(
- f"No model source is available! Please check network or use local model files!"
+ f"Encountered an exception when downloading the model from {hoster.alias}. No model source is available! Please check the network or use local model files!"
)
logging.warning(
- f"PaddleX would try to download from other model sources."
+ f"Encountered an exception when downloading the model from {hoster.alias}: \n{e}. Will try to download from another model source: `{hosters[idx + 1].alias}`."
)
return self._download_from_hoster(hosters[idx + 1 :], model_name)
diff --git a/paddlex/inference/utils/pp_option.py b/paddlex/inference/utils/pp_option.py
index 5dd08ec240..db99fbec03 100644
--- a/paddlex/inference/utils/pp_option.py
+++ b/paddlex/inference/utils/pp_option.py
@@ -18,7 +18,7 @@
from ...utils import logging
from ...utils.device import get_default_device, parse_device, set_env_for_device_type
-from ...utils.flags import ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT
+from ...utils.flags import DISABLE_DEVICE_FALLBACK, ENABLE_MKLDNN_BYDEFAULT, USE_PIR_TRT
from .misc import is_mkldnn_available
from .mkldnn_blocklist import MKLDNN_BLOCKLIST
from .new_ir_blocklist import NEWIR_BLOCKLIST
@@ -81,6 +81,23 @@ def setdefault_by_model_name(self, model_name):
for k, v in self._get_default_config(model_name).items():
self._cfg.setdefault(k, v)
+ if self.device_type == "gpu":
+ import paddle
+
+ if not (paddle.device.is_compiled_with_cuda() and paddle.device.cuda.device_count() > 0):
+ if DISABLE_DEVICE_FALLBACK:
+ raise RuntimeError(
+ "Device fallback is disabled and the specified device (GPU) is not available. "
+ "To fall back to CPU instead, unset the PADDLE_PDX_DISABLE_DEVICE_FALLBACK environment variable."
+ )
+ else:
+ logging.warning(
+ "The specified device (GPU) is not available! Switching to CPU instead."
+ )
+ self.device_type = "cpu"
+ self.run_mode = get_default_run_mode(model_name, "cpu")
+ self.device_id = None
+
# for trt
if self.run_mode in ("trt_int8", "trt_fp32", "trt_fp16"):
trt_cfg_setting = TRT_CFG_SETTING[model_name]
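With the fallback added above, requesting a GPU on a machine where CUDA is unavailable now degrades to CPU with a warning; exporting `PADDLE_PDX_DISABLE_DEVICE_FALLBACK` turns that into a hard error instead. A minimal sketch, assuming the flag is set before PaddleX is imported (the pipeline name and device string are only examples):

```python
import os

# Fail fast instead of silently falling back to CPU when no usable GPU is found.
os.environ["PADDLE_PDX_DISABLE_DEVICE_FALLBACK"] = "True"

from paddlex import create_pipeline  # imported after the flag is set

# Raises RuntimeError on a CUDA-less machine instead of switching to CPU.
pipeline = create_pipeline("OCR", device="gpu")
```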
diff --git a/paddlex/model.py b/paddlex/model.py
index 830cb6c20b..f072f11e72 100644
--- a/paddlex/model.py
+++ b/paddlex/model.py
@@ -81,12 +81,13 @@ def _build_predictor(self):
predict_kwargs = deepcopy(self._config.Predict)
model_dir = predict_kwargs.pop("model_dir", None)
+ device = self._config.Global.get("device", None)
UNSET = object()
- device = self._config.Global.get("device", None)
kernel_option = predict_kwargs.pop("kernel_option", UNSET)
use_hpip = predict_kwargs.pop("use_hpip", UNSET)
hpi_config = predict_kwargs.pop("hpi_config", UNSET)
+ genai_config = predict_kwargs.pop("genai_config", UNSET)
create_predictor_kwargs = {}
if kernel_option is not UNSET:
@@ -99,10 +100,12 @@ def _build_predictor(self):
create_predictor_kwargs["use_hpip"] = False
if hpi_config is not UNSET:
create_predictor_kwargs["hpi_config"] = hpi_config
+ if genai_config is not UNSET:
+ create_predictor_kwargs["genai_config"] = genai_config
predictor = create_predictor(
self._model_name,
- model_dir,
+ model_dir=model_dir,
device=device,
**create_predictor_kwargs,
)
diff --git a/paddlex/modules/base/trainer.py b/paddlex/modules/base/trainer.py
index 61daa57b84..78b3e149ad 100644
--- a/paddlex/modules/base/trainer.py
+++ b/paddlex/modules/base/trainer.py
@@ -84,7 +84,7 @@ def train(self, *args, **kwargs):
"uniform_output_enabled", True
),
"export_with_pir": export_with_pir,
- "ips": self.train_config.get("dist_ips", None)
+ "ips": self.train_config.get("dist_ips", None),
}
)
diff --git a/paddlex/modules/doc_vlm/model_list.py b/paddlex/modules/doc_vlm/model_list.py
index 5886d04eed..c09af0b36f 100644
--- a/paddlex/modules/doc_vlm/model_list.py
+++ b/paddlex/modules/doc_vlm/model_list.py
@@ -13,4 +13,10 @@
# limitations under the License.
-MODELS = ["PP-DocBee-2B", "PP-DocBee-7B", "PP-Chart2Table", "PP-DocBee2-3B"]
+MODELS = [
+ "PP-DocBee-2B",
+ "PP-DocBee-7B",
+ "PP-Chart2Table",
+ "PP-DocBee2-3B",
+ "PaddleOCR-VL-0.9B",
+]
diff --git a/paddlex/modules/object_detection/model_list.py b/paddlex/modules/object_detection/model_list.py
index 96fa8e8085..0df27337fb 100644
--- a/paddlex/modules/object_detection/model_list.py
+++ b/paddlex/modules/object_detection/model_list.py
@@ -83,4 +83,5 @@
"PP-DocLayout-S",
"PP-DocLayout_plus-L",
"PP-DocBlockLayout",
+ "PP-DocLayoutV2",
]
diff --git a/paddlex/modules/text_recognition/model_list.py b/paddlex/modules/text_recognition/model_list.py
index 8b2698aebd..5976e4b5dd 100644
--- a/paddlex/modules/text_recognition/model_list.py
+++ b/paddlex/modules/text_recognition/model_list.py
@@ -39,4 +39,9 @@
"en_PP-OCRv5_mobile_rec",
"el_PP-OCRv5_mobile_rec",
"th_PP-OCRv5_mobile_rec",
+ "arabic_PP-OCRv5_mobile_rec",
+ "cyrillic_PP-OCRv5_mobile_rec",
+ "devanagari_PP-OCRv5_mobile_rec",
+ "ta_PP-OCRv5_mobile_rec",
+ "te_PP-OCRv5_mobile_rec",
]
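The five newly registered multilingual PP-OCRv5 recognizers can be exercised through the standard module API like any other text-recognition model. A hedged sketch (the cropped text-line image path is a placeholder):

```python
from paddlex import create_model

# Any of the new names works here, e.g. the Arabic recognizer.
model = create_model("arabic_PP-OCRv5_mobile_rec")
for res in model.predict("cropped_text_line.png", batch_size=1):
    res.print()
```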
diff --git a/paddlex/paddlex_cli.py b/paddlex/paddlex_cli.py
index 590380a150..a5c27a1e4a 100644
--- a/paddlex/paddlex_cli.py
+++ b/paddlex/paddlex_cli.py
@@ -29,10 +29,12 @@
from .utils import logging
from .utils.deps import (
get_dep_version,
- get_paddle2onnx_spec,
+ get_genai_dep_specs,
+ get_genai_fastdeploy_spec,
+ get_paddle2onnx_dep_specs,
get_serving_dep_specs,
+ is_dep_available,
is_paddle2onnx_plugin_available,
- require_paddle2onnx_plugin,
)
from .utils.env import get_paddle_cuda_version
from .utils.install import install_packages, uninstall_packages
@@ -225,12 +227,20 @@ def install(args):
"""install paddlex"""
def _install_serving_deps():
- reqs = get_serving_dep_specs()
- # Should we sort the requirements?
- install_packages(reqs)
+ try:
+ install_packages(get_serving_dep_specs())
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+ logging.info("Successfully installed the serving plugin")
def _install_paddle2onnx_deps():
- install_packages([get_paddle2onnx_spec()])
+ try:
+ install_packages(get_paddle2onnx_dep_specs())
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+ logging.info("Successfully installed the Paddle2ONNX plugin")
def _install_hpi_deps(device_type):
SUPPORTED_DEVICE_TYPES = ["cpu", "gpu", "npu"]
@@ -270,33 +280,107 @@ def _install_hpi_deps(device_type):
logging.info(
f"The high-performance inference plugin '{package}' is mutually exclusive with '{other_package}' (version {version} installed). Uninstalling '{other_package}'..."
)
- uninstall_packages([other_package])
+ try:
+ uninstall_packages([other_package])
+ except Exception:
+ logging.error("Failed to uninstall packages", exc_info=True)
+ sys.exit(1)
with importlib.resources.path("paddlex", hpip_links_file) as f:
version = get_dep_version(package)
- if version is None:
- install_packages([package], pip_install_opts=["--find-links", str(f)])
- else:
- response = input(
- f"The high-performance inference plugin is already installed (version {repr(version)}). Do you want to reinstall it? (y/n):"
- )
- if response.lower() in ["y", "yes"]:
- uninstall_packages([package])
+ try:
+ if version is None:
install_packages(
- [package],
- pip_install_opts=[
- "--find-links",
- str(f),
- ],
+ [package], pip_install_opts=["--find-links", str(f)]
)
else:
- return
+ response = input(
+ f"The high-performance inference plugin is already installed (version {repr(version)}). Do you want to reinstall it? (y/n):"
+ )
+ if response.lower() in ["y", "yes"]:
+ uninstall_packages([package])
+ install_packages(
+ [package],
+ pip_install_opts=[
+ "--find-links",
+ str(f),
+ ],
+ )
+ else:
+ return
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+
+ logging.info("Successfully installed the high-performance inference plugin")
if not is_paddle2onnx_plugin_available():
logging.info(
"The Paddle2ONNX plugin is not available. It is recommended to run `paddlex --install paddle2onnx` to install the Paddle2ONNX plugin to use the full functionality of high-performance inference."
)
+ def _install_genai_deps(plugin_types):
+ fd_plugin_types = []
+ not_fd_plugin_types = []
+ for plugin_type in plugin_types:
+ if "fastdeploy" in plugin_type:
+ fd_plugin_types.append(plugin_type)
+ else:
+ not_fd_plugin_types.append(plugin_type)
+ if fd_plugin_types:
+ if not is_dep_available("paddlepaddle"):
+ sys.exit("Please install PaddlePaddle first.")
+ import paddle.device
+
+ if not paddle.device.is_compiled_with_cuda():
+ sys.exit("Currently, only the GPU version of FastDeploy is supported.")
+ cap = paddle.device.cuda.get_device_capability()
+ if cap in ((8, 0), (9, 0)):
+ index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/"
+ elif cap in ((8, 6), (8, 9)):
+ index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/"
+ else:
+ sys.exit(
+ f"The compute capability of the GPU is {cap[0]}.{cap[1]}, which is not supported. The supported compute capabilities are 8.0, 8.6, 8.9, and 9.0."
+ )
+ try:
+ install_packages(
+ [get_genai_fastdeploy_spec("gpu")],
+ pip_install_opts=["--extra-index-url", index_url],
+ )
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+
+ reqs = []
+ for plugin_type in not_fd_plugin_types:
+ try:
+ r = get_genai_dep_specs(plugin_type)
+ except ValueError:
+ logging.error("Invalid generative AI plugin type: %s", plugin_type)
+ sys.exit(2)
+ reqs += r
+ try:
+ install_packages(reqs, constraints="required")
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+
+ for plugin_type in plugin_types:
+ if "vllm" in plugin_type or "sglang" in plugin_type:
+ try:
+ install_packages(["wheel"], constraints="required")
+ install_packages(["flash-attn == 2.8.2"], constraints="required")
+ except Exception:
+ logging.error("Installation failed", exc_info=True)
+ sys.exit(1)
+ break
+
+ logging.info(
+ "Successfully installed the generative AI plugin"
+ + ("s" if len(plugin_types) > 1 else "")
+ )
+
# Enable debug info
os.environ["PADDLE_PDX_DEBUG"] = "True"
# Disable eager initialization
@@ -322,10 +406,10 @@ def _install_hpi_deps(device_type):
hpi_plugins = list(filter(lambda name: name.startswith("hpi-"), plugins))
if hpi_plugins:
- for i in hpi_plugins:
- plugins.remove(i)
+ for p in hpi_plugins:
+ plugins.remove(p)
if plugins:
- logging.error("`hpi` cannot be used together with other plugins.")
+ logging.error("`hpi-xxx` cannot be used together with other plugins.")
sys.exit(2)
if len(hpi_plugins) > 1 or len(hpi_plugins[0].split("-")) != 2:
logging.error(
@@ -338,10 +422,29 @@ def _install_hpi_deps(device_type):
_install_hpi_deps(device_type=device_type)
return
+ genai_plugins = list(filter(lambda name: name.startswith("genai-"), plugins))
+ if genai_plugins:
+ for p in genai_plugins:
+ plugins.remove(p)
+ if plugins:
+ logging.error("`genai-xxx` cannot be used together with other plugins.")
+ sys.exit(2)
+ genai_plugin_types = [p[len("genai-") :] for p in genai_plugins]
+ _install_genai_deps(genai_plugin_types)
+ return
+
+ all_repo_names = get_all_supported_repo_names()
+ unknown_plugins = []
+ for p in plugins:
+ if p not in all_repo_names:
+ unknown_plugins.append(p)
+ if unknown_plugins:
+ logging.error("Unknown plugins: %s", unknown_plugins)
+ sys.exit(2)
if plugins:
repo_names = plugins
elif len(plugins) == 0:
- repo_names = get_all_supported_repo_names()
+ repo_names = all_repo_names
setup(
repo_names=repo_names,
no_deps=args.no_deps,
@@ -374,19 +477,31 @@ def pipeline_predict(
def serve(pipeline, *, device, use_hpip, hpi_config, host, port):
- from .inference.serving.basic_serving import create_pipeline_app, run_server
+ try:
+ from .inference.serving.basic_serving import create_pipeline_app, run_server
+ except RuntimeError:
+ logging.error("Failed to load the serving module", exc_info=True)
+ sys.exit(1)
pipeline_config = load_pipeline_config(pipeline)
- pipeline = create_pipeline(
- config=pipeline_config, device=device, use_hpip=use_hpip, hpi_config=hpi_config
- )
+ try:
+ pipeline = create_pipeline(
+ config=pipeline_config,
+ device=device,
+ use_hpip=use_hpip,
+ hpi_config=hpi_config,
+ )
+ except Exception:
+ logging.error("Failed to create the pipeline", exc_info=True)
+ sys.exit(1)
app = create_pipeline_app(pipeline, pipeline_config)
run_server(app, host=host, port=port)
# TODO: Move to another module
def paddle_to_onnx(paddle_model_dir, onnx_model_dir, *, opset_version):
- require_paddle2onnx_plugin()
+ if not is_paddle2onnx_plugin_available():
+ sys.exit("Please install the Paddle2ONNX plugin first.")
ONNX_MODEL_FILENAME = f"{MODEL_FILE_PREFIX}.onnx"
CONFIG_FILENAME = f"{MODEL_FILE_PREFIX}.yml"
@@ -448,6 +563,9 @@ def _copy_additional_files(input_dir, output_dir):
shutil.copy(src_path, dst_path)
logging.info(f"Copied {src_path} to {dst_path}")
+ if not paddle_model_dir:
+ sys.exit("PaddlePaddle model directory must be specified")
+
paddle_model_dir = Path(paddle_model_dir)
if not onnx_model_dir:
onnx_model_dir = paddle_model_dir
@@ -476,7 +594,6 @@ def main():
if args.install is not None:
install(args)
- return
elif args.serve:
serve(
args.pipeline,
@@ -486,14 +603,12 @@ def main():
host=args.host,
port=args.port,
)
- return
elif args.paddle2onnx:
paddle_to_onnx(
args.paddle_model_dir,
args.onnx_model_dir,
opset_version=args.opset_version,
)
- return
else:
if args.get_pipeline_config is not None:
interactive_get_pipeline(args.get_pipeline_config, args.save_path)
@@ -506,13 +621,16 @@ def main():
pipeline_args_dict[arg_name] = getattr(args, arg_name)
else:
logging.warning(f"Argument {arg_name} is missing in args")
- pipeline_predict(
- args.pipeline,
- args.input,
- args.device,
- args.save_path,
- use_hpip=args.use_hpip or None,
- hpi_config=args.hpi_config,
- **pipeline_args_dict,
- )
- return
+ try:
+ pipeline_predict(
+ args.pipeline,
+ args.input,
+ args.device,
+ args.save_path,
+ use_hpip=args.use_hpip or None,
+ hpi_config=args.hpi_config,
+ **pipeline_args_dict,
+ )
+ except Exception:
+ logging.error("Pipeline prediction failed", exc_info=True)
+ sys.exit(1)
diff --git a/paddlex/repo_apis/PaddleOCR_api/configs/arabic_PP-OCRv5_mobile_rec.yaml b/paddlex/repo_apis/PaddleOCR_api/configs/arabic_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..c087ce55ca
--- /dev/null
+++ b/paddlex/repo_apis/PaddleOCR_api/configs/arabic_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,141 @@
+Global:
+ model_name: arabic_PP-OCRv5_mobile_rec # To use static model for inference.
+ debug: false
+ use_gpu: true
+ epoch_num: 75
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/arabic_rec_ppocr_v5
+ save_epoch_step: 10
+ eval_batch_step: [0, 1000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img:
+ character_dict_path: ./ppocr/utils/dict/ppocrv5_arabic_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_arabic_ppocrv5.txt
+ d2s_train_image_shape: [3, 48, 320]
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.0005
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR_LCNet
+ Transform:
+ Backbone:
+ name: PPLCNetV3
+ scale: 0.95
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 120
+ depth: 2
+ hidden_dims: 120
+ kernel_size: [1, 3]
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - NRTRHead:
+ nrtr_dim: 384
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - NRTRLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: MultiScaleDataSet
+ ds_width: false
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ max_text_length: *max_text_length
+ - RecAug:
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ sampler:
+ name: MultiScaleSampler
+ scales: [[320, 32], [320, 48], [320, 64]]
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [8, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: true
+ batch_size_per_card: *bs
+ drop_last: true
+ num_workers: 8
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/paddlex/repo_apis/PaddleOCR_api/configs/cyrillic_PP-OCRv5_mobile_rec.yaml b/paddlex/repo_apis/PaddleOCR_api/configs/cyrillic_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..0d887ea978
--- /dev/null
+++ b/paddlex/repo_apis/PaddleOCR_api/configs/cyrillic_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,141 @@
+Global:
+ model_name: cyrillic_PP-OCRv5_mobile_rec # To use static model for inference.
+ debug: false
+ use_gpu: true
+ epoch_num: 75
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/cyrillic_rec_ppocr_v5
+ save_epoch_step: 10
+ eval_batch_step: [0, 1000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img:
+ character_dict_path: ./ppocr/utils/dict/ppocrv5_cyrillic_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_cyrillic_ppocrv5.txt
+ d2s_train_image_shape: [3, 48, 320]
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.0005
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR_LCNet
+ Transform:
+ Backbone:
+ name: PPLCNetV3
+ scale: 0.95
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 120
+ depth: 2
+ hidden_dims: 120
+ kernel_size: [1, 3]
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - NRTRHead:
+ nrtr_dim: 384
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - NRTRLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: MultiScaleDataSet
+ ds_width: false
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ max_text_length: *max_text_length
+ - RecAug:
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ sampler:
+ name: MultiScaleSampler
+ scales: [[320, 32], [320, 48], [320, 64]]
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [8, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: true
+ batch_size_per_card: *bs
+ drop_last: true
+ num_workers: 8
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/paddlex/repo_apis/PaddleOCR_api/configs/devanagari_PP-OCRv5_mobile_rec.yaml b/paddlex/repo_apis/PaddleOCR_api/configs/devanagari_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..dd85e5178d
--- /dev/null
+++ b/paddlex/repo_apis/PaddleOCR_api/configs/devanagari_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,141 @@
+Global:
+ model_name: devanagari_PP-OCRv5_mobile_rec # To use static model for inference.
+ debug: false
+ use_gpu: true
+ epoch_num: 75
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/devanagari_rec_ppocr_v5
+ save_epoch_step: 10
+ eval_batch_step: [0, 1000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img:
+ character_dict_path: ./ppocr/utils/dict/ppocrv5_devanagari_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_devanagari_ppocrv5.txt
+ d2s_train_image_shape: [3, 48, 320]
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.0005
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR_LCNet
+ Transform:
+ Backbone:
+ name: PPLCNetV3
+ scale: 0.95
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 120
+ depth: 2
+ hidden_dims: 120
+ kernel_size: [1, 3]
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - NRTRHead:
+ nrtr_dim: 384
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - NRTRLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: MultiScaleDataSet
+ ds_width: false
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ max_text_length: *max_text_length
+ - RecAug:
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ sampler:
+ name: MultiScaleSampler
+ scales: [[320, 32], [320, 48], [320, 64]]
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [8, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: true
+ batch_size_per_card: *bs
+ drop_last: true
+ num_workers: 8
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/paddlex/repo_apis/PaddleOCR_api/configs/ta_PP-OCRv5_mobile_rec.yaml b/paddlex/repo_apis/PaddleOCR_api/configs/ta_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..17f17e7643
--- /dev/null
+++ b/paddlex/repo_apis/PaddleOCR_api/configs/ta_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,141 @@
+Global:
+ model_name: ta_PP-OCRv5_mobile_rec # To use static model for inference.
+ debug: false
+ use_gpu: true
+ epoch_num: 75
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/ta_rec_ppocr_v5
+ save_epoch_step: 10
+ eval_batch_step: [0, 1000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img:
+ character_dict_path: ./ppocr/utils/dict/ppocrv5_ta_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_ta_ppocrv5.txt
+ d2s_train_image_shape: [3, 48, 320]
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.0005
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR_LCNet
+ Transform:
+ Backbone:
+ name: PPLCNetV3
+ scale: 0.95
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 120
+ depth: 2
+ hidden_dims: 120
+ kernel_size: [1, 3]
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - NRTRHead:
+ nrtr_dim: 384
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - NRTRLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: MultiScaleDataSet
+ ds_width: false
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ max_text_length: *max_text_length
+ - RecAug:
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ sampler:
+ name: MultiScaleSampler
+ scales: [[320, 32], [320, 48], [320, 64]]
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [8, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: true
+ batch_size_per_card: *bs
+ drop_last: true
+ num_workers: 8
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/paddlex/repo_apis/PaddleOCR_api/configs/te_PP-OCRv5_mobile_rec.yaml b/paddlex/repo_apis/PaddleOCR_api/configs/te_PP-OCRv5_mobile_rec.yaml
new file mode 100644
index 0000000000..75c0fc98f1
--- /dev/null
+++ b/paddlex/repo_apis/PaddleOCR_api/configs/te_PP-OCRv5_mobile_rec.yaml
@@ -0,0 +1,141 @@
+Global:
+ model_name: te_PP-OCRv5_mobile_rec # To use static model for inference.
+ debug: false
+ use_gpu: true
+ epoch_num: 75
+ log_smooth_window: 20
+ print_batch_step: 10
+ save_model_dir: ./output/te_rec_ppocr_v5
+ save_epoch_step: 10
+ eval_batch_step: [0, 1000]
+ cal_metric_during_train: true
+ pretrained_model:
+ checkpoints:
+ save_inference_dir:
+ use_visualdl: false
+ infer_img:
+ character_dict_path: ./ppocr/utils/dict/ppocrv5_te_dict.txt
+ max_text_length: &max_text_length 25
+ infer_mode: false
+ use_space_char: true
+ distributed: true
+ save_res_path: ./output/rec/predicts_te_ppocrv5.txt
+ d2s_train_image_shape: [3, 48, 320]
+
+
+Optimizer:
+ name: Adam
+ beta1: 0.9
+ beta2: 0.999
+ lr:
+ name: Cosine
+ learning_rate: 0.0005
+ warmup_epoch: 5
+ regularizer:
+ name: L2
+ factor: 3.0e-05
+
+
+Architecture:
+ model_type: rec
+ algorithm: SVTR_LCNet
+ Transform:
+ Backbone:
+ name: PPLCNetV3
+ scale: 0.95
+ Head:
+ name: MultiHead
+ head_list:
+ - CTCHead:
+ Neck:
+ name: svtr
+ dims: 120
+ depth: 2
+ hidden_dims: 120
+ kernel_size: [1, 3]
+ use_guide: True
+ Head:
+ fc_decay: 0.00001
+ - NRTRHead:
+ nrtr_dim: 384
+ max_text_length: *max_text_length
+
+Loss:
+ name: MultiLoss
+ loss_config_list:
+ - CTCLoss:
+ - NRTRLoss:
+
+PostProcess:
+ name: CTCLabelDecode
+
+Metric:
+ name: RecMetric
+ main_indicator: acc
+ ignore_space: False
+
+Train:
+ dataset:
+ name: MultiScaleDataSet
+ ds_width: false
+ data_dir: ./train_data/
+ ext_op_transform_idx: 1
+ label_file_list:
+ - ./train_data/train_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - RecConAug:
+ prob: 0.5
+ ext_data_num: 2
+ image_shape: [48, 320, 3]
+ max_text_length: *max_text_length
+ - RecAug:
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ sampler:
+ name: MultiScaleSampler
+ scales: [[320, 32], [320, 48], [320, 64]]
+ first_bs: &bs 128
+ fix_bs: false
+ divided_factor: [8, 16] # w, h
+ is_training: True
+ loader:
+ shuffle: true
+ batch_size_per_card: *bs
+ drop_last: true
+ num_workers: 8
+Eval:
+ dataset:
+ name: SimpleDataSet
+ data_dir: ./train_data/
+ label_file_list:
+ - ./train_data/val_list.txt
+ transforms:
+ - DecodeImage:
+ img_mode: BGR
+ channel_first: false
+ - MultiLabelEncode:
+ gtc_encode: NRTRLabelEncode
+ - RecResizeImg:
+ image_shape: [3, 48, 320]
+ - KeepKeys:
+ keep_keys:
+ - image
+ - label_ctc
+ - label_gtc
+ - length
+ - valid_ratio
+ loader:
+ shuffle: true
+ drop_last: false
+ batch_size_per_card: 128
+ num_workers: 4
diff --git a/paddlex/repo_apis/PaddleOCR_api/text_rec/register.py b/paddlex/repo_apis/PaddleOCR_api/text_rec/register.py
index 5dc7692daa..57b984a6fe 100644
--- a/paddlex/repo_apis/PaddleOCR_api/text_rec/register.py
+++ b/paddlex/repo_apis/PaddleOCR_api/text_rec/register.py
@@ -268,3 +268,48 @@
"supported_apis": ["train", "evaluate", "predict", "export"],
}
)
+
+register_model_info(
+ {
+ "model_name": "arabic_PP-OCRv5_mobile_rec",
+ "suite": "TextRec",
+ "config_path": osp.join(PDX_CONFIG_DIR, "arabic_PP-OCRv5_mobile_rec.yaml"),
+ "supported_apis": ["train", "evaluate", "predict", "export"],
+ }
+)
+
+register_model_info(
+ {
+ "model_name": "te_PP-OCRv5_mobile_rec",
+ "suite": "TextRec",
+ "config_path": osp.join(PDX_CONFIG_DIR, "te_PP-OCRv5_mobile_rec.yaml"),
+ "supported_apis": ["train", "evaluate", "predict", "export"],
+ }
+)
+
+register_model_info(
+ {
+ "model_name": "ta_PP-OCRv5_mobile_rec",
+ "suite": "TextRec",
+ "config_path": osp.join(PDX_CONFIG_DIR, "ta_PP-OCRv5_mobile_rec.yaml"),
+ "supported_apis": ["train", "evaluate", "predict", "export"],
+ }
+)
+
+register_model_info(
+ {
+ "model_name": "devanagari_PP-OCRv5_mobile_rec",
+ "suite": "TextRec",
+ "config_path": osp.join(PDX_CONFIG_DIR, "devanagari_PP-OCRv5_mobile_rec.yaml"),
+ "supported_apis": ["train", "evaluate", "predict", "export"],
+ }
+)
+
+register_model_info(
+ {
+ "model_name": "cyrillic_PP-OCRv5_mobile_rec",
+ "suite": "TextRec",
+ "config_path": osp.join(PDX_CONFIG_DIR, "cyrillic_PP-OCRv5_mobile_rec.yaml"),
+ "supported_apis": ["train", "evaluate", "predict", "export"],
+ }
+)
diff --git a/paddlex/utils/deps.py b/paddlex/utils/deps.py
index 6463898a94..c9d99f6720 100644
--- a/paddlex/utils/deps.py
+++ b/paddlex/utils/deps.py
@@ -29,6 +29,8 @@
)
_COLLECTIVE_EXTRA_NAMES = {"base", "plugins", "all"}
+_SUPPORTED_GENAI_ENGINE_BACKENDS = ["fastdeploy-server", "vllm-server", "sglang-server"]
+
class DependencyError(Exception):
pass
@@ -63,18 +65,21 @@ def _get_extras():
EXTRAS = _get_extras()
-def _get_dep_specs():
+def _get_base_dep_specs(required_only=False):
dep_specs = defaultdict(list)
for dep_spec in importlib.metadata.requires("paddlex"):
extra_name, dep_spec = _get_extra_name_and_remove_extra_marker(dep_spec)
- if extra_name is None or extra_name == "all":
+ if (required_only and extra_name is None) or (
+ not required_only and (extra_name is None or extra_name == "base")
+ ):
dep_spec = dep_spec.rstrip()
req = Requirement(dep_spec)
dep_specs[req.name].append(dep_spec)
return dep_specs
-DEP_SPECS = _get_dep_specs()
+BASE_DEP_SPECS = _get_base_dep_specs()
+REQUIRED_DEP_SPECS = _get_base_dep_specs(required_only=True)
def get_dep_version(dep):
@@ -85,33 +90,32 @@ def get_dep_version(dep):
@lru_cache()
-def is_dep_available(dep, /, check_version=None):
- # Currently for several special deps we check if the import packages exist.
- if dep in ("paddlepaddle", "paddle-custom-device", "ultra-infer") and check_version:
+def is_dep_available(dep, /, check_version=False):
+ if (
+ dep in ("paddlepaddle", "paddle-custom-device", "ultra-infer", "fastdeploy")
+ and check_version
+ ):
raise ValueError(
- "Currently, `check_version` is not allowed to be `True` for `paddlepaddle`, `paddle-custom-device`, and `ultra-infer`."
+ "`check_version` is not allowed to be `True` for `paddlepaddle`, `paddle-custom-device`, `ultra-infer`, and `fastdeploy`."
)
+ # Currently for several special deps we check if the import packages exist.
if dep == "paddlepaddle":
return importlib.util.find_spec("paddle") is not None
elif dep == "paddle-custom-device":
return importlib.util.find_spec("paddle_custom_device") is not None
elif dep == "ultra-infer":
return importlib.util.find_spec("ultra_infer") is not None
- else:
- if dep != "paddle2onnx" and dep not in DEP_SPECS:
- raise ValueError("Unknown dependency")
- if check_version is None:
- if dep == "paddle2onnx":
- check_version = True
- else:
- check_version = False
+ elif dep == "fastdeploy":
+ return importlib.util.find_spec("fastdeploy") is not None
version = get_dep_version(dep)
if version is None:
return False
if check_version:
- if dep == "paddle2onnx":
- return Version(version) in Requirement(get_paddle2onnx_spec()).specifier
- for dep_spec in DEP_SPECS[dep]:
+ if dep not in BASE_DEP_SPECS:
+ raise ValueError(
+ "Currently, `check_version=True` is supported only for base dependencies."
+ )
+ for dep_spec in BASE_DEP_SPECS[dep]:
if Version(version) in Requirement(dep_spec).specifier:
return True
else:
@@ -252,5 +256,68 @@ def require_paddle2onnx_plugin():
)
-def get_paddle2onnx_spec():
- return "paddle2onnx == 2.0.2rc3"
+def get_paddle2onnx_dep_specs():
+ dep_specs = []
+ for item in EXTRAS["paddle2onnx"].values():
+ dep_specs += item
+ return dep_specs
+
+
+def is_genai_engine_plugin_available(backend="any"):
+ if backend != "any" and backend not in _SUPPORTED_GENAI_ENGINE_BACKENDS:
+ raise ValueError(f"Unknown backend type: {backend}")
+ if backend == "any":
+ for be in _SUPPORTED_GENAI_ENGINE_BACKENDS:
+ if is_genai_engine_plugin_available(be):
+ return True
+ return False
+ else:
+ if "fastdeploy" in backend:
+ return is_dep_available("fastdeploy")
+ elif is_extra_available(f"genai-{backend}"):
+ if "vllm" in backend or "sglang" in backend:
+ return is_dep_available("flash-attn")
+ return True
+ return False
+
+
+def require_genai_engine_plugin(backend="any"):
+ if not is_genai_engine_plugin_available(backend):
+ if backend == "any":
+ prefix = "The generative AI engine plugins are"
+ else:
+ prefix = f"The generative AI {repr(backend)} engine plugin is"
+ raise RuntimeError(f"{prefix} not available. Please install it properly.")
+
+
+def is_genai_client_plugin_available():
+ return is_extra_available("genai-client")
+
+
+def require_genai_client_plugin():
+ if not is_genai_client_plugin_available():
+ raise RuntimeError(
+ "The generative AI client plugin is not available. Please install it properly."
+ )
+
+
+def get_genai_fastdeploy_spec(device_type):
+ SUPPORTED_DEVICE_TYPES = ("gpu",)
+ if device_type not in SUPPORTED_DEVICE_TYPES:
+ raise ValueError(f"Unsupported device type: {device_type}")
+ if device_type == "gpu":
+ return "fastdeploy-gpu == 2.0.3"
+ else:
+ raise AssertionError
+
+
+def get_genai_dep_specs(type):
+ if type != "client" and type not in _SUPPORTED_GENAI_ENGINE_BACKENDS:
+ raise ValueError(f"Invalid type: {type}")
+ if "fastdeploy" in type:
+ raise ValueError(f"{repr(type)} is not supported")
+
+ dep_specs = []
+ for item in EXTRAS[f"genai-{type}"].values():
+ dep_specs += item
+ return dep_specs
diff --git a/paddlex/utils/flags.py b/paddlex/utils/flags.py
index 3c634f54df..b49b2160e2 100644
--- a/paddlex/utils/flags.py
+++ b/paddlex/utils/flags.py
@@ -29,6 +29,7 @@
"USE_PIR_TRT",
"DISABLE_DEV_MODEL_WL",
"DISABLE_CINN_MODEL_WL",
+ "DISABLE_DEVICE_FALLBACK",
]
@@ -60,6 +61,9 @@ def get_flag_from_env_var(name, default, format_func=str):
ENABLE_MKLDNN_BYDEFAULT = get_flag_from_env_var(
"PADDLE_PDX_ENABLE_MKLDNN_BYDEFAULT", True
)
+DISABLE_DEVICE_FALLBACK = get_flag_from_env_var(
+ "PADDLE_PDX_DISABLE_DEVICE_FALLBACK", False
+)
MODEL_SOURCE = os.environ.get("PADDLE_PDX_MODEL_SOURCE", "huggingface").lower()
diff --git a/paddlex/utils/install.py b/paddlex/utils/install.py
index 89e1153e4f..3df89ffc77 100644
--- a/paddlex/utils/install.py
+++ b/paddlex/utils/install.py
@@ -23,35 +23,47 @@
def install_packages_from_requirements_file(
- requirements_file_path, pip_install_opts=None
+ requirements_file_path,
+ pip_install_opts=None,
+ constraints="base",
):
- from .deps import DEP_SPECS
+ from .deps import BASE_DEP_SPECS, REQUIRED_DEP_SPECS
- # TODO: Precompute or cache the constraints
- with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
- for reqs in DEP_SPECS.values():
- for req in reqs:
- req = Requirement(req)
- if req.marker and not req.marker.evaluate():
- continue
- if req.url:
- req = f"{req.name}@{req.url}"
- else:
- req = f"{req.name}{req.specifier}"
- f.write(req + "\n")
- constraints_file_path = f.name
+ if constraints not in ("base", "required", "none"):
+ raise ValueError(f"Invalid constraints setting: {constraints}")
args = [
sys.executable,
"-m",
"pip",
"install",
- "-c",
- constraints_file_path,
*(pip_install_opts or []),
"-r",
requirements_file_path,
]
+
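+    # "base" constrains the installation against all base dependency specs,
+    # "required" against the required subset only, and "none" skips the
+    # constraints file entirely.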
+ if constraints == "base":
+ dep_specs = BASE_DEP_SPECS
+ elif constraints == "required":
+ dep_specs = REQUIRED_DEP_SPECS
+ else:
+ dep_specs = None
+ if dep_specs:
+ # TODO: Precompute or cache the constraints
+ with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
+ for reqs in dep_specs.values():
+ for req in reqs:
+ req = Requirement(req)
+ if req.marker and not req.marker.evaluate():
+ continue
+ if req.url:
+ req = f"{req.name}@{req.url}"
+ else:
+ req = f"{req.name}{req.specifier}"
+ f.write(req + "\n")
+ constraints_file_path = f.name
+ args.extend(["-c", constraints_file_path])
+
logging.debug("Command: %s", args)
try:
@@ -60,14 +72,16 @@ def install_packages_from_requirements_file(
os.unlink(constraints_file_path)
-def install_packages(requirements, pip_install_opts=None):
+def install_packages(requirements, pip_install_opts=None, constraints="base"):
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
for req in requirements:
f.write(req + "\n")
reqs_file_path = f.name
try:
return install_packages_from_requirements_file(
- reqs_file_path, pip_install_opts=pip_install_opts
+ reqs_file_path,
+ pip_install_opts=pip_install_opts,
+ constraints=constraints,
)
finally:
os.unlink(reqs_file_path)
diff --git a/paddlex/utils/pipeline_arguments.py b/paddlex/utils/pipeline_arguments.py
index b7e276873e..ff31dc1eec 100644
--- a/paddlex/utils/pipeline_arguments.py
+++ b/paddlex/utils/pipeline_arguments.py
@@ -593,6 +593,90 @@ def validator(cli_input: str) -> cli_expected_type:
"help": "Determines whether to use end-to-end wireless table recognition model",
},
],
+ "PaddleOCR-VL": [
+ {
+ "name": "--use_doc_orientation_classify",
+ "type": bool,
+ "help": "Determines whether to use document orientation classification",
+ },
+ {
+ "name": "--use_doc_unwarping",
+ "type": bool,
+ "help": "Determines whether to use document unwarping",
+ },
+ {
+ "name": "--use_layout_detection",
+ "type": bool,
+ "help": "Determines whether to use layout detection",
+ },
+ {
+ "name": "--use_chart_recognition",
+ "type": bool,
+ "help": "Determines whether to use document chart recognition",
+ },
+ {
+ "name": "--layout_threshold",
+ "type": custom_type(Optional[Union[float, Dict[int, float]]]),
+ "help": "Determines confidence threshold for layout detection",
+ },
+ {
+ "name": "--layout_nms",
+ "type": bool,
+ "help": "Determines whether to use non maximum suppression",
+ },
+ {
+ "name": "--layout_unclip_ratio",
+ "type": custom_type(
+ Optional[Union[float, Tuple[float, float], Dict[int, Tuple]]]
+ ),
+ "help": "Determines unclip ratio for layout detection boxes",
+ },
+ {
+ "name": "--layout_merge_bboxes_mode",
+ "type": custom_type(Optional[Union[str, Dict[int, str]]]),
+ "help": "Determines merge mode for layout detection bboxes, 'union', 'large' or 'small'",
+ },
+ {
+ "name": "--use_queues",
+ "type": bool,
+ "help": "Determines whether to use queues",
+ },
+ {
+ "name": "--prompt_label",
+ "type": custom_type(Optional[str]),
+ "help": "Sets the prompt label when not using layout detection",
+ },
+ {
+ "name": "--format_block_content",
+ "type": bool,
+ "help": "Determines whether to format block content",
+ },
+ {
+ "name": "--repetition_penalty",
+ "type": custom_type(Optional[float]),
+ "help": "",
+ },
+ {
+ "name": "--temperature",
+ "type": custom_type(Optional[float]),
+ "help": "Temperature parameter for VLLM model.",
+ },
+ {
+ "name": "--top_p",
+ "type": custom_type(Optional[float]),
+ "help": "Top-p parameter for VLLM model.",
+ },
+ {
+ "name": "--min_pixels",
+ "type": custom_type(Optional[int]),
+ "help": "Sets the minimum pixels for VLLM model.",
+ },
+ {
+ "name": "--max_pixels",
+ "type": custom_type(Optional[int]),
+ "help": "Sets the max_pixels pixels for VLLM model.",
+ },
+ ],
"ts_forecast": None,
"ts_anomaly_detection": None,
"ts_classification": None,
diff --git a/setup.py b/setup.py
index b8d4f19700..031a8ed5a9 100644
--- a/setup.py
+++ b/setup.py
@@ -20,9 +20,9 @@
from setuptools import find_packages, setup
-DEP_SPECS = {
+BASE_DEP_SPECS = {
"aiohttp": ">= 3.9",
- "aistudio_sdk": ">=0.3.5",
+ "aistudio-sdk": ">=0.3.5",
"bce-python-sdk": ">= 0.9",
"beautifulsoup4": "",
"chardet": "",
@@ -31,19 +31,17 @@
"decord": "== 0.6.0; (platform_machine == 'x86_64' or platform_machine == 'AMD64') and sys_platform != 'darwin'",
"einops": "",
"faiss-cpu": "",
- "fastapi": ">= 0.110",
"filelock": "",
- "filetype": ">= 1.2",
"ftfy": "",
"GPUtil": ">= 1.4",
- "huggingface_hub": "",
+ "huggingface-hub": "",
"imagesize": "",
"Jinja2": "",
"joblib": "",
- "langchain": ">= 0.2",
- "langchain-community": ">= 0.2",
+ "langchain": ">= 0.2, < 1.0",
+ "langchain-community": ">= 0.2, < 1.0",
"langchain-core": "",
- "langchain-openai": ">= 0.1",
+ "langchain-openai": ">= 0.1, < 1.0",
"lxml": "",
"matplotlib": "",
"modelscope": ">=1.28.0",
@@ -58,33 +56,33 @@
"prettytable": "",
"py-cpuinfo": "",
"pyclipper": "",
- "pycocotools": "<=2.0.8", # pycocotools upgrade incompatible since 2.0.9
+ "pycocotools": "<= 2.0.8", # pycocotools upgrade incompatible since 2.0.9
"pydantic": ">= 2",
"pypdfium2": ">= 4",
+ "python-bidi": "",
"PyYAML": "== 6.0.2",
"regex": "",
"requests": "",
"ruamel.yaml": "",
+ "safetensors": "",
"scikit-image": "",
"scikit-learn": "",
+ "sentencepiece": "",
"shapely": "",
"soundfile": "",
- "starlette": ">= 0.36",
"tiktoken": "",
"tokenizers": ">= 0.19",
"tqdm": "",
"typing-extensions": "",
"ujson": "",
- "uvicorn": ">= 0.16",
- "yarl": ">= 1.9",
}
REQUIRED_DEPS = [
- "aistudio_sdk",
+ "aistudio-sdk",
"chardet",
"colorlog",
"filelock",
- "huggingface_hub",
+ "huggingface-hub",
"modelscope",
"numpy",
"packaging",
@@ -121,6 +119,8 @@
# For the same reason as in `cv`
"pypdfium2",
"regex",
+ "safetensors",
+ "sentencepiece",
"tiktoken",
],
"ie": [
@@ -161,6 +161,7 @@
"opencv-contrib-python",
"pyclipper",
"pypdfium2",
+ "python-bidi",
"shapely",
],
"ocr": [
@@ -174,8 +175,11 @@
"premailer",
"pyclipper",
"pypdfium2",
+ "python-bidi",
"regex",
+ "safetensors",
"scikit-learn",
+ "sentencepiece",
"shapely",
"tiktoken",
"tokenizers",
@@ -199,14 +203,37 @@
],
},
"plugins": {
+ "genai-client": [
+ "openai >= 1.63",
+ ],
+ "genai-sglang-server": [
+ "einops",
+ "sglang [all] == 0.5.2",
+ "torch == 2.8.0",
+ "transformers",
+ "xformers",
+ ],
+ "genai-vllm-server": [
+ "einops",
+ "torch == 2.8.0",
+ "transformers",
+ "uvloop",
+ "vllm == 0.10.2",
+ "xformers",
+ ],
+ "paddle2onnx": [
+ "paddle2onnx == 2.0.2rc3",
+ ],
"serving": [
- "aiohttp",
- "bce-python-sdk",
- "fastapi",
- "filetype",
- "starlette",
- "uvicorn",
- "yarl",
+ "aiohttp >= 3.9",
+ "bce-python-sdk >= 0.9",
+ "fastapi >= 0.110",
+ "filetype >= 1.2",
+ "opencv-contrib-python == 4.10.0.84",
+ "pypdfium2 >= 4",
+ "starlette >= 0.36",
+ "uvicorn >= 0.16",
+ "yarl >= 1.9",
],
},
}
@@ -215,7 +242,7 @@
def _get_dep_specs(deps):
dep_specs = []
for dep in deps:
- val = DEP_SPECS[dep]
+ val = BASE_DEP_SPECS[dep]
if not isinstance(val, list):
val = [val]
for v in val:
@@ -243,16 +270,17 @@ def dependencies():
def extras():
dic = {}
- all_dep_specs = set()
- for group_name, group in EXTRAS.items():
- group_dep_specs = set()
- for extra_name, extra_deps in group.items():
- extra_dep_specs = _get_dep_specs(extra_deps)
- dic[extra_name] = _sort_dep_specs(extra_dep_specs)
- group_dep_specs.update(extra_dep_specs)
- dic[group_name] = _sort_dep_specs(group_dep_specs)
- all_dep_specs.update(group_dep_specs)
- dic["all"] = _sort_dep_specs(all_dep_specs)
+
+ base_dep_specs = set()
+ for extra_name, extra_deps in EXTRAS["base"].items():
+ extra_dep_specs = _get_dep_specs(extra_deps)
+ dic[extra_name] = _sort_dep_specs(extra_dep_specs)
+ base_dep_specs.update(extra_dep_specs)
+ dic["base"] = _sort_dep_specs(base_dep_specs)
+
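+ # Plugin extras are exposed individually and are no longer aggregated into
+ # per-group or "all" extras.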
+ for extra_name, extra_dep_specs in EXTRAS["plugins"].items():
+ dic[extra_name] = _sort_dep_specs(extra_dep_specs)
+
return dic
@@ -295,19 +323,23 @@ def _recursively_find(pattern, exts=None):
for p in itertools.chain(
_recursively_find("paddlex/configs/*", exts=[".yml", ".yaml"]),
):
- if Path(p).suffix in (".pyc", ".pyo"):
- continue
pkg_data.append(Path(p).relative_to("paddlex").as_posix())
pipeline_config = [
Path(p).relative_to("paddlex").as_posix()
for p in glob.glob("paddlex/pipelines/*.yaml")
]
- pkg_data.append("inference/pipelines/ppchatocrv3/ch_prompt.yaml")
pkg_data.extend(pipeline_config)
+ pkg_data.append("inference/pipelines/ppchatocrv3/ch_prompt.yaml")
pkg_data.append(".version")
pkg_data.append("hpip_links.html")
pkg_data.append("hpip_links_cu12.html")
pkg_data.append("inference/utils/hpi_model_info_collection.json")
+ genai_chat_templates = [
+ Path(p).relative_to("paddlex").as_posix()
+ for p in glob.glob("paddlex/inference/genai/chat_templates/*.jinja")
+ ]
+ pkg_data.extend(genai_chat_templates)
+ pkg_data.extend("inference/genai/models/")
ops_file_dir = "paddlex/ops"
ops_file_types = ["h", "hpp", "cpp", "cc", "cu"]
return pkgs, {
@@ -334,6 +366,10 @@ def _recursively_find(pattern, exts=None):
entry_points={
"console_scripts": [
"paddlex = paddlex.__main__:console_entry",
+ "paddlex_genai_server = paddlex.inference.genai.server:main",
+ ],
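+ # vLLM discovers and registers the PaddleX GenAI model implementations
+ # through its "vllm.general_plugins" entry-point group.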
+ "vllm.general_plugins": [
+ "register_paddlex_genai_models = paddlex.inference.genai.backends.vllm:register_models"
],
},
# PyPI package information
@@ -342,7 +378,6 @@ def _recursively_find(pattern, exts=None):
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
@@ -355,6 +390,7 @@ def _recursively_find(pattern, exts=None):
"Topic :: Software Development :: Libraries",
"Topic :: Software Development :: Libraries :: Python Modules",
],
- license="Apache 2.0",
+ license="Apache-2.0",
+ license_files=["LICENSE"],
keywords=["paddlepaddle"],
)