1 change: 1 addition & 0 deletions docs/models/supported_models.md
@@ -554,6 +554,7 @@ Specified using `--task generate`.
| `MolmoForCausalLM` | Molmo | T + I<sup>+</sup> | `allenai/Molmo-7B-D-0924`, `allenai/Molmo-7B-O-0924`, etc. | ✅︎ | ✅︎ | ✅︎ |
| `NVLM_D_Model` | NVLM-D 1.0 | T + I<sup>+</sup> | `nvidia/NVLM-D-72B`, etc. | | ✅︎ | ✅︎ |
| `Ovis` | Ovis2, Ovis1.6 | T + I<sup>+</sup> | `AIDC-AI/Ovis2-1B`, `AIDC-AI/Ovis1.6-Llama3.2-3B`, etc. | | ✅︎ | ✅︎ |
| `Ovis2_5` | Ovis2.5 | T + I<sup>+</sup> + V | `AIDC-AI/Ovis2.5-9B`, etc. | | | ✅︎ |
| `PaliGemmaForConditionalGeneration` | PaliGemma, PaliGemma 2 | T + I<sup>E</sup> | `google/paligemma-3b-pt-224`, `google/paligemma-3b-mix-224`, `google/paligemma2-3b-ft-docci-448`, etc. | | ✅︎ | ⚠️ |
| `Phi3VForCausalLM` | Phi-3-Vision, Phi-3.5-Vision | T + I<sup>E+</sup> | `microsoft/Phi-3-vision-128k-instruct`, `microsoft/Phi-3.5-vision-instruct`, etc. | | ✅︎ | ✅︎ |
| `Phi4MMForCausalLM` | Phi-4-multimodal | T + I<sup>+</sup> / T + A<sup>+</sup> / I<sup>+</sup> + A<sup>+</sup> | `microsoft/Phi-4-multimodal-instruct`, etc. | ✅︎ | ✅︎ | ✅︎ |
38 changes: 37 additions & 1 deletion examples/offline_inference/vision_language.py
@@ -794,7 +794,7 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
-       dtype="half",
+       dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )

@@ -812,6 +812,41 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
    )


# Ovis2_5
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="bfloat16",
        limit_mm_per_prompt={modality: 1},
    )
    if modality == "image":
        placeholder = "<image>"
    elif modality == "video":
        placeholder = "<video>"

    # Need to use the Ovis2 tokenizer, since the Ovis2.5 tokenizer is not
    # configured properly.
    tokenizer = AutoTokenizer.from_pretrained(
        "AIDC-AI/Ovis2-1B", trust_remote_code=True
    )
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
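Once the function is registered in the example map below, it can be exercised from the script's command line. A minimal invocation sketch, assuming the script's existing `--model-type` and `--modality` arguments apply to the new entry:

```bash
# Run the offline Ovis2.5 example on image inputs
# (flag names assumed from the script's existing argument parser).
python examples/offline_inference/vision_language.py \
    --model-type ovis2_5 \
    --modality image
```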


# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
    assert modality == "image"
@@ -1137,6 +1172,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
    "molmo": run_molmo,
    "NVLM_D": run_nvlm_d,
    "ovis": run_ovis,
    "ovis2_5": run_ovis2_5,
    "paligemma": run_paligemma,
    "paligemma2": run_paligemma2,
    "phi3_v": run_phi3v,
31 changes: 31 additions & 0 deletions examples/offline_inference/vision_language_multi_image.py
@@ -460,6 +460,36 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
    )


# Ovis2_5
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i, _ in enumerate(image_urls, start=1)
    )
    messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
> **Review comment:** should we use "AIDC-AI/Ovis2-1B" tokenizer here too?

    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
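As with the single-image script, the new loader can be tried from the example's CLI once it is mapped below. A sketch, assuming the script's existing `--model-type` flag; other flags (such as an image-count option) may vary:

```bash
# Run the multi-image Ovis2.5 example
# (flag name assumed from the script's existing argument parser).
python examples/offline_inference/vision_language_multi_image.py --model-type ovis2_5
```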


def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
    model_name = "mistral-community/pixtral-12b"

@@ -742,6 +772,7 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
    "mllama": load_mllama,
    "NVLM_D": load_nvlm_d,
    "ovis": load_ovis,
    "ovis2_5": load_ovis2_5,
    "phi3_v": load_phi3v,
    "phi4_mm": load_phi4mm,
    "pixtral_hf": load_pixtral_hf,
24 changes: 17 additions & 7 deletions scripts/docs/embedding&rerank单卡多模型部署.md
@@ -31,7 +31,6 @@ python launch_multi_models.py --models model1 model2 --max-model-len 8192
```bash
python launch_multi_models.py --models model1 model2 --env-preset performance
```


## Table of Contents

1. [System OS Configuration](#系统os配置)
@@ -59,11 +58,11 @@ cpupower idle-set -d 2
```bash
cpupower idle-set -d 3

```

### Installing vLLM-fork

Install vLLM version 1.22.0 following the standard vLLM-fork installation procedure.


## Launching the Service

### Starting Docker
@@ -80,8 +79,8 @@ cpupower idle-set -d 3
- JSON configuration file
- Individual environment variables set on the command line


#### Basic Launch Command

```bash
# Custom max-model-len
python launch_multi_models.py --models /data/models/gte-modernbert-base /data/models/gte-reranker-modernbert-base \
@@ -100,6 +99,7 @@ python launch_multi_models.py --models /data/models/gte-modernbert-base /data/mo
```
#### Environment Variable Configuration

##### Using Preset Configurations

```bash
# Default preset (conservative settings)
python launch_multi_models.py --models model1 model2 --env-preset default
@@ -109,6 +109,7 @@ python launch_multi_models.py --models model1 model2 --env-preset performance
```

**Environment variables included in the performance preset:**

```
VLLM_CONTIGUOUS_PA=false
VLLM_SKIP_WARMUP=false
@@ -128,12 +129,14 @@ PT_HPU_LAZY_MODE=1
```

##### Using a Configuration File

```bash
# Load environment variable configuration from a JSON file
python launch_multi_models.py --models model1 model2 --env-config env_config.json
```

Configuration file format (env_config.json):

```json
{
"VLLM_CONTIGUOUS_PA": "false",
@@ -155,6 +158,7 @@ python launch_multi_models.py --models model1 model2 --env-config env_config.jso
```

##### Setting Individual Environment Variables

```bash
# Set specific environment variables on the command line; mainly intended for testing
python launch_multi_models.py --models model1 model2 \
@@ -164,6 +168,7 @@ python launch_multi_models.py --models model1 model2 \
```

#### Advanced Configuration Example

```bash
python launch_multi_models.py \
--models /data/models/gte-modernbert-base /data/models/gte-reranker-modernbert-base \
@@ -174,20 +179,22 @@ python launch_multi_models.py \
--log-file custom_server.log
```


#### Listing Available Presets

```bash
python launch_multi_models.py --list-env-presets
```

#### Stopping All Models

```bash
python launch_multi_models.py --stop-all
```

### Method 2: Launching the Service Manually

#### Basic Launch Command

```bash
VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=true python3 -m \
vllm.entrypoints.openai.mm_api_server \
@@ -205,16 +212,16 @@ VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=true python3 -m \
```
This can be adjusted further based on model size.
- 2 models: 35% GPU memory per model
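As a sketch of how that share might be set explicitly: vLLM's standard `--gpu-memory-utilization` flag caps each server's share of device memory; whether the fork's `mm_api_server` accepts it is an assumption to verify:

```bash
# Hypothetical example: cap each of the two co-located models at 35% of device memory.
# --gpu-memory-utilization is the standard vLLM flag; confirm it applies to this fork.
VLLM_CONTIGUOUS_PA=false VLLM_SKIP_WARMUP=true python3 -m \
    vllm.entrypoints.openai.mm_api_server \
    --gpu-memory-utilization 0.35
```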


### Method 3: Dynamic Management via the API

#### Listing the Current Models

```bash
curl http://localhost:8771/v1/models
```


#### Verifying Model Availability

```bash
# Test the embedding model
curl http://localhost:8771/v1/embeddings \
@@ -238,6 +245,7 @@ curl localhost:8771/rerank \
```
### Basic API Client Configuration

#### Embedding Model Configuration

```python
# Embedding API endpoint
embed_url = f"{base_url}/v1/embeddings"
@@ -251,6 +259,7 @@ embed_data = {
```

#### Rerank Model Configuration

```python
# Rerank API endpoint
rerank_url = f"{base_url}/rerank"
@@ -264,7 +273,6 @@ rerank_data = {
}
```
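Putting the two configurations together, here is a minimal end-to-end client sketch using the `requests` library. The field names follow vLLM's OpenAI-compatible embeddings API and its rerank API, and the model paths are the ones used throughout this document; treat both as assumptions to verify against your deployment:

```python
import requests

base_url = "http://localhost:8771"

# Embedding request against the OpenAI-compatible /v1/embeddings endpoint.
embed_resp = requests.post(
    f"{base_url}/v1/embeddings",
    json={
        "model": "/data/models/gte-modernbert-base",
        "input": ["What is deep learning?"],
    },
    timeout=30,
)
embedding = embed_resp.json()["data"][0]["embedding"]
print(f"Embedding dimension: {len(embedding)}")

# Rerank request against the /rerank endpoint.
rerank_resp = requests.post(
    f"{base_url}/rerank",
    json={
        "model": "/data/models/gte-reranker-modernbert-base",
        "query": "What is deep learning?",
        "documents": [
            "Deep learning is a branch of machine learning.",
            "The weather is nice today.",
        ],
    },
    timeout=30,
)
print(rerank_resp.json())
```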


## Tuning Parameters

### Environment Variable Tuning
@@ -296,9 +304,11 @@ PT_HPU_LAZY_MODE=1 # Enable HPU lazy mode
```
VLLM_CONTIGUOUS_PA=false # Disable contiguous PA
VLLM_SKIP_WARMUP=false # Do not skip warmup (keep warmup enabled to optimize performance)
```

#### Model-Specific Tuning

For gte and bge models with fewer than 1B parameters, set PT_HPU_LAZY_MODE to 0; for models of other parameter sizes, the default PT_HPU_LAZY_MODE of 1 is fine.

```bash
python launch_multi_models.py --models /data/models/gte-modernbert-base /data/models/gte-reranker-modernbert-base \
```