Skip to content

Commit 02c7c8e

Browse files
authored
Support vLLM v0.16.0 (#510)
1 parent 8a1c1c0 commit 02c7c8e

File tree

8 files changed

+143
-9
lines changed

8 files changed

+143
-9
lines changed
4.19 KB
Binary file not shown.

docs/sphinx_doc/source/conf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ def get_recent_tags(n: int) -> list:
8585

8686
html_logo = "../_static/logo.svg"
8787

88+
html_favicon = "../_static/favicon.ico"
89+
8890
html_theme_options = {
8991
"navigation_depth": 3,
9092
"article_header_end": "article_header_customized.html",

docs/sphinx_doc/source/tutorial/trinity_installation.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ uv sync --extra vllm --extra dev --extra flash_attn
8282
# uv sync --extra tinker --extra dev
8383
```
8484

85+
```{tip}
86+
If you can't install flash-attn due to a network error or a compiler error, you can try to install it from our pre-compiled wheel:
87+
88+
`python scripts/install/install_flash_attn.py`
89+
90+
If you are using `uv`, add `--uv` flag to the command above.
91+
```
92+
8593
---
8694

8795
## Using Docker

docs/sphinx_doc/source_zh/tutorial/trinity_installation.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,14 @@ uv sync --extra vllm --extra dev --extra flash_attn
8282
# uv sync --extra tinker --extra dev
8383
```
8484

85+
```{tip}
86+
如果安装 flash-attn 时遇到网络错误或编译错误,您可以尝试从我们预编译的 wheel 安装:
87+
88+
`python scripts/install/install_flash_attn.py`
89+
90+
如果您使用 `uv`,请在上述命令后添加 `--uv` 参数。
91+
```
92+
8593
---
8694

8795
## 使用 Docker

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,10 +52,10 @@ trinity = "trinity.cli.launcher:main"
5252

5353
[project.optional-dependencies]
5454
vllm = [
55-
"vllm>=0.10.2,<=0.15.1,!=0.11.0,!=0.12.0",
55+
"vllm>=0.10.2,<=0.16.0,!=0.11.0,!=0.12.0",
5656
# v0.11 has bug when prefix-caching is enabled so we exclude it
5757
# v0.12 has a huge performance regression so we exclude it
58-
# v0.10.2 is the most stable version, but we allow up to 0.15.1 for new features
58+
# v0.10.2 is the most stable version, but we allow up to 0.16.0 for new features
5959
]
6060
data = [
6161
"py-data-juicer>=1.4.3"
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
"""This script is used to install flash-attn from a pre-built wheel hosted on an OSS bucket.
2+
Useful for mainland China users who have difficulty installing flash-attn from PyPI due to network issues.
3+
"""
4+
import os
5+
import platform
6+
import subprocess
7+
import sys
8+
import tempfile
9+
10+
import torch
11+
import typer
12+
13+
app = typer.Typer()
# Version of the pre-built flash-attn wheel hosted on the OSS bucket;
# must match a wheel actually uploaded under .../flash-attn/<version>/.
FLASH_VERSION = "2.8.1"
15+
16+
17+
def check_flash_attn_installed() -> bool:
    """Return True if flash_attn is importable (printing its version), else False."""
    try:
        import flash_attn
    except ImportError:
        return False
    print(f"flash_attn version: {flash_attn.__version__}")
    return True
25+
26+
27+
def install_flash_attn(uv: bool = False, keep_wheel: bool = False):
    """Download and install a pre-built flash-attn wheel from the OSS bucket.

    The wheel filename is derived from the running torch version, python
    version, platform tag, and torch's C++ ABI flag, matching the upstream
    flash-attn wheel naming scheme.

    Args:
        uv: install with ``uv pip install`` instead of ``python -m pip install``.
        keep_wheel: keep the downloaded wheel file in the current directory
            instead of a temporary directory that is deleted afterwards.

    Exits with status 1 on unsupported builds (ROCm, non-CUDA-12 torch) or
    when flash_attn is still not importable after installation.
    """
    # Stdlib download avoids a hard dependency on an external `wget` binary,
    # which is absent on Windows and many minimal container images.
    import urllib.request

    # Only major.minor of torch is encoded in the wheel filename.
    torch_major, torch_minor = torch.__version__.split(".")[:2]
    torch_version = f"{torch_major}.{torch_minor}"

    # CPython tag, e.g. cp310.
    python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"

    # Platform tag, e.g. linux_x86_64.
    platform_name = platform.system().lower() + "_" + platform.machine()

    # Upstream wheel names encode the C++11 ABI flag as TRUE/FALSE.
    cxx11_abi = str(torch._C._GLIBCXX_USE_CXX11_ABI).upper()

    # torch.version.hip/cuda are runtime attributes not in type stubs.
    is_rocm = hasattr(torch.version, "hip") and torch.version.hip is not None  # type: ignore[attr-defined]

    if is_rocm:
        print("We currently do not host ROCm wheels for flash-attn.")
        sys.exit(1)
    else:
        torch_cuda_version = torch.version.cuda  # type: ignore[attr-defined]
        cuda_major = torch_cuda_version.split(".")[0] if torch_cuda_version else None
        if cuda_major != "12":
            print("Only CUDA 12 wheels are hosted for flash-attn.")
            sys.exit(1)
        cuda_version = "12"

    # %2B is the URL-encoded '+' separating the local version segment.
    wheel_filename = (
        f"flash_attn-{FLASH_VERSION}%2Bcu{cuda_version}torch{torch_version}"
        f"cxx11abi{cxx11_abi}-{python_version}-{python_version}-{platform_name}.whl"
    )
    local_filename = (
        f"flash_attn-{FLASH_VERSION}-{python_version}-{python_version}-{platform_name}.whl"
    )

    wheel_url = (
        "https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com"
        f"/AgentScope/download/flash-attn/{FLASH_VERSION}/{wheel_filename}"
    )

    print(f"wheel_url: {wheel_url}")
    print(f"target_local_file: {local_filename}")

    def _install_helper(local_path: str):
        # Download the wheel, then install it with the chosen installer.
        urllib.request.urlretrieve(wheel_url, local_path)
        install_cmd = (
            ["uv", "pip", "install", local_path]
            if uv
            else [sys.executable, "-m", "pip", "install", local_path]
        )
        subprocess.run(install_cmd, check=True)

    if keep_wheel:
        local_path = os.path.abspath(local_filename)
        _install_helper(local_path)
    else:
        with tempfile.TemporaryDirectory() as tempdir:
            local_path = os.path.join(tempdir, local_filename)
            _install_helper(local_path)

    # Verify the installation by actually importing flash_attn.
    if not check_flash_attn_installed():
        print("Failed to install flash_attn.")
        sys.exit(1)
93+
94+
95+
@app.command()
def main(
    uv: bool = typer.Option(False, help="Use uv pip to install instead of pip"),
    keep_wheel: bool = typer.Option(
        False, help="Keep the downloaded wheel file in current directory"
    ),
):
    """Install flash-attn from a pre-built wheel."""
    # Skip the download entirely when flash_attn is already importable.
    if not check_flash_attn_installed():
        install_flash_attn(uv=uv, keep_wheel=keep_wheel)
    else:
        print("flash_attn is already installed. Skipping installation.")
107+
108+
109+
if __name__ == "__main__":
    # Run the module-level Typer app; `main` is already registered on it via
    # @app.command(), so `typer.run(main)` would build a redundant second app.
    app()

tests/common/vllm_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1033,7 +1033,7 @@ async def test_api_tool_calls(self):
10331033
print_debug(f" > Finish Reason: {choice.finish_reason}")
10341034
self.assertEqual(choice.finish_reason, "tool_calls")
10351035
if self.enable_thinking:
1036-
self.assertIsNotNone(choice.message.reasoning_content)
1036+
self.assertIsNotNone(choice.message.reasoning)
10371037
self.assertIsNotNone(choice.message.tool_calls)
10381038
self.assertEqual(len(choice.message.tool_calls), 1)
10391039

trinity/common/models/vllm_patch/worker_patch.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,10 @@
1313
def patch_vllm_prompt_logprobs(model_runner: GPUModelRunner): # noqa: C901
1414
"""Patch vLLM model runner to support prompt logprobs extraction."""
1515
version = get_vllm_version()
16-
if version < parse_version("0.10.2") or version > parse_version("0.15.1"):
16+
if version < parse_version("0.10.2") or version > parse_version("0.16.0"):
1717
raise ValueError(
1818
f"Unsupported vllm version: {vllm.__version__}. "
19-
"This patch requires vllm version >= 0.10.2, <= 0.15.1."
19+
"This patch requires vllm version >= 0.10.2, <= 0.16.0."
2020
)
2121
is_v0102 = version == parse_version("0.10.2")
2222

@@ -237,15 +237,21 @@ def _get_prompt_logprobs_dict_v12(
237237

238238
# Compute prompt logprobs.
239239
logprobs = self.sampler.compute_logprobs(logits)
240-
token_ids, logprobs, ranks = self.sampler.gather_logprobs(
240+
logprob_tensors = self.sampler.gather_logprobs(
241241
logprobs, num_prompt_logprobs, tgt_token_ids
242242
)
243243

244244
# Transfer GPU->CPU async.
245245
chunk_slice = slice(start_idx, start_idx + num_logits)
246-
logprobs_tensors.logprob_token_ids[chunk_slice].copy_(token_ids, non_blocking=True)
247-
logprobs_tensors.logprobs[chunk_slice].copy_(logprobs, non_blocking=True)
248-
logprobs_tensors.selected_token_ranks[chunk_slice].copy_(ranks, non_blocking=True)
246+
logprobs_tensors.logprob_token_ids[chunk_slice].copy_(
247+
logprob_tensors.logprob_token_ids, non_blocking=True
248+
)
249+
logprobs_tensors.logprobs[chunk_slice].copy_(
250+
logprob_tensors.logprobs, non_blocking=True
251+
)
252+
logprobs_tensors.selected_token_ranks[chunk_slice].copy_(
253+
logprob_tensors.selected_token_ranks, non_blocking=True
254+
)
249255

250256
# Remove requests that have completed prefill from the batch
251257
# num_prompt_logprobs_dict.

0 commit comments

Comments
 (0)