From 6816fc6a6fc9e97633d9321c08e84df05d95f02d Mon Sep 17 00:00:00 2001
From: "zhongze.jiang"
Date: Mon, 10 Nov 2025 16:30:42 +0800
Subject: [PATCH] support vLLM >=0.11.0 (V1 engine only)

---
 README.md                    |  6 +++++-
 cosyvoice/vllm/cosyvoice2.py | 19 ++++++++++++++++---
 2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 1d32e44e1..b3ae35ec6 100644
--- a/README.md
+++ b/README.md
@@ -173,14 +173,18 @@ for i, j in enumerate(cosyvoice.inference_zero_shot(text_generator(), '希望你
 ```
 
 #### CosyVoice2 vllm Usage
-If you want to use vllm for inference, please install `vllm==v0.9.0`. Older vllm version do not support CosyVoice2 inference.
+CosyVoice2 now supports **vLLM 0.11.x+ (V1 engine)** and **vLLM 0.9.0 (legacy)**.
+Older vllm versions (<0.9.0) do not support CosyVoice2 inference, and versions in between (e.g., 0.10.x) are untested.
 
 Notice that `vllm==v0.9.0` has a lot of specific requirements, for example `torch==2.7.0`. You can create a new env to in case your hardward do not support vllm and old env is corrupted.
 
 ``` sh
 conda create -n cosyvoice_vllm --clone cosyvoice
 conda activate cosyvoice_vllm
+# for vllm==0.9.0
 pip install vllm==v0.9.0 transformers==4.51.3 -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
+# for vllm>=0.11.0
+pip install vllm==v0.11.0 transformers==4.57.1 -i https://mirrors.aliyun.com/pypi/simple/ --trusted-host=mirrors.aliyun.com
 python vllm_example.py
 ```
 
diff --git a/cosyvoice/vllm/cosyvoice2.py b/cosyvoice/vllm/cosyvoice2.py
index de0bc76bf..4d8712353 100644
--- a/cosyvoice/vllm/cosyvoice2.py
+++ b/cosyvoice/vllm/cosyvoice2.py
@@ -23,6 +23,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
+from typing import Optional
+from packaging.version import parse as vparse
+import vllm
+
+# vLLM 0.11.0+ only supports the V1 engine
+VLLM_V1_ENGINE_ONLY: bool = vparse(vllm.__version__) >= vparse("0.11.0")
+if VLLM_V1_ENGINE_ONLY:
+    from vllm.v1.sample.metadata import SamplingMetadata
+
 from vllm.model_executor.models.qwen2 import *
 
 
@@ -87,10 +96,14 @@ def forward(
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
-        sampling_metadata: SamplingMetadata,
+        sampling_metadata: Optional[SamplingMetadata] = None,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.lm_head, hidden_states,
-                                       sampling_metadata, self.lm_head.bias)
+        if VLLM_V1_ENGINE_ONLY:
+            logits = self.logits_processor(self.lm_head, hidden_states,
+                                           self.lm_head.bias)
+        else:
+            logits = self.logits_processor(self.lm_head, hidden_states,
+                                           sampling_metadata, self.lm_head.bias)
         return logits
 
     def load_weights(self, weights: Iterable[tuple[str,
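
For readers following the README change above, here is a minimal, hedged usage sketch of the vLLM-accelerated inference path that `python vllm_example.py` is meant to exercise. The model directory, the prompt-audio path, the placeholder text strings, and the `load_vllm` constructor flag are assumptions for illustration and may not match the actual script in the repository; the `inference_zero_shot` call mirrors the README's own example.

``` python
# A sketch only: assumes CosyVoice2 exposes a `load_vllm` switch and that the
# pretrained-model and prompt-wav paths below exist locally (placeholders).
import torchaudio
from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav

cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B',
                       load_jit=False, load_trt=False, load_vllm=True, fp16=False)

prompt_speech_16k = load_wav('./asset/zero_shot_prompt.wav', 16000)  # 16 kHz reference clip
for i, out in enumerate(cosyvoice.inference_zero_shot(
        '收到好友从远方寄来的生日礼物，我开心极了。',  # text to synthesize (placeholder)
        '希望你以后能够做的比我还好呦。',              # transcript of the prompt clip
        prompt_speech_16k, stream=False)):
    torchaudio.save('vllm_zero_shot_{}.wav'.format(i), out['tts_speech'], cosyvoice.sample_rate)
```

If the flag behaves as its name suggests, only the constructor changes relative to the non-vLLM examples earlier in the README; the downstream inference calls stay the same on both vLLM 0.9.0 and 0.11.0+, since the patched `compute_logits` selects the correct `logits_processor` signature at import time.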