vllm-project · divyanshsinghvi · Dec 20, 2025 · Dec 20, 2025 · Dec 20, 2025 · Dec 20, 2025
diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
@@ -34,6 +34,7 @@ th {
 |`LongcatImagePipeline` | LongCat-Image | `meituan-longcat/LongCat-Image` |
 |`LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` |
 |`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` |
+|`CosyVoice3Model` | CosyVoice3 | `FunAudioLLM/Fun-CosyVoice3-0.5B-2512` |
 |`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
 |`FluxPipeline` | FLUX.1-dev | `black-forest-labs/FLUX.1-dev` |
 |`OmniGen2Pipeline` | OmniGen2 | `OmniGen2/OmniGen2` |

@@ -0,0 +1,59 @@
+## Setup
+
+Install dependencies:
+```bash
+uv pip install -e .
+```
+
+> **Note:** This includes required libraries such as `librosa`, `soundfile`,
+> `onnxruntime`, `x-transformers`, and `einops` via
+> `requirements/common.txt` and platform-specific requirements files.
+
+Download the model snapshot:
+```python
+from huggingface_hub import snapshot_download
+snapshot_download('FunAudioLLM/Fun-CosyVoice3-0.5B-2512', local_dir='pretrained_models/Fun-CosyVoice3-0.5B')
+```
+
+Add a `config.json` file to `pretrained_models/Fun-CosyVoice3-0.5B/`:
+```json
+{
+    "model_type": "cosyvoice3",
+    "architectures": [
+        "CosyVoice3Model"
+    ]
+}
+```
+
+> **Why `config.json` is required:**
+> `AutoConfig.register("cosyvoice3", CosyVoice3Config)` only registers a class mapping.
+> The loader still needs `model_type: "cosyvoice3"` from `config.json` to select that class.
+> If no `config.json` is present, model type cannot be inferred automatically.
+> If your downloaded checkpoint already includes a valid `config.json` with
+> `model_type: "cosyvoice3"`, this manual step can be skipped.
+
+Run the offline verification script:
+```bash
+python examples/offline_inference/text_to_speech/verify_e2e_cosyvoice.py \
+  --model pretrained_models/Fun-CosyVoice3-0.5B \
+  --tokenizer pretrained_models/Fun-CosyVoice3-0.5B/CosyVoice-BlankEN
+```
+
+## Implementation Overview
+
+CosyVoice3 runs as a 2-stage Omni pipeline:
+- Stage 0 (text_speech_lm) converts text + prompt audio to speech tokens.
+- Stage 1 (chunk_aware_flow_matching) converts speech tokens + prompt features to audio.
+
+Key components in `vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py`:
+- `CosyVoice3MultiModalProcessor` builds the multimodal inputs:
+  - Tokenizes `prompt` and `prompt_text`.
+  - Extracts speech tokens and mel features from the prompt audio.
+  - Extracts a speaker embedding.
+- `CosyVoice3Model` implements both stages:
+  - Stage 0 uses `CosyVoice3LM` and outputs speech tokens + conditioning features.
+  - Stage 1 runs the flow model (DiT-based CFM) and HiFiGAN to synthesize waveform.
+
+Stage wiring is configured in `vllm_omni/model_executor/stage_configs/cosyvoice3.yaml`:
+- Stage 0 emits latent speech tokens.
+- Stage 1 consumes them via `custom_process_input_func` and outputs audio.
@@ -0,0 +1,201 @@
+import argparse
+import os
+from pathlib import Path
+
+import librosa
+import numpy as np
+import soundfile as sf
+from vllm import SamplingParams
+from vllm.assets.audio import AudioAsset
+
+from vllm_omni.entrypoints.omni import Omni
+from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
+from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer
+from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token
+
+
+def _ensure_mel_filters_asset() -> None:
+    repo_root = Path(__file__).resolve().parents[3]
+    filters_path = repo_root / "vllm_omni" / "model_executor" / "models" / "cosyvoice3" / "assets" / "mel_filters.npz"
+    if filters_path.exists():
+        return
+
+    source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz"
+    raise FileNotFoundError(
+        "Missing CosyVoice3 mel filter asset:\n"
+        f"  {filters_path}\n"
+        "Download it with:\n"
+        f"  mkdir -p {filters_path.parent} && "
+        f"curl -L {source_url} -o {filters_path}"
+    )
+
+
+def run_e2e():
+    parser = argparse.ArgumentParser()
+    # ""FunAudioLLM/Fun-CosyVoice3-0.5B-2512
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Path to CosyVoice3 model directory (e.g., pretrained_models/Fun-CosyVoice3-0.5B/).",
+    )
+    parser.add_argument("--stage-config", type=str, default="vllm_omni/model_executor/stage_configs/cosyvoice3.yaml")
+    parser.add_argument("--prompt", type=str, default="Hello, this is a test of the CosyVoice system capability.")
+    parser.add_argument(
+        "--prompt-text",
+        type=str,
+        default="You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?",
+    )
+    parser.add_argument("--audio-path", type=str, default="prompt.wav")
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        required=True,
+        help="Path to tokenizer directory (e.g., <model_path>/CosyVoice-BlankEN).",
+    )
+    args = parser.parse_args()
+    _ensure_mel_filters_asset()
+    # Ensure tokenizer directory exists
+    if not os.path.exists(args.tokenizer):
+        raise FileNotFoundError(f"{args.tokenizer} does not exist!")
+
+    # Ensure stage config exists
+    if not os.path.exists(args.stage_config):
+        raise FileNotFoundError(f"{args.stage_config} does not exist!")
+
+    print(f"Initializing cosyvoice E2E with model={args.model}")
+
+    # Initialize Omni
+    # This spins up the engine(s) based on the stage config
+    # We pass trust_remote_code=True same as Qwen examples
+    omni = Omni(
+        model=args.model,
+        stage_configs_path=args.stage_config,
+        trust_remote_code=True,
+        tokenizer=args.tokenizer,
+        log_stats=True,
+    )
+
+    # Map CosyVoice sampling config into vLLM SamplingParams for stage 0.
+    try:
+        # TODO: This is not working correctly right now.
+        hf_config = omni.instance.stage_list[0].vllm_config.model_config.hf_config
+        sampling_cfg = hf_config.llm["sampling"]
+    except Exception:
+        sampling_cfg = {"top_p": 0.8, "top_k": 25, "eos_token_id": 6561 + 1}
+
+    print("Model initialized. Preparing inputs...")
+    if args.audio_path:
+        if not os.path.exists(args.audio_path):
+            raise FileNotFoundError(f"Audio file not found: {args.audio_path}")
+        # Load at native sample rate
+        audio_signal, sr = librosa.load(args.audio_path, sr=None)
+
+        # Validate sample rate before processing (similar to original CosyVoice)
+        min_sr = 16000
+        if sr < min_sr:
+            raise ValueError(
+                f"Audio sample rate {sr} Hz is too low. "
+                f"Minimum required: {min_sr} Hz. "
+                f"Please provide audio with sample rate >= {min_sr} Hz."
+            )
+
+        audio_data = (audio_signal.astype(np.float32), sr)
+    else:
+        audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
+
+    prompts = {
+        "prompt": args.prompt,
+        "multi_modal_data": {
+            "audio": audio_data,
+        },
+        "mm_processor_kwargs": {
+            "prompt_text": args.prompt_text,
+            "sample_rate": audio_data[1],
+        },
+    }
+
+    print(f"Generating for prompt: {args.prompt}")
+
+    config = CosyVoice3Config()
+    tokenizer = get_qwen_tokenizer(
+        token_path=args.tokenizer,
+        skip_special_tokens=config.skip_special_tokens,
+        version=config.version,
+    )
+    _, text_token_len = extract_text_token(args.prompt, tokenizer, config.allowed_special)
+    base_len = int(text_token_len)
+    min_len = int(base_len * config.min_token_text_ratio)
+    max_len = int(base_len * config.max_token_text_ratio)
+
+    # Build SamplingParams for each stage (GPT, S2Mel, Vocoder)
+    gpt_sampling = SamplingParams(
+        temperature=1.0,
+        top_p=sampling_cfg["top_p"],
+        top_k=sampling_cfg["top_k"],
+        repetition_penalty=2.0,
+        min_tokens=min_len,
+        max_tokens=max_len,
+        stop_token_ids=[sampling_cfg["eos_token_id"]],
+        # allowed_token_ids=list(range(6561+3)),
+        detokenize=False,
+    )
+    # Not used
+    s2mel_sampling = SamplingParams(
+        temperature=1.0,
+        top_p=1.0,
+        top_k=-1,
+        repetition_penalty=2.0,
+        max_tokens=256,
+        detokenize=False,
+    )
+
+    sampling_params_list = [gpt_sampling, s2mel_sampling]
+
+    # Start profiling (requires VLLM_TORCH_PROFILER_DIR env var)
+    if os.environ.get("VLLM_TORCH_PROFILER_DIR"):
+        print("Starting profiler...")
+        omni.start_profile()
+
+    # Generate (Omni orchestrator requires a per-stage SamplingParams list)
+    outputs = list(omni.generate(prompts, sampling_params_list=sampling_params_list[:2]))
+
+    # Stop profiling and get results
+    if os.environ.get("VLLM_TORCH_PROFILER_DIR"):
+        print("Stopping profiler...")
+        profile_results = omni.stop_profile()
+        print(f"Profile traces saved to: {profile_results}")
+
+    print(outputs)
+    # Verify outputs
+    print(f"Received {len(outputs)} outputs.")
+    for i, output in enumerate(outputs):
+        try:
+            ro_list = output.request_output or []
+            if not ro_list:
+                print("No request_output found.")
+                continue
+
+            for ro in ro_list:
+                # Multimodal output may be attached to RequestOutput or CompletionOutput.
+                mm = getattr(ro, "multimodal_output", None)
+                if not mm and ro.outputs:
+                    mm = getattr(ro.outputs[0], "multimodal_output", None)
+
+                if mm:
+                    print(f"Multimodal output keys: {mm.keys()}")
+                    if "audio" in mm:
+                        audio_out = mm["audio"]
+                        print(f"Generated Audio Shape: {audio_out.shape}")
+                        out_path = f"output_{i}.wav"
+                        sf.write(out_path, audio_out.cpu().numpy().squeeze(), 22050)
+                        print(f"Saved audio to {out_path}")
+                else:
+                    print("No multimodal output found.")
+        except Exception as e:
+            print(f"Error inspecting output: {e}")
+    omni.close()
+
+
+if __name__ == "__main__":
+    run_e2e()
@@ -202,3 +202,5 @@ extend-ignore-identifiers-re = [
     ".*[Oo]no_[Aa]nna.*",
     ".*cann.*",
 ]
+[tool.typos.default.extend-words]
+ue = "ue"
@@ -12,6 +12,8 @@ torchsde>=0.2.6
 openai-whisper>=20250625
 imageio[ffmpeg]>=2.37.2
 sox>=1.5.0
+x-transformers>=2.12.2
+einops>=0.8.1
 prettytable>=3.8.0
 aenum==3.1.16
 pyzmq>=25.0.0